Async, second try.
authorRobert Haas <[email protected]>
Tue, 20 Sep 2016 22:30:03 +0000 (18:30 -0400)
committerRobert Haas <[email protected]>
Fri, 23 Sep 2016 21:34:59 +0000 (17:34 -0400)
Notes:
- ExecReScanAppend is busted.
- EPQ is busted.
- EXPLAIN ANALYZE instrumentation is busted.
- It's not actually asynchronous.

17 files changed:
contrib/postgres_fdw/postgres_fdw.c
src/backend/executor/Makefile
src/backend/executor/README
src/backend/executor/execAmi.c
src/backend/executor/execAsync.c [new file with mode: 0644]
src/backend/executor/nodeAppend.c
src/backend/executor/nodeForeignscan.c
src/backend/nodes/copyfuncs.c
src/backend/nodes/outfuncs.c
src/backend/nodes/readfuncs.c
src/backend/optimizer/plan/createplan.c
src/include/executor/execAsync.h [new file with mode: 0644]
src/include/executor/nodeAppend.h
src/include/executor/nodeForeignscan.h
src/include/foreign/fdwapi.h
src/include/nodes/execnodes.h
src/include/nodes/plannodes.h

index daf04385321b9928c0cc5248937632a8528031f2..ab69aa330cbb232100b3c0f9dd93edef1253501e 100644 (file)
@@ -19,6 +19,7 @@
 #include "commands/defrem.h"
 #include "commands/explain.h"
 #include "commands/vacuum.h"
+#include "executor/execAsync.h"
 #include "foreign/fdwapi.h"
 #include "funcapi.h"
 #include "miscadmin.h"
@@ -343,6 +344,14 @@ static void postgresGetForeignJoinPaths(PlannerInfo *root,
                                                        JoinPathExtraData *extra);
 static bool postgresRecheckForeignScan(ForeignScanState *node,
                                                   TupleTableSlot *slot);
+static bool postgresIsForeignPathAsyncCapable(ForeignPath *path);
+static void postgresForeignAsyncRequest(EState *estate,
+                                                       PendingAsyncRequest *areq);
+static void postgresForeignAsyncConfigureWait(EState *estate,
+                                                                 PendingAsyncRequest *areq,
+                                                                 bool reinit);
+static void postgresForeignAsyncNotify(EState *estate,
+                                                  PendingAsyncRequest *areq);
 
 /*
  * Helper functions
@@ -455,6 +464,12 @@ postgres_fdw_handler(PG_FUNCTION_ARGS)
        /* Support functions for join push-down */
        routine->GetForeignJoinPaths = postgresGetForeignJoinPaths;
 
+       /* Support functions for async execution */
+       routine->IsForeignPathAsyncCapable = postgresIsForeignPathAsyncCapable;
+       routine->ForeignAsyncRequest = postgresForeignAsyncRequest;
+       routine->ForeignAsyncConfigureWait = postgresForeignAsyncConfigureWait;
+       routine->ForeignAsyncNotify = postgresForeignAsyncNotify;
+
        PG_RETURN_POINTER(routine);
 }
 
@@ -4342,6 +4357,40 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
        /* XXX Consider parameterized paths for the join relation */
 }
 
+static bool
+postgresIsForeignPathAsyncCapable(ForeignPath *path)
+{
+       return true;                    /* path is not inspected: every foreign path is reported as async-capable */
+}
+
+/*
+ * ForeignAsyncRequest callback for postgres_fdw.
+ *
+ * XXX. Testing aid only: the request travels through the async machinery,
+ * but the tuple is fetched synchronously with postgresIterateForeignScan
+ * and the request is completed at once via ExecAsyncRequestDone.
+ */
+static void
+postgresForeignAsyncRequest(EState *estate, PendingAsyncRequest *areq)
+{
+       ForeignScanState *fsstate = (ForeignScanState *) areq->requestee;
+
+       Assert(IsA(fsstate, ForeignScanState));
+
+       /* Fetch the next tuple right away and hand it back as the result. */
+       ExecAsyncRequestDone(estate, areq,
+                                                (Node *) postgresIterateForeignScan(fsstate));
+}
+
+static void
+postgresForeignAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+                                                                 bool reinit)
+{
+       elog(ERROR, "postgresForeignAsyncConfigureWait");       /* placeholder: always raises an error; arguments are unused */
+}
+
+static void
+postgresForeignAsyncNotify(EState *estate, PendingAsyncRequest *areq)
+{
+       elog(ERROR, "postgresForeignAsyncNotify");      /* placeholder: always raises an error; arguments are unused */
+}
+
 /*
  * Create a tuple from the specified row of the PGresult.
  *
index 51edd4c5e709590d75fd9459f43b13d7eca2bad2..0675b0135670b52ee6f37ba35ba6e84a04747d62 100644 (file)
@@ -12,8 +12,8 @@ subdir = src/backend/executor
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \
-       execMain.o execParallel.o execProcnode.o execQual.o \
+OBJS = execAmi.o execAsync.o execCurrent.o execGrouping.o execIndexing.o \
+       execJunk.o execMain.o execParallel.o execProcnode.o execQual.o \
        execScan.o execTuples.o \
        execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
        nodeBitmapAnd.o nodeBitmapOr.o \
index f1d1e4c76ce9d78cfc70bc6da7348e2a03dae52d..1dee3dbd2712b7d82b1e4f8b5463a966621823ae 100644 (file)
@@ -199,3 +199,46 @@ query returning the same set of scan tuples multiple times.  Likewise,
 SRFs are disallowed in an UPDATE's targetlist.  There, they would have the
 effect of the same row being updated multiple times, which is not very
 useful --- and updates after the first would have no effect anyway.
+
+Asynchronous Execution
+----------------------
+
+In certain cases, it's desirable for a node to indicate that it cannot
+return any tuple immediately but may be able to do so at a later time.  This
+might be either because the node is waiting on an event external to the
+database system, such as a ForeignScan awaiting network I/O, or because
+the node is waiting for an event internal to the database system - e.g.
+one process involved in a parallel query may find that it cannot progress
+a certain parallel operation until some other process reaches a certain
+point in the computation.  A process which discovers this type of situation
+can always handle it simply by blocking, but this may waste time that could
+be spent executing some other part of the plan where progress could be
+made immediately.  This is particularly likely to occur when the plan
+contains an Append node.
+
+To use asynchronous execution, a node must first request a tuple from an
+async-capable child node using ExecAsyncRequest.  Next, it must execute
+the asynchronous event loop using ExecAsyncEventLoop; it can avoid giving
+up control indefinitely by passing a timeout to this function, even passing
+0 to poll for events without blocking.  Eventually, when a node to which an
+asynchronous request has been made produces a tuple, the requesting node
+will receive a callback from the event loop via ExecAsyncResponse. Typically,
+the ExecAsyncResponse callback is the only one required for nodes that wish
+to request tuples asynchronously.
+
+On the other hand, nodes that wish to produce tuples asynchronously
+generally need to implement three methods:
+
+1. When an asynchronous request is made, the node's ExecAsyncRequest callback
+will be invoked; it should use ExecAsyncSetRequiredEvents to indicate the
+number of file descriptor events for which it wishes to wait and whether it
+wishes to receive a callback when the process latch is set. Alternatively,
+it can instead use ExecAsyncRequestDone if a result is available immediately.
+
+2. When the event loop wishes to wait or poll for file descriptor events and
+the process latch, the ExecAsyncConfigureWait callback is invoked to configure
+the file descriptor wait events for which the node wishes to wait.  This
+callback isn't needed if the node only cares about the process latch.
+
+3. When file descriptors or the process latch become ready, the node's
+ExecAsyncNotify callback is invoked.
index 2587ef704626e10b0b744c645cc3a40d89b2c23d..9fcc4e45d66af37a4aad765d24777a46e974a420 100644 (file)
@@ -464,11 +464,16 @@ ExecSupportsBackwardScan(Plan *node)
                        {
                                ListCell   *l;
 
+                               /* With async, tuples may be interleaved, so can't back up. */
+                               if (((Append *) node)->nasyncplans != 0)
+                                       return false;
+
                                foreach(l, ((Append *) node)->appendplans)
                                {
                                        if (!ExecSupportsBackwardScan((Plan *) lfirst(l)))
                                                return false;
                                }
+
                                /* need not check tlist because Append doesn't evaluate it */
                                return true;
                        }
diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c
new file mode 100644 (file)
index 0000000..5858bb5
--- /dev/null
@@ -0,0 +1,462 @@
+/*-------------------------------------------------------------------------
+ *
+ * execAsync.c
+ *       Support routines for asynchronous execution.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/executor/execAsync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execAsync.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeForeignscan.h"
+#include "miscadmin.h"
+#include "storage/latch.h"
+
+static bool ExecAsyncEventWait(EState *estate, long timeout);
+static void ExecAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+       bool reinit);
+static void ExecAsyncNotify(EState *estate, PendingAsyncRequest *areq);
+static void ExecAsyncResponse(EState *estate, PendingAsyncRequest *areq);
+
+#define EVENT_BUFFER_SIZE      16
+
+/*
+ * Asynchronously request a tuple from a designated async-aware node.
+ *
+ * requestor is the node asking for the tuple and requestee is the node
+ * expected to produce it.  request_index is an arbitrary integer chosen by
+ * the requestor; it is stored in the request so that it is available when
+ * the tuple is delivered, which lets a requestor with several children tell
+ * which of them is answering.
+ *
+ * The request is recorded in the next free slot of estate->es_pending_async
+ * and then handed to the requestee's node-type-specific request callback.
+ * Only ForeignScanState requestees are supported; any other node type
+ * raises an error.
+ */
+void
+ExecAsyncRequest(EState *estate, PlanState *requestor, int request_index,
+                                PlanState *requestee)
+{
+       int                     slot = estate->es_num_pending_async;
+       int                     oldmax = estate->es_max_pending_async;
+       PendingAsyncRequest *areq;
+       NodeTag         tag;
+
+       /*
+        * Make sure es_pending_async has room for one more entry.  The array
+        * starts out with 16 slots and doubles whenever it fills up; newly
+        * added slots are zeroed so that they read as "no request allocated".
+        */
+       if (slot >= oldmax)
+       {
+               int                     newmax;
+
+               if (oldmax == 0)
+               {
+                       newmax = 16;
+                       estate->es_pending_async =
+                               MemoryContextAllocZero(estate->es_query_cxt,
+                                                                  newmax * sizeof(PendingAsyncRequest *));
+               }
+               else
+               {
+                       newmax = oldmax * 2;
+                       estate->es_pending_async =
+                               repalloc(estate->es_pending_async,
+                                                newmax * sizeof(PendingAsyncRequest *));
+                       MemSet(&estate->es_pending_async[oldmax],
+                                  0, (newmax - oldmax) * sizeof(PendingAsyncRequest *));
+               }
+               estate->es_max_pending_async = newmax;
+       }
+
+       /*
+        * Recycle the PendingAsyncRequest left in this slot by an earlier
+        * request, if any, to save palloc traffic; otherwise allocate a fresh
+        * one.  Either way the structure starts out all-zeroes.
+        */
+       areq = estate->es_pending_async[slot];
+       if (areq != NULL)
+               MemSet(areq, 0, sizeof(PendingAsyncRequest));
+       else
+       {
+               areq = MemoryContextAllocZero(estate->es_query_cxt,
+                                                                         sizeof(PendingAsyncRequest));
+               estate->es_pending_async[slot] = areq;
+       }
+
+       /* The slot is now in use. */
+       areq->myindex = slot;
+       estate->es_num_pending_async = slot + 1;
+
+       /* Fill in who is asking whom. */
+       areq->requestor = requestor;
+       areq->request_index = request_index;
+       areq->requestee = requestee;
+
+       /*
+        * Let the requestee act on the request.  A requestee that does not
+        * support async execution means the caller messed up.
+        */
+       tag = nodeTag(requestee);
+       if (tag == T_ForeignScanState)
+               ExecAsyncForeignScanRequest(estate, areq);
+       else
+               elog(ERROR, "unrecognized node type: %d",
+                       (int) tag);
+}
+
+/*
+ * Execute the main loop until the timeout expires or a result is delivered
+ * to the requestor.
+ *
+ * estate is the executor state holding the pending-request array; requestor
+ * is the node on whose behalf we are running the loop and must not be NULL.
+ *
+ * If the timeout is -1, there is no timeout; wait indefinitely until a
+ * result is ready for requestor.  If the timeout is 0, do not block, but
+ * poll for events and fire callbacks for as long as we can do so without
+ * blocking.  If timeout is greater than 0, block for at most the number
+ * of milliseconds indicated by the timeout.
+ *
+ * Returns true if a result was delivered to the requestor.  A return value
+ * of false indicates that the timeout was reached without delivering a
+ * result to the requestor.
+ */
+bool
+ExecAsyncEventLoop(EState *estate, PlanState *requestor, long timeout)
+{
+       instr_time      start_time;
+       long            cur_timeout = timeout;
+       bool            requestor_done = false;
+
+       Assert(requestor != NULL);
+
+       /*
+        * If we plan to wait - but not indefinitely - we need to record the
+        * current time, so that the remaining timeout can be recomputed after
+        * each wakeup.  start_time is left unset otherwise.
+        */
+       if (timeout > 0)
+               INSTR_TIME_SET_CURRENT(start_time);
+
+       /* Main event loop: poll for events, deliver notifications. */
+       for (;;)
+       {
+               int                     i;
+               bool            any_node_done = false;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /*
+                * Check for events, but don't block if there are notifications
+                * that have not been delivered yet.
+                */
+               if (estate->es_async_callback_pending > 0)
+                       ExecAsyncEventWait(estate, 0);
+               else if (!ExecAsyncEventWait(estate, cur_timeout))
+                       cur_timeout = 0;                        /* Timeout was reached. */
+               else if (timeout > 0)
+               {
+                       instr_time      cur_time;
+
+                       /*
+                        * Events arrived before a finite timeout ran out, so work
+                        * out how much of it remains.  This updates the
+                        * function-level cur_timeout (do not redeclare it here), and
+                        * is skipped for timeout <= 0 because start_time is only set
+                        * when timeout > 0.
+                        */
+                       INSTR_TIME_SET_CURRENT(cur_time);
+                       INSTR_TIME_SUBTRACT(cur_time, start_time);
+                       cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+                       if (cur_timeout < 0)
+                               cur_timeout = 0;
+               }
+
+               /* Deliver notifications. */
+               for (i = 0; i < estate->es_num_pending_async; ++i)
+               {
+                       PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+                       /* Skip it if no callback is pending. */
+                       if (!areq->callback_pending)
+                               continue;
+
+                       /*
+                        * Mark it as no longer needing a callback.  We must do this
+                        * before dispatching the callback in case the callback resets
+                        * the flag.
+                        */
+                       areq->callback_pending = false;
+                       estate->es_async_callback_pending--;
+
+                       /*
+                        * Perform the actual callback: notify the requestee if the
+                        * request is still in progress, else deliver the result to
+                        * the requestor.  Set requestor_done if appropriate.
+                        */
+                       if (!areq->request_complete)
+                               ExecAsyncNotify(estate, areq);
+                       else
+                       {
+                               any_node_done = true;
+                               if (requestor == areq->requestor)
+                                       requestor_done = true;
+                               ExecAsyncResponse(estate, areq);
+                       }
+               }
+
+               /* If any node completed, compact the array. */
+               if (any_node_done)
+               {
+                       int                     hidx = 0,
+                                               tidx;
+
+                       /*
+                        * Swap all not-yet-completed items to the start of the array.
+                        * Keep them in the same order.  Completed entries end up past
+                        * the new es_num_pending_async, where ExecAsyncRequest can
+                        * reuse them.
+                        *
+                        * NOTE(review): areq->myindex is not adjusted for the entries
+                        * moved here; confirm whether anything relies on it.
+                        */
+                       for (tidx = 0; tidx < estate->es_num_pending_async; ++tidx)
+                       {
+                               PendingAsyncRequest *head;
+                               PendingAsyncRequest *tail = estate->es_pending_async[tidx];
+
+                               if (!tail->callback_pending && tail->request_complete)
+                                       continue;
+                               head = estate->es_pending_async[hidx];
+                               estate->es_pending_async[tidx] = head;
+                               estate->es_pending_async[hidx] = tail;
+                               ++hidx;
+                       }
+                       estate->es_num_pending_async = hidx;
+               }
+
+               /*
+                * We only consider exiting the loop when no notifications are
+                * pending.  Otherwise, each call to this function might advance
+                * the computation by only a very small amount; to the contrary,
+                * we want to push it forward as far as possible.
+                */
+               if (estate->es_async_callback_pending == 0)
+               {
+                       /* If requestor is ready, exit. */
+                       if (requestor_done)
+                               return true;
+                       /* If timeout was 0 or has expired, exit. */
+                       if (cur_timeout == 0)
+                               return false;
+               }
+       }
+}
+
+/*
+ * Wait or poll for events.  As with ExecAsyncEventLoop, a timeout of -1
+ * means wait forever, 0 means don't wait at all, and >0 means wait for the
+ * indicated number of milliseconds.
+ *
+ * Every request for which an event occurred is flagged callback_pending,
+ * and estate->es_async_callback_pending is kept equal to the number of
+ * requests so flagged; ExecAsyncEventLoop decrements it once per callback
+ * it dispatches.
+ *
+ * Returns true if we found some events and false if we timed out.
+ */
+static bool
+ExecAsyncEventWait(EState *estate, long timeout)
+{
+       WaitEvent       occurred_event[EVENT_BUFFER_SIZE];
+       int                     noccurred;
+       int                     i;
+       int                     n;
+       bool            reinit = false;
+       bool            process_latch_set = false;
+
+       /*
+        * If the nodes now want more file descriptor events than the existing
+        * set was sized for, throw it away so that it gets rebuilt below with
+        * enough room.
+        */
+       if (estate->es_wait_event_set != NULL &&
+               estate->es_total_fd_events > estate->es_allocated_fd_events)
+       {
+               FreeWaitEventSet(estate->es_wait_event_set);
+               estate->es_wait_event_set = NULL;
+       }
+
+       if (estate->es_wait_event_set == NULL)
+       {
+               /*
+                * Allow for a few extra events without reinitializing.  It
+                * doesn't seem worth the complexity of doing anything very
+                * aggressive here, because plans that depend on massive numbers
+                * of external FDs are likely to run afoul of kernel limits anyway.
+                * The "+ 1" below is the slot for the process latch.
+                */
+               estate->es_allocated_fd_events = estate->es_total_fd_events + 16;
+               estate->es_wait_event_set =
+                       CreateWaitEventSet(estate->es_query_cxt,
+                                                          estate->es_allocated_fd_events + 1);
+               AddWaitEventToSet(estate->es_wait_event_set,
+                                                 WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+               reinit = true;
+       }
+
+       /* Give each waiting node a chance to add or modify events. */
+       for (i = 0; i < estate->es_num_pending_async; ++i)
+       {
+               PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+               if (areq->num_fd_events > 0)
+                       ExecAsyncConfigureWait(estate, areq, reinit);
+       }
+
+       /* Wait for at least one event to occur. */
+       noccurred = WaitEventSetWait(estate->es_wait_event_set, timeout,
+                                                                occurred_event, EVENT_BUFFER_SIZE);
+       if (noccurred == 0)
+               return false;
+
+       /*
+        * Loop over the occurred events and set the callback_pending flags
+        * for the appropriate requests.  The waiting nodes should have
+        * registered their wait events with user_data pointing back to the
+        * PendingAsyncRequest, but the process latch needs special handling.
+        */
+       for (n = 0; n < noccurred; ++n)
+       {
+               WaitEvent  *w = &occurred_event[n];
+
+               if ((w->events & WL_LATCH_SET) != 0)
+               {
+                       process_latch_set = true;
+                       continue;
+               }
+
+               if ((w->events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) != 0)
+               {
+                       PendingAsyncRequest *areq = w->user_data;
+
+                       if (!areq->callback_pending)
+                       {
+                               Assert(!areq->request_complete);
+                               areq->callback_pending = true;
+                               estate->es_async_callback_pending++;
+                       }
+               }
+       }
+
+       /*
+        * If the process latch got set, we must schedule a callback for every
+        * requestee that cares about it.  As above, count a request only when
+        * its flag actually changes, so that es_async_callback_pending stays
+        * in step with the flags.
+        *
+        * NOTE(review): the latch is not reset anywhere in this function;
+        * presumably the notified node is expected to do so - confirm.
+        */
+       if (process_latch_set)
+       {
+               for (i = 0; i < estate->es_num_pending_async; ++i)
+               {
+                       PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+                       if (!areq->wants_process_latch)
+                               continue;
+
+                       Assert(!areq->request_complete);
+                       if (!areq->callback_pending)
+                       {
+                               areq->callback_pending = true;
+                               estate->es_async_callback_pending++;
+                       }
+               }
+       }
+
+       return true;
+}
+
+/*
+ * Let an asynchronous node set up the file descriptor events it wants to
+ * wait for, dispatching on the requestee's node type.  The node-type
+ * specific callback is expected to make one or more calls of this form:
+ *
+ * AddWaitEventToSet(es->es_wait_event_set, events, fd, NULL, areq);
+ *
+ * The events should include only WL_SOCKET_READABLE or WL_SOCKET_WRITEABLE,
+ * and the number of calls should not exceed areq->num_fd_events (as
+ * previously set via ExecAsyncSetRequiredEvents).
+ *
+ * reinit is passed through to the callback unchanged.  Only
+ * ForeignScanState requestees are supported; any other node type raises an
+ * error.
+ */
+static void
+ExecAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+                                          bool reinit)
+{
+       NodeTag         tag = nodeTag(areq->requestee);
+
+       if (tag == T_ForeignScanState)
+               ExecAsyncForeignScanConfigureWait(estate, areq, reinit);
+       else
+               elog(ERROR, "unrecognized node type: %d",
+                       (int) tag);
+}
+
+/*
+ * Tell the asynchronous node that an event it was waiting for has occurred,
+ * dispatching on the requestee's node type.  Only ForeignScanState
+ * requestees are supported; any other node type raises an error.
+ */
+static void
+ExecAsyncNotify(EState *estate, PendingAsyncRequest *areq)
+{
+       NodeTag         tag = nodeTag(areq->requestee);
+
+       if (tag == T_ForeignScanState)
+               ExecAsyncForeignScanNotify(estate, areq);
+       else
+               elog(ERROR, "unrecognized node type: %d",
+                       (int) tag);
+}
+
+/*
+ * Hand a completed request back to the node that issued it, dispatching on
+ * the requestor's node type.  Only AppendState requestors are supported;
+ * any other node type raises an error.
+ */
+static void
+ExecAsyncResponse(EState *estate, PendingAsyncRequest *areq)
+{
+       NodeTag         tag = nodeTag(areq->requestor);
+
+       if (tag == T_AppendState)
+               ExecAsyncAppendResponse(estate, areq);
+       else
+               elog(ERROR, "unrecognized node type: %d",
+                       (int) tag);
+}
+
+/*
+ * Record what an asynchronous request needs to wait for.
+ *
+ * An executor node calls this to say that it must wait on one or more file
+ * descriptor events that can be registered on a WaitEventSet, and possibly
+ * on the process latch as well.
+ *
+ * num_fd_events is the maximum number of file descriptor events the node
+ * will wish to register; the executor-wide total in estate is adjusted by
+ * the difference from the request's previous value.  wants_process_latch
+ * says whether the node wants a callback when the process latch is set.
+ * force_reset should be true if the node can't reuse the WaitEventSet it
+ * most recently initialized, for example because it needs to drop a wait
+ * event from the set; in that case any existing set is freed.
+ */
+void
+ExecAsyncSetRequiredEvents(EState *estate, PendingAsyncRequest *areq,
+       int num_fd_events, bool wants_process_latch,
+       bool force_reset)
+{
+       int                     delta = num_fd_events - areq->num_fd_events;
+
+       /* Keep the executor-wide event count in step with this request. */
+       estate->es_total_fd_events += delta;
+       areq->num_fd_events = num_fd_events;
+       areq->wants_process_latch = wants_process_latch;
+
+       /* Nothing more to do unless an existing set has to be discarded. */
+       if (!force_reset || estate->es_wait_event_set == NULL)
+               return;
+
+       FreeWaitEventSet(estate->es_wait_event_set);
+       estate->es_wait_event_set = NULL;
+}
+
+/*
+ * Deliver a result for an asynchronous request.
+ *
+ * An async-capable node calls this to pass the tuple to the node which
+ * requested it, either straight from its ExecAsyncRequest callback when
+ * the tuple is available immediately, or later from its ExecAsyncNotify
+ * callback.  The result is stored in the request, the request is marked
+ * complete, and a callback is scheduled for it if none is pending yet.
+ */
+void
+ExecAsyncRequestDone(EState *estate, PendingAsyncRequest *areq, Node *result)
+{
+       bool            was_waiting;
+
+       /*
+        * A completed request may no longer wait for any events.  Dropping its
+        * events forces es_wait_event_set to be rebuilt each time a previously
+        * waiting request stops waiting.  Deferring that until the next actual
+        * wait might pay off, since the same node may well be asked again
+        * before any wait happens, but it would mean tracking which nodes need
+        * a callback to remove their registered wait events.  It's not clear
+        * that we would come out ahead, so use brute force for now.
+        */
+       was_waiting = (areq->num_fd_events > 0 || areq->wants_process_latch);
+       if (was_waiting)
+               ExecAsyncSetRequiredEvents(estate, areq, 0, false, true);
+
+       /* Stash the result and flag the request as finished. */
+       areq->result = result;
+       areq->request_complete = true;
+
+       /* Schedule a callback, unless one is already pending. */
+       if (areq->callback_pending)
+               return;
+
+       areq->callback_pending = true;
+       estate->es_async_callback_pending++;
+}
index a26bd6354c1183ebe71408412cb35f67e9b8f898..bb06569fd1a6c61d88b5ebb83dee854cf67344e8 100644 (file)
@@ -58,6 +58,7 @@
 #include "postgres.h"
 
 #include "executor/execdebug.h"
+#include "executor/execAsync.h"
 #include "executor/nodeAppend.h"
 
 static bool exec_append_initialize_next(AppendState *appendstate);
@@ -79,16 +80,21 @@ exec_append_initialize_next(AppendState *appendstate)
        /*
         * get information from the append node
         */
-       whichplan = appendstate->as_whichplan;
+       whichplan = appendstate->as_whichsyncplan;
 
-       if (whichplan < 0)
+       /*
+        * This routine is only responsible for setting up for nodes being scanned
+        * synchronously, so the first node we can scan is given by nasyncplans
+        * and the last is given by as_nplans - 1.
+        */
+       if (whichplan < appendstate->as_nasyncplans)
        {
                /*
                 * if scanning in reverse, we start at the last scan in the list and
                 * then proceed back to the first.. in any case we inform ExecAppend
                 * that we are at the end of the line by returning FALSE
                 */
-               appendstate->as_whichplan = 0;
+               appendstate->as_whichsyncplan = appendstate->as_nasyncplans;
                return FALSE;
        }
        else if (whichplan >= appendstate->as_nplans)
@@ -96,7 +102,7 @@ exec_append_initialize_next(AppendState *appendstate)
                /*
                 * as above, end the scan if we go beyond the last scan in our list..
                 */
-               appendstate->as_whichplan = appendstate->as_nplans - 1;
+               appendstate->as_whichsyncplan = appendstate->as_nplans - 1;
                return FALSE;
        }
        else
@@ -142,6 +148,15 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
        appendstate->ps.state = estate;
        appendstate->appendplans = appendplanstates;
        appendstate->as_nplans = nplans;
+       appendstate->as_nasyncplans = node->nasyncplans;
+       appendstate->as_syncdone = (node->nasyncplans == nplans);
+       appendstate->as_asyncresult = (TupleTableSlot **)
+               palloc0(node->nasyncplans * sizeof(TupleTableSlot *));
+
+       /* initially, all async requests need a request */
+       for (i = 0; i < appendstate->as_nasyncplans; ++i)
+               appendstate->as_needrequest =
+                       bms_add_member(appendstate->as_needrequest, i);
 
        /*
         * Miscellaneous initialization
@@ -176,9 +191,9 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
        appendstate->ps.ps_ProjInfo = NULL;
 
        /*
-        * initialize to scan first subplan
+        * initialize to scan first synchronous subplan
         */
-       appendstate->as_whichplan = 0;
+       appendstate->as_whichsyncplan = appendstate->as_nasyncplans;
        exec_append_initialize_next(appendstate);
 
        return appendstate;
@@ -193,15 +208,78 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
 TupleTableSlot *
 ExecAppend(AppendState *node)
 {
+       if (node->as_nasyncplans > 0)
+       {
+               EState *estate = node->ps.state;
+               int     i;
+
+               /*
+                * If there are any asynchronously-generated results that have
+                * not yet been returned, return one of them.
+                */
+               if (node->as_nasyncresult > 0)
+               {
+                       --node->as_nasyncresult;
+                       return node->as_asyncresult[node->as_nasyncresult];
+               }
+
+               /*
+                * If there are any nodes that need a new asynchronous request,
+                * make all of them.
+                */
+               while ((i = bms_first_member(node->as_needrequest)) >= 0)
+               {
+                       ExecAsyncRequest(estate, &node->ps, i, node->appendplans[i]);
+                       node->as_nasyncpending++;
+               }
+       }
+
        for (;;)
        {
                PlanState  *subnode;
                TupleTableSlot *result;
 
                /*
-                * figure out which subplan we are currently processing
+                * if we have async requests outstanding, run the event loop
                 */
-               subnode = node->appendplans[node->as_whichplan];
+               if (node->as_nasyncpending > 0)
+               {
+                       long    timeout = node->as_syncdone ? -1 : 0;
+
+                       for (;;)
+                       {
+                               if (node->as_nasyncpending == 0)
+                               {
+                                       /*
+                                        * If there is no asynchronous activity still pending
+                                        * and the synchronous activity is also complete, we're
+                                        * totally done scanning this node.  Otherwise, we're
+                                        * done with the asynchronous stuff but must continue
+                                        * scanning the synchronous children.
+                                        */
+                                       if (node->as_syncdone)
+                                               return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+                                       break;
+                               }
+                               if (!ExecAsyncEventLoop(node->ps.state, &node->ps, timeout))
+                               {
+                                       /* Timeout reached. */
+                                       break;
+                               }
+                               if (node->as_nasyncresult > 0)
+                               {
+                                       /* Asynchronous subplan returned a tuple! */
+                                       --node->as_nasyncresult;
+                                       return node->as_asyncresult[node->as_nasyncresult];
+                               }
+                       }
+               }
+
+               /*
+                * figure out which synchronous subplan we are currently processing
+                */
+               Assert(!node->as_syncdone);
+               subnode = node->appendplans[node->as_whichsyncplan];
 
                /*
                 * get a tuple from the subplan
@@ -221,14 +299,21 @@ ExecAppend(AppendState *node)
                /*
                 * Go on to the "next" subplan in the appropriate direction. If no
                 * more subplans, return the empty slot set up for us by
-                * ExecInitAppend.
+                * ExecInitAppend, unless there are async plans we have yet to finish.
                 */
                if (ScanDirectionIsForward(node->ps.state->es_direction))
-                       node->as_whichplan++;
+                       node->as_whichsyncplan++;
                else
-                       node->as_whichplan--;
+                       node->as_whichsyncplan--;
                if (!exec_append_initialize_next(node))
-                       return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+               {
+                       node->as_syncdone = true;
+                       if (node->as_nasyncpending == 0)
+                       {
+                               Assert(bms_is_empty(node->as_needrequest));
+                               return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+                       }
+               }
 
                /* Else loop back and try to get a tuple from the new subplan */
        }
@@ -267,6 +352,16 @@ ExecReScanAppend(AppendState *node)
 {
        int                     i;
 
+       /*
+        * XXX. Cancel outstanding asynchronous tuple requests here! (How?)
+        */
+
+       /* Reset async state. */
+       for (i = 0; i < node->as_nasyncplans; ++i)
+               node->as_needrequest = bms_add_member(node->as_needrequest, i);
+       node->as_nasyncresult = 0;
+       node->as_syncdone = (node->as_nasyncplans == node->as_nplans);
+
        for (i = 0; i < node->as_nplans; i++)
        {
                PlanState  *subnode = node->appendplans[i];
@@ -285,6 +380,47 @@ ExecReScanAppend(AppendState *node)
                if (subnode->chgParam == NULL)
                        ExecReScan(subnode);
        }
-       node->as_whichplan = 0;
+       node->as_whichsyncplan = node->as_nasyncplans;
        exec_append_initialize_next(node);
 }
+
+/* ----------------------------------------------------------------
+ *             ExecAsyncAppendResponse
+ *
+ *             Receive a response from an asynchronous request we made.
+ *
+ *             areq->requestor is the AppendState that issued the request.
+ *             A non-empty result slot is stashed in as_asyncresult so that
+ *             ExecAppend can hand it back later, and the child identified by
+ *             areq->request_index is flagged in as_needrequest.  A NULL or
+ *             empty result only decrements the count of pending requests.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncAppendResponse(EState *estate, PendingAsyncRequest *areq)
+{
+       AppendState *node = (AppendState *) areq->requestor;
+       TupleTableSlot *slot;
+
+       /* We shouldn't be called until the request is complete. */
+       Assert(areq->request_complete);
+
+       /* Our result slot shouldn't already be occupied. */
+       Assert(TupIsNull(node->ps.ps_ResultTupleSlot));
+
+       /* Result should be a TupleTableSlot or NULL. */
+       slot = (TupleTableSlot *) areq->result;
+       Assert(slot == NULL || IsA(slot, TupleTableSlot));
+
+       /* Request is no longer pending. */
+       Assert(node->as_nasyncpending > 0);
+       --node->as_nasyncpending;
+
+       /* If the result is NULL or an empty slot, there's nothing more to do. */
+       if (TupIsNull(slot))
+               return;
+
+       /* Save result so we can return it. */
+       Assert(node->as_nasyncresult < node->as_nasyncplans);
+       node->as_asyncresult[node->as_nasyncresult++] = slot;
+
+       /*
+        * Mark the node that returned a result as ready for a new request.  We
+        * don't launch another one here immediately because it might complete.
+        *
+        * bms_add_member may allocate or enlarge the set and return a different
+        * pointer, so its result must be stored back into as_needrequest.
+        */
+       node->as_needrequest = bms_add_member(node->as_needrequest,
+                                                                                 areq->request_index);
+}
index d886aaf64d6776252020622ddd454633791f1f25..85d436f2f2ac304bcbfeb772a4ef3a27f1ed5c92 100644 (file)
@@ -355,3 +355,52 @@ ExecForeignScanInitializeWorker(ForeignScanState *node, shm_toc *toc)
                fdwroutine->InitializeWorkerForeignScan(node, toc, coordinate);
        }
 }
+
+/* ----------------------------------------------------------------
+ *             ExecAsyncForeignScanRequest
+ *
+ *             Hand a new asynchronous request to the FDW serving the
+ *             requestee ForeignScan node, via its ForeignAsyncRequest
+ *             callback.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanRequest(EState *estate, PendingAsyncRequest *areq)
+{
+       FdwRoutine *routine = ((ForeignScanState *) areq->requestee)->fdwroutine;
+
+       /* The callback must be provided for an async-capable scan. */
+       Assert(routine->ForeignAsyncRequest != NULL);
+       routine->ForeignAsyncRequest(estate, areq);
+}
+
+/* ----------------------------------------------------------------
+ *             ExecAsyncForeignScanConfigureWait
+ *
+ *             In async mode, ask the FDW serving the requestee ForeignScan
+ *             node to configure for a wait, via its
+ *             ForeignAsyncConfigureWait callback.  'reinit' is passed
+ *             through unchanged.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanConfigureWait(EState *estate,
+       PendingAsyncRequest *areq, bool reinit)
+{
+       FdwRoutine *routine = ((ForeignScanState *) areq->requestee)->fdwroutine;
+
+       /* The callback must be provided for an async-capable scan. */
+       Assert(routine->ForeignAsyncConfigureWait != NULL);
+       routine->ForeignAsyncConfigureWait(estate, areq, reinit);
+}
+
+/* ----------------------------------------------------------------
+ *             ExecAsyncForeignScanNotify
+ *
+ *             Event loop callback: forward the notification to the FDW
+ *             serving the requestee ForeignScan node, via its
+ *             ForeignAsyncNotify callback.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanNotify(EState *estate, PendingAsyncRequest *areq)
+{
+       FdwRoutine *routine = ((ForeignScanState *) areq->requestee)->fdwroutine;
+
+       /* The callback must be provided for an async-capable scan. */
+       Assert(routine->ForeignAsyncNotify != NULL);
+       routine->ForeignAsyncNotify(estate, areq);
+}
index 71714bc1d6709617787f06842c4690d6d0edf54d..23b4e1833435f1221a3bad0c175efbdaa02c4b87 100644 (file)
@@ -218,6 +218,7 @@ _copyAppend(const Append *from)
         * copy remainder of node
         */
        COPY_NODE_FIELD(appendplans);
+       COPY_SCALAR_FIELD(nasyncplans);
 
        return newnode;
 }
index ae869547f357da2944e3cf122ec6fd20026622ee..dc5b938530a0729303f60a9a1a867a9c0ec3767d 100644 (file)
@@ -359,6 +359,7 @@ _outAppend(StringInfo str, const Append *node)
        _outPlanInfo(str, (const Plan *) node);
 
        WRITE_NODE_FIELD(appendplans);
+       WRITE_INT_FIELD(nasyncplans);
 }
 
 static void
index 917e6c8a65efe96aa84e51780f702376bb70473c..69453b5f3e5a76aedcfc55cc05c9a8e7622fa43e 100644 (file)
@@ -1519,6 +1519,7 @@ _readAppend(void)
        ReadCommonPlan(&local_node->plan);
 
        READ_NODE_FIELD(appendplans);
+       READ_INT_FIELD(nasyncplans);
 
        READ_DONE();
 }
index 47158f646802aaeab776192b98329a940bbdbea3..e7e55c01caf48b6aac98a2849e9c19439850d267 100644 (file)
@@ -193,7 +193,7 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual,
                         Index scanrelid, int ctePlanId, int cteParam);
 static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
                                   Index scanrelid, int wtParam);
-static Append *make_append(List *appendplans, List *tlist);
+static Append *make_append(List *appendplans, int nasyncplans, List *tlist);
 static RecursiveUnion *make_recursive_union(List *tlist,
                                         Plan *lefttree,
                                         Plan *righttree,
@@ -270,6 +270,7 @@ static ModifyTable *make_modifytable(PlannerInfo *root,
                                 List *resultRelations, List *subplans,
                                 List *withCheckOptionLists, List *returningLists,
                                 List *rowMarks, OnConflictExpr *onconflict, int epqParam);
+static bool is_async_capable_path(Path *path);
 
 
 /*
@@ -955,8 +956,10 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
 {
        Append     *plan;
        List       *tlist = build_path_tlist(root, &best_path->path);
-       List       *subplans = NIL;
+       List       *asyncplans = NIL;
+       List       *syncplans = NIL;
        ListCell   *subpaths;
+       int                     nasyncplans = 0;
 
        /*
         * The subpaths list could be empty, if every child was proven empty by
@@ -991,7 +994,14 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
                /* Must insist that all children return the same tlist */
                subplan = create_plan_recurse(root, subpath, CP_EXACT_TLIST);
 
-               subplans = lappend(subplans, subplan);
+               /* Classify as async-capable or not */
+               if (is_async_capable_path(subpath))
+               {
+                       asyncplans = lappend(asyncplans, subplan);
+                       ++nasyncplans;
+               }
+               else
+                       syncplans = lappend(syncplans, subplan);
        }
 
        /*
@@ -1001,7 +1011,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
         * parent-rel Vars it'll be asked to emit.
         */
 
-       plan = make_append(subplans, tlist);
+       plan = make_append(list_concat(asyncplans, syncplans), nasyncplans, tlist);
 
        copy_generic_path_info(&plan->plan, (Path *) best_path);
 
@@ -4934,7 +4944,7 @@ make_foreignscan(List *qptlist,
 }
 
 static Append *
-make_append(List *appendplans, List *tlist)
+make_append(List *appendplans, int nasyncplans, List *tlist)
 {
        Append     *node = makeNode(Append);
        Plan       *plan = &node->plan;
@@ -4944,6 +4954,7 @@ make_append(List *appendplans, List *tlist)
        plan->lefttree = NULL;
        plan->righttree = NULL;
        node->appendplans = appendplans;
+       node->nasyncplans = nasyncplans;
 
        return node;
 }
@@ -6218,3 +6229,27 @@ is_projection_capable_plan(Plan *plan)
        }
        return true;
 }
+
+/*
+ * is_async_capable_path
+ *             Check whether a given Path node is async-capable.
+ *
+ * Only ForeignPaths can currently qualify, and only when the FDW supplies
+ * an IsForeignPathAsyncCapable callback that returns true for the path.
+ * Every other path type is reported as not async-capable.
+ */
+static bool
+is_async_capable_path(Path *path)
+{
+       switch (nodeTag(path))
+       {
+               case T_ForeignPath:
+                       {
+                               FdwRoutine *fdwroutine = path->parent->fdwroutine;
+
+                               Assert(fdwroutine != NULL);
+                               /* The callback is optional; a missing one means "no". */
+                               if (fdwroutine->IsForeignPathAsyncCapable != NULL &&
+                                       fdwroutine->IsForeignPathAsyncCapable((ForeignPath *) path))
+                                       return true;
+                       }
+                       break;
+               default:
+                       break;
+       }
+       return false;
+}
diff --git a/src/include/executor/execAsync.h b/src/include/executor/execAsync.h
new file mode 100644 (file)
index 0000000..2abc32d
--- /dev/null
@@ -0,0 +1,29 @@
+/*--------------------------------------------------------------------
+ * execAsync.h
+ *             Support functions for asynchronous query execution
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *             src/include/executor/execAsync.h
+ *--------------------------------------------------------------------
+ */
+
+#ifndef EXECASYNC_H
+#define EXECASYNC_H
+
+#include "nodes/execnodes.h"
+
+/*
+ * Entry points used by a requesting node (for example Append).
+ *
+ * ExecAsyncRequest registers a request from 'requestor' to 'requestee';
+ * 'request_index' is carried along for the requestor's own bookkeeping.
+ * ExecAsyncEventLoop returns false when 'timeout' is reached (callers
+ * pass -1 or 0 -- presumably "wait indefinitely" and "don't wait";
+ * TODO confirm against execAsync.c).
+ */
+extern void ExecAsyncRequest(EState *estate, PlanState *requestor,
+               int request_index, PlanState *requestee);
+extern bool ExecAsyncEventLoop(EState *estate, PlanState *requestor,
+                               long timeout);
+
+/*
+ * Entry points presumably intended for the requestee side: declaring
+ * which events a pending request needs to wait for, and reporting a
+ * completed request together with its result -- TODO confirm against
+ * execAsync.c.
+ */
+extern void ExecAsyncSetRequiredEvents(EState *estate,
+       PendingAsyncRequest *areq, int num_fd_events,
+       bool wants_process_latch, bool force_reset);
+extern void ExecAsyncRequestDone(EState *estate,
+       PendingAsyncRequest *areq, Node *result);
+
+#endif   /* EXECASYNC_H */
index 51c381ee88b884ea7727c0a2a3650610ed2e716b..81a079d01c18ff28abf86590172d46c82bc22d75 100644 (file)
@@ -21,4 +21,7 @@ extern TupleTableSlot *ExecAppend(AppendState *node);
 extern void ExecEndAppend(AppendState *node);
 extern void ExecReScanAppend(AppendState *node);
 
+extern void ExecAsyncAppendResponse(EState *estate,
+       PendingAsyncRequest *areq);
+
 #endif   /* NODEAPPEND_H */
index 0cdec4e843e87eea83c0ef4ca19957a19a84ed16..3e69ab0f7cae5a1ba6d235355e111facc2fa4862 100644 (file)
@@ -29,4 +29,11 @@ extern void ExecForeignScanInitializeDSM(ForeignScanState *node,
 extern void ExecForeignScanInitializeWorker(ForeignScanState *node,
                                                                shm_toc *toc);
 
+extern void ExecAsyncForeignScanRequest(EState *estate,
+       PendingAsyncRequest *areq);
+extern void ExecAsyncForeignScanConfigureWait(EState *estate,
+       PendingAsyncRequest *areq, bool reinit);
+extern void ExecAsyncForeignScanNotify(EState *estate,
+       PendingAsyncRequest *areq);
+
 #endif   /* NODEFOREIGNSCAN_H */
index e1b0d0da7df0a7163ca979549560e5db6f9461df..88feb9abec557004c47d006b184a1ea01ece9ec8 100644 (file)
@@ -155,6 +155,15 @@ typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root,
                                                                                                                         RelOptInfo *rel,
                                                                                                                 RangeTblEntry *rte);
 
+/*
+ * Callback signatures for asynchronous execution.
+ *
+ * IsForeignPathAsyncCapable is optional: the planner treats a NULL
+ * pointer as "not async-capable".  The other three are invoked by the
+ * ExecAsyncForeignScan* executor wrappers, which assert that they are
+ * non-NULL before calling them.
+ */
+typedef bool (*IsForeignPathAsyncCapable_function) (ForeignPath *path);
+typedef void (*ForeignAsyncRequest_function) (EState *estate,
+                                                                                       PendingAsyncRequest *areq);
+typedef void (*ForeignAsyncConfigureWait_function) (EState *estate,
+                                                                                       PendingAsyncRequest *areq,
+                                                                                       bool reinit);
+typedef void (*ForeignAsyncNotify_function) (EState *estate,
+                                                                                       PendingAsyncRequest *areq);
+
 /*
  * FdwRoutine is the struct returned by a foreign-data wrapper's handler
  * function.  It provides pointers to the callback functions needed by the
@@ -224,6 +233,12 @@ typedef struct FdwRoutine
        EstimateDSMForeignScan_function EstimateDSMForeignScan;
        InitializeDSMForeignScan_function InitializeDSMForeignScan;
        InitializeWorkerForeignScan_function InitializeWorkerForeignScan;
+
+       /* Support functions for asynchronous execution */
+       IsForeignPathAsyncCapable_function IsForeignPathAsyncCapable;
+       ForeignAsyncRequest_function ForeignAsyncRequest;
+       ForeignAsyncConfigureWait_function ForeignAsyncConfigureWait;
+       ForeignAsyncNotify_function ForeignAsyncNotify;
 } FdwRoutine;
 
 
index 4fa366178f5ee14f4fa3aef56454c54eae731218..e5282b58eb09e871cb7616afabcc4572b4669735 100644 (file)
@@ -346,6 +346,25 @@ typedef struct ResultRelInfo
        List       *ri_onConflictSetWhere;
 } ResultRelInfo;
 
+/* ----------------
+ *       PendingAsyncRequest
+ *
+ * State for an asynchronous tuple request.
+ *
+ * The requestor is the node that receives the response (for example an
+ * Append); the requestee is the node asked to produce a tuple (for
+ * example a ForeignScan).  Once request_complete is set, result holds
+ * the answer, with NULL meaning the requestee has no more tuples.
+ * EState.es_async_callback_pending counts the requests whose
+ * callback_pending flag is true.
+ * ----------------
+ */
+typedef struct PendingAsyncRequest
+{
+       int                     myindex;                        /* Index in es_pending_async. */
+       struct PlanState *requestor;    /* Node that wants a tuple. */
+       struct PlanState *requestee;    /* Node from which a tuple is wanted. */
+       int                     request_index;  /* Scratch space for requestor. */
+       int                     num_fd_events;  /* Max number of FD events requestee needs. */
+       bool            wants_process_latch;    /* Requestee cares about MyLatch. */
+       bool            callback_pending;                       /* Callback is needed. */
+       bool            request_complete;               /* Request complete, result valid. */
+       Node       *result;                     /* Result (NULL if no more tuples). */
+} PendingAsyncRequest;
+
 /* ----------------
  *       EState information
  *
@@ -422,6 +441,31 @@ typedef struct EState
        HeapTuple  *es_epqTuple;        /* array of EPQ substitute tuples */
        bool       *es_epqTupleSet; /* true if EPQ tuple is provided */
        bool       *es_epqScanDone; /* true if EPQ tuple has been fetched */
+
+       /*
+        * Support for asynchronous execution.
+        *
+        * es_max_pending_async is the allocated size of es_pending_async, and
+        * es_num_pending_async is the number of entries that are currently valid.
+        * (Entries after that may point to storage that can be reused.)
+        * es_async_callback_pending is the number of PendingAsyncRequests for
+        * which callback_pending is true.
+        *
+        * es_total_fd_events is the total number of FD events needed by all
+        * pending async nodes, and es_allocated_fd_events is the number any
+        * current wait event set was allocated to handle.  es_wait_event_set, if
+        * non-NULL, is a previously allocated event set that may be reusable by a
+        * future wait provided that nothing's been removed and not too many more
+        * events have been added.
+        */
+       int                     es_num_pending_async;
+       int                     es_max_pending_async;
+       int                     es_async_callback_pending;
+       PendingAsyncRequest **es_pending_async;
+
+       int                     es_total_fd_events;
+       int                     es_allocated_fd_events;
+       struct WaitEventSet *es_wait_event_set;
 } EState;
 
 
@@ -1141,17 +1185,20 @@ typedef struct ModifyTableState
 
 /* ----------------
  *      AppendState information
- *
- *             nplans                  how many plans are in the array
- *             whichplan               which plan is being executed (0 .. n-1)
  * ----------------
  */
 typedef struct AppendState
 {
        PlanState       ps;                             /* its first field is NodeTag */
        PlanState **appendplans;        /* array of PlanStates for my inputs */
-       int                     as_nplans;
-       int                     as_whichplan;
+       int                     as_nplans;              /* total # of children */
+       int                     as_nasyncplans; /* # of async-capable children */
+       int                     as_whichsyncplan; /* which sync plan is being executed  */
+       bool            as_syncdone;    /* all synchronous plans done? */
+       Bitmapset  *as_needrequest;     /* async plans needing a new request */
+       TupleTableSlot **as_asyncresult;        /* unreturned results of async plans */
+       int                     as_nasyncresult;        /* # of valid entries in as_asyncresult */
+       int                     as_nasyncpending;       /* # of outstanding async requests */
 } AppendState;
 
 /* ----------------
index e2fbc7d5a7844e46442b8a5befb88dc4ca912a53..327119b53f6ee895f12169c4ed84968fd9d362a1 100644 (file)
@@ -208,6 +208,7 @@ typedef struct Append
 {
        Plan            plan;
        List       *appendplans;
+       int                     nasyncplans;    /* # of async plans, always at start of list */
 } Append;
 
 /* ----------------