#include "commands/defrem.h"
#include "commands/explain.h"
#include "commands/vacuum.h"
+#include "executor/execAsync.h"
#include "foreign/fdwapi.h"
#include "funcapi.h"
#include "miscadmin.h"
JoinPathExtraData *extra);
static bool postgresRecheckForeignScan(ForeignScanState *node,
TupleTableSlot *slot);
+static bool postgresIsForeignPathAsyncCapable(ForeignPath *path);
+static void postgresForeignAsyncRequest(EState *estate,
+ PendingAsyncRequest *areq);
+static void postgresForeignAsyncConfigureWait(EState *estate,
+ PendingAsyncRequest *areq,
+ bool reinit);
+static void postgresForeignAsyncNotify(EState *estate,
+ PendingAsyncRequest *areq);
/*
* Helper functions
/* Support functions for join push-down */
routine->GetForeignJoinPaths = postgresGetForeignJoinPaths;
+ /* Support functions for async execution */
+ routine->IsForeignPathAsyncCapable = postgresIsForeignPathAsyncCapable;
+ routine->ForeignAsyncRequest = postgresForeignAsyncRequest;
+ routine->ForeignAsyncConfigureWait = postgresForeignAsyncConfigureWait;
+ routine->ForeignAsyncNotify = postgresForeignAsyncNotify;
+
PG_RETURN_POINTER(routine);
}
/* XXX Consider parameterized paths for the join relation */
}
+/*
+ * postgresIsForeignPathAsyncCapable
+ * Report whether a foreign path can be executed asynchronously.
+ *
+ * This implementation answers true unconditionally; the path argument is
+ * not examined.
+ */
+static bool
+postgresIsForeignPathAsyncCapable(ForeignPath *path)
+{
+ return true;
+}
+
+/*
+ * postgresForeignAsyncRequest
+ *		Service an asynchronous tuple request aimed at a postgres_fdw scan.
+ *
+ * XXX. Testing stub: the request travels through the async machinery, but
+ * the tuple itself is fetched synchronously and the request is reported as
+ * complete right away.
+ */
+static void
+postgresForeignAsyncRequest(EState *estate, PendingAsyncRequest *areq)
+{
+	ForeignScanState *fsstate = (ForeignScanState *) areq->requestee;
+
+	Assert(IsA(fsstate, ForeignScanState));
+
+	/* Fetch the next tuple the ordinary way and hand it straight back. */
+	ExecAsyncRequestDone(estate, areq,
+						 (Node *) postgresIterateForeignScan(fsstate));
+}
+
+/*
+ * postgresForeignAsyncConfigureWait
+ * Wait-configuration callback for asynchronous execution.
+ *
+ * Not implemented: this always raises an ERROR. NOTE(review): presumably
+ * it is never reached, because postgresForeignAsyncRequest completes every
+ * request immediately and therefore never asks to wait on FD events --
+ * confirm.
+ */
+static void
+postgresForeignAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+ bool reinit)
+{
+ elog(ERROR, "postgresForeignAsyncConfigureWait");
+}
+
+/*
+ * postgresForeignAsyncNotify
+ * Event-notification callback for asynchronous execution.
+ *
+ * Not implemented: this always raises an ERROR. NOTE(review): presumably
+ * it is never reached, because postgresForeignAsyncRequest marks every
+ * request complete before the event loop could notify it -- confirm.
+ */
+static void
+postgresForeignAsyncNotify(EState *estate, PendingAsyncRequest *areq)
+{
+ elog(ERROR, "postgresForeignAsyncNotify");
+}
+
/*
* Create a tuple from the specified row of the PGresult.
*
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \
- execMain.o execParallel.o execProcnode.o execQual.o \
+OBJS = execAmi.o execAsync.o execCurrent.o execGrouping.o execIndexing.o \
+ execJunk.o execMain.o execParallel.o execProcnode.o execQual.o \
execScan.o execTuples.o \
execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
nodeBitmapAnd.o nodeBitmapOr.o \
SRFs are disallowed in an UPDATE's targetlist. There, they would have the
effect of the same row being updated multiple times, which is not very
useful --- and updates after the first would have no effect anyway.
+
+Asynchronous Execution
+----------------------
+
+In certain cases, it's desirable for a node to indicate that it cannot
+return any tuple immediately but may be able to do so at a later time.  This
+might be either because the node is waiting on an event external to the
+database system, such as a ForeignScan awaiting network I/O, or because
+the node is waiting for an event internal to the database system - e.g.
+one process involved in a parallel query may find that it cannot progress
+a certain parallel operation until some other process reaches a certain
+point in the computation. A process which discovers this type of situation
+can always handle it simply by blocking, but this may waste time that could
+be spent executing some other part of the plan where progress could be
+made immediately. This is particularly likely to occur when the plan
+contains an Append node.
+
+To use asynchronous execution, a node must first request a tuple from an
+async-capable child node using ExecAsyncRequest. Next, it must execute
+the asynchronous event loop using ExecAsyncEventLoop; it can avoid giving
+up control indefinitely by passing a timeout to this function, even passing
+0 to poll for events without blocking (a timeout of -1 waits indefinitely).
+Eventually, when a node to which an
+asynchronous request has been made produces a tuple, the requesting node
+will receive a callback from the event loop via ExecAsyncResponse. Typically,
+the ExecAsyncResponse callback is the only one required for nodes that wish
+to request tuples asynchronously.
+
+On the other hand, nodes that wish to produce tuples asynchronously
+generally need to implement three methods:
+
+1. When an asynchronous request is made, the node's ExecAsyncRequest callback
+will be invoked; it should use ExecAsyncSetRequiredEvents to indicate the
+number of file descriptor events for which it wishes to wait and whether it
+wishes to receive a callback when the process latch is set. Alternatively,
+it can instead use ExecAsyncRequestDone if a result is available immediately.
+
+2. When the event loop wishes to wait or poll for file descriptor events and
+the process latch, the ExecAsyncConfigureWait callback is invoked to configure
+the file descriptor wait events for which the node wishes to wait. This
+callback isn't needed if the node only cares about the process latch.
+
+3. When file descriptors or the process latch become ready, the node's
+ExecAsyncNotify callback is invoked.
{
ListCell *l;
+ /* With async, tuples may be interleaved, so can't back up. */
+ if (((Append *) node)->nasyncplans != 0)
+ return false;
+
foreach(l, ((Append *) node)->appendplans)
{
if (!ExecSupportsBackwardScan((Plan *) lfirst(l)))
return false;
}
+
/* need not check tlist because Append doesn't evaluate it */
return true;
}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * execAsync.c
+ * Support routines for asynchronous execution.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execAsync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execAsync.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeForeignscan.h"
+#include "miscadmin.h"
+#include "storage/latch.h"
+
+static bool ExecAsyncEventWait(EState *estate, long timeout);
+static void ExecAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+ bool reinit);
+static void ExecAsyncNotify(EState *estate, PendingAsyncRequest *areq);
+static void ExecAsyncResponse(EState *estate, PendingAsyncRequest *areq);
+
+#define EVENT_BUFFER_SIZE 16
+
+/*
+ * ExecAsyncRequest
+ *		Asynchronously request a tuple from a designated async-aware node.
+ *
+ * 'requestor' is the node that wants the tuple and 'requestee' is the node
+ * expected to produce it.  'request_index' is an arbitrary integer chosen by
+ * the requestor; it is stored in the request so that a requestor with
+ * several children can tell which one is delivering a tuple.
+ *
+ * Raises an ERROR if the requestee's node type has no async support.
+ */
+void
+ExecAsyncRequest(EState *estate, PlanState *requestor, int request_index,
+				 PlanState *requestee)
+{
+	PendingAsyncRequest *areq;
+	int			slotno = estate->es_num_pending_async;
+
+	/*
+	 * Make sure es_pending_async has room for one more entry.  The array
+	 * starts out with 16 slots and doubles whenever it fills up; slots that
+	 * have never been used are kept NULL.
+	 */
+	if (slotno >= estate->es_max_pending_async)
+	{
+		int			oldmax = estate->es_max_pending_async;
+		int			newmax;
+
+		if (oldmax == 0)
+		{
+			newmax = 16;
+			estate->es_pending_async =
+				MemoryContextAllocZero(estate->es_query_cxt,
+									   newmax * sizeof(PendingAsyncRequest *));
+		}
+		else
+		{
+			newmax = oldmax * 2;
+			estate->es_pending_async =
+				repalloc(estate->es_pending_async,
+						 newmax * sizeof(PendingAsyncRequest *));
+			/* repalloc doesn't clear the added tail, so do it by hand. */
+			MemSet(&estate->es_pending_async[oldmax], 0,
+				   (newmax - oldmax) * sizeof(PendingAsyncRequest *));
+		}
+		estate->es_max_pending_async = newmax;
+	}
+
+	/*
+	 * Recycle the PendingAsyncRequest left in this slot by an earlier
+	 * request if there is one (saving palloc traffic); otherwise allocate a
+	 * fresh, zeroed one in the query context.
+	 */
+	areq = estate->es_pending_async[slotno];
+	if (areq != NULL)
+		MemSet(areq, 0, sizeof(PendingAsyncRequest));
+	else
+	{
+		areq = MemoryContextAllocZero(estate->es_query_cxt,
+									  sizeof(PendingAsyncRequest));
+		estate->es_pending_async[slotno] = areq;
+	}
+
+	/* The slot is now in use. */
+	areq->myindex = slotno;
+	estate->es_num_pending_async++;
+
+	/* Fill in the request details. */
+	areq->requestee = requestee;
+	areq->request_index = request_index;
+	areq->requestor = requestor;
+
+	/* Hand the request to the requestee's node-type-specific code. */
+	switch (nodeTag(requestee))
+	{
+		case T_ForeignScanState:
+			ExecAsyncForeignScanRequest(estate, areq);
+			break;
+		default:
+			/* Requestee isn't async-capable, so the caller messed up. */
+			elog(ERROR, "unrecognized node type: %d",
+				 (int) nodeTag(requestee));
+	}
+}
+
+/*
+ * ExecAsyncEventLoop
+ *		Run the asynchronous event loop on behalf of 'requestor'.
+ *
+ * The loop polls for events and delivers callbacks until either a result
+ * has been delivered to 'requestor' or the timeout expires.
+ *
+ * Timeout semantics (milliseconds):
+ *	 -1		no timeout; wait indefinitely until a result is ready for the
+ *			requestor
+ *	  0		never block; poll for events and fire callbacks for as long as
+ *			that can be done without blocking
+ *	 >0		block for at most this many milliseconds in total
+ *
+ * Returns true if a result was delivered to the requestor, false if the
+ * timeout was reached without delivering one.
+ */
+bool
+ExecAsyncEventLoop(EState *estate, PlanState *requestor, long timeout)
+{
+	instr_time	start_time;
+	long		cur_timeout = timeout;
+	bool		requestor_done = false;
+
+	Assert(requestor != NULL);
+
+	/*
+	 * If we plan to wait - but not indefinitely - we need to record the
+	 * current time so that the remaining timeout can be recomputed after
+	 * each wakeup.  start_time is only valid when timeout > 0.
+	 */
+	if (timeout > 0)
+		INSTR_TIME_SET_CURRENT(start_time);
+
+	/* Main event loop: poll for events, deliver notifications. */
+	for (;;)
+	{
+		int			i;
+		bool		any_node_done = false;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Check for events, but don't block if there are notifications that
+		 * have not been delivered yet.
+		 */
+		if (estate->es_async_callback_pending > 0)
+			ExecAsyncEventWait(estate, 0);
+		else if (!ExecAsyncEventWait(estate, cur_timeout))
+			cur_timeout = 0;	/* Timeout was reached. */
+		else if (timeout > 0)
+		{
+			instr_time	cur_time;
+
+			/*
+			 * We woke up early with a finite timeout: charge the elapsed
+			 * time against the caller's budget.  This must update the
+			 * function-level cur_timeout (not a shadowing local), or every
+			 * subsequent wait would get the full timeout again.
+			 */
+			INSTR_TIME_SET_CURRENT(cur_time);
+			INSTR_TIME_SUBTRACT(cur_time, start_time);
+			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+			if (cur_timeout < 0)
+				cur_timeout = 0;
+		}
+
+		/* Deliver notifications. */
+		for (i = 0; i < estate->es_num_pending_async; ++i)
+		{
+			PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+			/* Skip it if no callback is pending. */
+			if (!areq->callback_pending)
+				continue;
+
+			/*
+			 * Mark it as no longer needing a callback.  We must do this
+			 * before dispatching the callback in case the callback resets
+			 * the flag.
+			 */
+			areq->callback_pending = false;
+			estate->es_async_callback_pending--;
+
+			/*
+			 * Perform the actual callback: an incomplete request gets a
+			 * notification, a complete one gets its response delivered.
+			 * Set requestor_done if the response is for our requestor.
+			 */
+			if (!areq->request_complete)
+				ExecAsyncNotify(estate, areq);
+			else
+			{
+				any_node_done = true;
+				if (requestor == areq->requestor)
+					requestor_done = true;
+				ExecAsyncResponse(estate, areq);
+			}
+		}
+
+		/* If any node completed, compact the array. */
+		if (any_node_done)
+		{
+			int			hidx = 0,
+						tidx;
+
+			/*
+			 * Swap all not-yet-completed items to the start of the array,
+			 * keeping them in the same order.  Finished entries end up past
+			 * the new es_num_pending_async, where ExecAsyncRequest can
+			 * reuse their storage.
+			 */
+			for (tidx = 0; tidx < estate->es_num_pending_async; ++tidx)
+			{
+				PendingAsyncRequest *head;
+				PendingAsyncRequest *tail = estate->es_pending_async[tidx];
+
+				if (!tail->callback_pending && tail->request_complete)
+					continue;
+				head = estate->es_pending_async[hidx];
+				estate->es_pending_async[tidx] = head;
+				estate->es_pending_async[hidx] = tail;
+				/* Keep the entry's notion of its own position accurate. */
+				tail->myindex = hidx;
+				++hidx;
+			}
+			estate->es_num_pending_async = hidx;
+		}
+
+		/*
+		 * We only consider exiting the loop when no notifications are
+		 * pending.  Otherwise, each call to this function might advance
+		 * the computation by only a very small amount; to the contrary,
+		 * we want to push it forward as far as possible.
+		 */
+		if (estate->es_async_callback_pending == 0)
+		{
+			/* If requestor is ready, exit. */
+			if (requestor_done)
+				return true;
+			/* If timeout was 0 or has expired, exit. */
+			if (cur_timeout == 0)
+				return false;
+		}
+	}
+}
+
+/*
+ * ExecAsyncEventWait
+ *		Wait or poll for events on behalf of all pending async requests.
+ *
+ * As with ExecAsyncEventLoop, a timeout of -1 means wait forever, 0 means
+ * don't wait at all, and >0 means wait for at most the indicated number of
+ * milliseconds.
+ *
+ * Every request for which an event fired is flagged callback_pending, and
+ * es_async_callback_pending is kept equal to the number of flagged
+ * requests; the callbacks themselves are dispatched by the caller.
+ *
+ * Returns true if we found some events and false if we timed out.
+ */
+static bool
+ExecAsyncEventWait(EState *estate, long timeout)
+{
+	WaitEvent	occurred_event[EVENT_BUFFER_SIZE];
+	int			noccurred;
+	int			i;
+	int			n;
+	bool		reinit = false;
+	bool		process_latch_set = false;
+
+	/*
+	 * A previously built wait event set can only be reused if it has room
+	 * for every FD event currently required.  If requirements have grown
+	 * past what was allocated, throw it away and rebuild from scratch.
+	 */
+	if (estate->es_wait_event_set != NULL &&
+		estate->es_total_fd_events > estate->es_allocated_fd_events)
+	{
+		FreeWaitEventSet(estate->es_wait_event_set);
+		estate->es_wait_event_set = NULL;
+	}
+
+	if (estate->es_wait_event_set == NULL)
+	{
+		/*
+		 * Allow for a few extra events without reinitializing.  It
+		 * doesn't seem worth the complexity of doing anything very
+		 * aggressive here, because plans that depend on massive numbers
+		 * of external FDs are likely to run afoul of kernel limits anyway.
+		 * The "+ 1" leaves room for the process latch.
+		 */
+		estate->es_allocated_fd_events = estate->es_total_fd_events + 16;
+		estate->es_wait_event_set =
+			CreateWaitEventSet(estate->es_query_cxt,
+							   estate->es_allocated_fd_events + 1);
+		AddWaitEventToSet(estate->es_wait_event_set,
+						  WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+		reinit = true;
+	}
+
+	/* Give each waiting node a chance to add or modify events. */
+	for (i = 0; i < estate->es_num_pending_async; ++i)
+	{
+		PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+		if (areq->num_fd_events > 0)
+			ExecAsyncConfigureWait(estate, areq, reinit);
+	}
+
+	/* Wait for at least one event to occur. */
+	noccurred = WaitEventSetWait(estate->es_wait_event_set, timeout,
+								 occurred_event, EVENT_BUFFER_SIZE);
+	if (noccurred == 0)
+		return false;
+
+	/*
+	 * Loop over the occurred events and set the callback_pending flags
+	 * for the appropriate requests.  The waiting nodes should have
+	 * registered their wait events with user_data pointing back to the
+	 * PendingAsyncRequest, but the process latch needs special handling.
+	 */
+	for (n = 0; n < noccurred; ++n)
+	{
+		WaitEvent  *w = &occurred_event[n];
+
+		if ((w->events & WL_LATCH_SET) != 0)
+		{
+			/*
+			 * NOTE(review): the latch is not reset here; confirm that
+			 * whoever consumes the latch event is expected to call
+			 * ResetLatch, otherwise later waits will return immediately.
+			 */
+			process_latch_set = true;
+			continue;
+		}
+
+		if ((w->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != 0)
+		{
+			PendingAsyncRequest *areq = w->user_data;
+
+			if (!areq->callback_pending)
+			{
+				Assert(!areq->request_complete);
+				areq->callback_pending = true;
+				estate->es_async_callback_pending++;
+			}
+		}
+	}
+
+	/*
+	 * If the process latch got set, we must schedule a callback for every
+	 * requestee that cares about it.  Only requests not already flagged are
+	 * counted, so that es_async_callback_pending stays in step with the
+	 * callback_pending flags.
+	 */
+	if (process_latch_set)
+	{
+		for (i = 0; i < estate->es_num_pending_async; ++i)
+		{
+			PendingAsyncRequest *areq = estate->es_pending_async[i];
+
+			if (!areq->wants_process_latch)
+				continue;
+
+			Assert(!areq->request_complete);
+			if (!areq->callback_pending)
+			{
+				areq->callback_pending = true;
+				estate->es_async_callback_pending++;
+			}
+		}
+	}
+
+	return true;
+}
+
+/*
+ * ExecAsyncConfigureWait
+ *		Let an asynchronous node set up the file descriptor events it wants
+ *		to wait for.
+ *
+ * The node-type-specific callback is expected to make one or more calls of
+ * the following form:
+ *
+ * AddWaitEventToSet(es->es_wait_event_set, events, fd, NULL, areq);
+ *
+ * The events should include only WL_SOCKET_READABLE or WL_SOCKET_WRITEABLE,
+ * and the number of calls should not exceed areq->num_fd_events (as
+ * previously set via ExecAsyncSetRequiredEvents).
+ *
+ * Raises an ERROR for a requestee node type without async support.
+ */
+static void
+ExecAsyncConfigureWait(EState *estate, PendingAsyncRequest *areq,
+					   bool reinit)
+{
+	/* Foreign scans are the only requestees handled here. */
+	if (nodeTag(areq->requestee) != T_ForeignScanState)
+		elog(ERROR, "unrecognized node type: %d",
+			 (int) nodeTag(areq->requestee));
+
+	ExecAsyncForeignScanConfigureWait(estate, areq, reinit);
+}
+
+/*
+ * ExecAsyncNotify
+ *		Call the asynchronous node (the requestee) back once an event
+ *		relevant to it has occurred.
+ *
+ * Raises an ERROR for a requestee node type without async support.
+ */
+static void
+ExecAsyncNotify(EState *estate, PendingAsyncRequest *areq)
+{
+	/* Foreign scans are the only requestees handled here. */
+	if (nodeTag(areq->requestee) != T_ForeignScanState)
+		elog(ERROR, "unrecognized node type: %d",
+			 (int) nodeTag(areq->requestee));
+
+	ExecAsyncForeignScanNotify(estate, areq);
+}
+
+/*
+ * ExecAsyncResponse
+ *		Call the requestor back once the asynchronous node it asked has
+ *		produced a result.
+ *
+ * Raises an ERROR for a requestor node type that cannot accept responses.
+ */
+static void
+ExecAsyncResponse(EState *estate, PendingAsyncRequest *areq)
+{
+	/* Append is the only requestor handled here. */
+	if (nodeTag(areq->requestor) != T_AppendState)
+		elog(ERROR, "unrecognized node type: %d",
+			 (int) nodeTag(areq->requestor));
+
+	ExecAsyncAppendResponse(estate, areq);
+}
+
+/*
+ * ExecAsyncSetRequiredEvents
+ *		Declare what an asynchronous node needs to wait for.
+ *
+ * An executor node calls this to signal that it must wait on one or more
+ * file descriptor events that can be registered on a WaitEventSet, and
+ * possibly also on the process latch.
+ *
+ * num_fd_events is the maximum number of file descriptor events the node
+ * will wish to register.  wants_process_latch says whether the node wants a
+ * callback when the process latch is set.  force_reset should be true if
+ * the node can't reuse the WaitEventSet it most recently initialized, for
+ * example because it needs to drop a wait event from the set; the existing
+ * set, if any, is then freed.
+ */
+void
+ExecAsyncSetRequiredEvents(EState *estate, PendingAsyncRequest *areq,
+						   int num_fd_events, bool wants_process_latch,
+						   bool force_reset)
+{
+	int			delta = num_fd_events - areq->num_fd_events;
+
+	/* Keep the executor-wide FD event total in step with this request. */
+	estate->es_total_fd_events += delta;
+	areq->num_fd_events = num_fd_events;
+	areq->wants_process_latch = wants_process_latch;
+
+	/* Nothing more to do unless an existing set has to be thrown away. */
+	if (!force_reset || estate->es_wait_event_set == NULL)
+		return;
+
+	FreeWaitEventSet(estate->es_wait_event_set);
+	estate->es_wait_event_set = NULL;
+}
+
+/*
+ * ExecAsyncRequestDone
+ *		Deliver a result for an asynchronous request.
+ *
+ * An async-capable node calls this to hand the tuple to the node which
+ * requested it, either from its ExecAsyncRequest callback when the tuple is
+ * available immediately, or later from its ExecAsyncNotify callback.
+ * 'result' is stored in the request, which is marked complete and flagged
+ * for a callback.
+ */
+void
+ExecAsyncRequestDone(EState *estate, PendingAsyncRequest *areq, Node *result)
+{
+	bool		was_waiting;
+
+	/*
+	 * A completed request may not wait for any events, so withdraw whatever
+	 * the requestee had registered.  This forces es_wait_event_set to be
+	 * rebuilt every time a previously waiting node stops waiting.  The
+	 * rebuild could perhaps be deferred until we actually wait again, since
+	 * a new request may well be made of the same node before that happens;
+	 * but that would mean tracking which nodes need a callback to remove
+	 * their registered wait events, and it's not clear the saving would
+	 * outweigh that overhead.  Use brute force for now.
+	 */
+	was_waiting = (areq->num_fd_events > 0 || areq->wants_process_latch);
+	if (was_waiting)
+		ExecAsyncSetRequiredEvents(estate, areq, 0, false, true);
+
+	/* Record the outcome. */
+	areq->request_complete = true;
+	areq->result = result;
+
+	/* Flag the request for a callback unless it already is. */
+	if (areq->callback_pending)
+		return;
+	areq->callback_pending = true;
+	estate->es_async_callback_pending++;
+}
#include "postgres.h"
#include "executor/execdebug.h"
+#include "executor/execAsync.h"
#include "executor/nodeAppend.h"
static bool exec_append_initialize_next(AppendState *appendstate);
/*
* get information from the append node
*/
- whichplan = appendstate->as_whichplan;
+ whichplan = appendstate->as_whichsyncplan;
- if (whichplan < 0)
+ /*
+ * This routine is only responsible for setting up for nodes being scanned
+ * synchronously, so the first node we can scan is given by nasyncplans
+ * and the last is given by as_nplans - 1.
+ */
+ if (whichplan < appendstate->as_nasyncplans)
{
/*
* if scanning in reverse, we start at the last scan in the list and
* then proceed back to the first.. in any case we inform ExecAppend
* that we are at the end of the line by returning FALSE
*/
- appendstate->as_whichplan = 0;
+ appendstate->as_whichsyncplan = appendstate->as_nasyncplans;
return FALSE;
}
else if (whichplan >= appendstate->as_nplans)
/*
* as above, end the scan if we go beyond the last scan in our list..
*/
- appendstate->as_whichplan = appendstate->as_nplans - 1;
+ appendstate->as_whichsyncplan = appendstate->as_nplans - 1;
return FALSE;
}
else
appendstate->ps.state = estate;
appendstate->appendplans = appendplanstates;
appendstate->as_nplans = nplans;
+ appendstate->as_nasyncplans = node->nasyncplans;
+ appendstate->as_syncdone = (node->nasyncplans == nplans);
+ appendstate->as_asyncresult = (TupleTableSlot **)
+ palloc0(node->nasyncplans * sizeof(TupleTableSlot *));
+
+ /* initially, all async plans need a request */
+ for (i = 0; i < appendstate->as_nasyncplans; ++i)
+ appendstate->as_needrequest =
+ bms_add_member(appendstate->as_needrequest, i);
/*
* Miscellaneous initialization
appendstate->ps.ps_ProjInfo = NULL;
/*
- * initialize to scan first subplan
+ * initialize to scan first synchronous subplan
*/
- appendstate->as_whichplan = 0;
+ appendstate->as_whichsyncplan = appendstate->as_nasyncplans;
exec_append_initialize_next(appendstate);
return appendstate;
TupleTableSlot *
ExecAppend(AppendState *node)
{
+ if (node->as_nasyncplans > 0)
+ {
+ EState *estate = node->ps.state;
+ int i;
+
+ /*
+ * If there are any asynchronously-generated results that have
+ * not yet been returned, return one of them.
+ */
+ if (node->as_nasyncresult > 0)
+ {
+ --node->as_nasyncresult;
+ return node->as_asyncresult[node->as_nasyncresult];
+ }
+
+ /*
+ * If there are any nodes that need a new asynchronous request,
+ * make all of them.
+ */
+ while ((i = bms_first_member(node->as_needrequest)) >= 0)
+ {
+ ExecAsyncRequest(estate, &node->ps, i, node->appendplans[i]);
+ node->as_nasyncpending++;
+ }
+ }
+
for (;;)
{
PlanState *subnode;
TupleTableSlot *result;
/*
- * figure out which subplan we are currently processing
+ * if we have async requests outstanding, run the event loop
*/
- subnode = node->appendplans[node->as_whichplan];
+ if (node->as_nasyncpending > 0)
+ {
+ long timeout = node->as_syncdone ? -1 : 0;
+
+ for (;;)
+ {
+ if (node->as_nasyncpending == 0)
+ {
+ /*
+ * If there is no asynchronous activity still pending
+ * and the synchronous activity is also complete, we're
+ * totally done scanning this node. Otherwise, we're
+ * done with the asynchronous stuff but must continue
+ * scanning the synchronous children.
+ */
+ if (node->as_syncdone)
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ break;
+ }
+ if (!ExecAsyncEventLoop(node->ps.state, &node->ps, timeout))
+ {
+ /* Timeout reached. */
+ break;
+ }
+ if (node->as_nasyncresult > 0)
+ {
+ /* Asynchronous subplan returned a tuple! */
+ --node->as_nasyncresult;
+ return node->as_asyncresult[node->as_nasyncresult];
+ }
+ }
+ }
+
+ /*
+ * figure out which synchronous subplan we are currently processing
+ */
+ Assert(!node->as_syncdone);
+ subnode = node->appendplans[node->as_whichsyncplan];
/*
* get a tuple from the subplan
/*
* Go on to the "next" subplan in the appropriate direction. If no
* more subplans, return the empty slot set up for us by
- * ExecInitAppend.
+ * ExecInitAppend, unless there are async plans we have yet to finish.
*/
if (ScanDirectionIsForward(node->ps.state->es_direction))
- node->as_whichplan++;
+ node->as_whichsyncplan++;
else
- node->as_whichplan--;
+ node->as_whichsyncplan--;
if (!exec_append_initialize_next(node))
- return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ {
+ node->as_syncdone = true;
+ if (node->as_nasyncpending == 0)
+ {
+ Assert(bms_is_empty(node->as_needrequest));
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ }
+ }
/* Else loop back and try to get a tuple from the new subplan */
}
{
int i;
+ /*
+ * XXX. Cancel outstanding asynchronous tuple requests here! (How?)
+ */
+
+ /* Reset async state. */
+ for (i = 0; i < node->as_nasyncplans; ++i)
+ node->as_needrequest = bms_add_member(node->as_needrequest, i);
+ node->as_nasyncresult = 0;
+ node->as_syncdone = (node->as_nasyncplans == node->as_nplans);
+
for (i = 0; i < node->as_nplans; i++)
{
PlanState *subnode = node->appendplans[i];
if (subnode->chgParam == NULL)
ExecReScan(subnode);
}
- node->as_whichplan = 0;
+ node->as_whichsyncplan = node->as_nasyncplans;
exec_append_initialize_next(node);
}
+
+/* ----------------------------------------------------------------
+ *		ExecAsyncAppendResponse
+ *
+ *		Receive a response from an asynchronous request we made.
+ *
+ *		The request stops counting as pending.  A non-empty result slot
+ *		is saved in as_asyncresult for ExecAppend to return, and the
+ *		child that produced it is marked as needing a new request.  A
+ *		NULL or empty result is not saved and the child is not marked
+ *		for another request.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncAppendResponse(EState *estate, PendingAsyncRequest *areq)
+{
+	AppendState *node = (AppendState *) areq->requestor;
+	TupleTableSlot *slot;
+
+	/* We shouldn't be called until the request is complete. */
+	Assert(areq->request_complete);
+
+	/* Our result slot shouldn't already be occupied. */
+	Assert(TupIsNull(node->ps.ps_ResultTupleSlot));
+
+	/* Result should be a TupleTableSlot or NULL. */
+	slot = (TupleTableSlot *) areq->result;
+	Assert(slot == NULL || IsA(slot, TupleTableSlot));
+
+	/* Request is no longer pending. */
+	Assert(node->as_nasyncpending > 0);
+	--node->as_nasyncpending;
+
+	/* If the result is NULL or an empty slot, there's nothing more to do. */
+	if (TupIsNull(slot))
+		return;
+
+	/* Save result so we can return it. */
+	Assert(node->as_nasyncresult < node->as_nasyncplans);
+	node->as_asyncresult[node->as_nasyncresult++] = slot;
+
+	/*
+	 * Mark the node that returned a result as ready for a new request.  We
+	 * don't launch another one here; ExecAppend issues a request for every
+	 * member of as_needrequest the next time it is called.  The result of
+	 * bms_add_member must be stored, since it may allocate or reallocate
+	 * the set.
+	 */
+	node->as_needrequest = bms_add_member(node->as_needrequest,
+										  areq->request_index);
+}
fdwroutine->InitializeWorkerForeignScan(node, toc, coordinate);
}
}
+
+/* ----------------------------------------------------------------
+ *		ExecAsyncForeignScanRequest
+ *
+ *		Start an asynchronous request on a foreign scan by passing it
+ *		to the FDW's ForeignAsyncRequest callback, which must exist.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanRequest(EState *estate, PendingAsyncRequest *areq)
+{
+	ForeignScanState *fsstate = (ForeignScanState *) areq->requestee;
+	FdwRoutine *routine = fsstate->fdwroutine;
+
+	Assert(routine->ForeignAsyncRequest != NULL);
+
+	routine->ForeignAsyncRequest(estate, areq);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecAsyncForeignScanConfigureWait
+ *
+ *		In async mode, set up for a wait by passing the request to
+ *		the FDW's ForeignAsyncConfigureWait callback, which must
+ *		exist.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanConfigureWait(EState *estate,
+								  PendingAsyncRequest *areq, bool reinit)
+{
+	ForeignScanState *fsstate = (ForeignScanState *) areq->requestee;
+	FdwRoutine *routine = fsstate->fdwroutine;
+
+	Assert(routine->ForeignAsyncConfigureWait != NULL);
+
+	routine->ForeignAsyncConfigureWait(estate, areq, reinit);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecAsyncForeignScanNotify
+ *
+ *		Event loop callback: pass the notification on to the FDW's
+ *		ForeignAsyncNotify callback, which must exist.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanNotify(EState *estate, PendingAsyncRequest *areq)
+{
+	ForeignScanState *fsstate = (ForeignScanState *) areq->requestee;
+	FdwRoutine *routine = fsstate->fdwroutine;
+
+	Assert(routine->ForeignAsyncNotify != NULL);
+
+	routine->ForeignAsyncNotify(estate, areq);
+}
* copy remainder of node
*/
COPY_NODE_FIELD(appendplans);
+ COPY_SCALAR_FIELD(nasyncplans);
return newnode;
}
_outPlanInfo(str, (const Plan *) node);
WRITE_NODE_FIELD(appendplans);
+ WRITE_INT_FIELD(nasyncplans);
}
static void
ReadCommonPlan(&local_node->plan);
READ_NODE_FIELD(appendplans);
+ READ_INT_FIELD(nasyncplans);
READ_DONE();
}
Index scanrelid, int ctePlanId, int cteParam);
static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
Index scanrelid, int wtParam);
-static Append *make_append(List *appendplans, List *tlist);
+static Append *make_append(List *appendplans, int nasyncplans, List *tlist);
static RecursiveUnion *make_recursive_union(List *tlist,
Plan *lefttree,
Plan *righttree,
List *resultRelations, List *subplans,
List *withCheckOptionLists, List *returningLists,
List *rowMarks, OnConflictExpr *onconflict, int epqParam);
+static bool is_async_capable_path(Path *path);
/*
{
Append *plan;
List *tlist = build_path_tlist(root, &best_path->path);
- List *subplans = NIL;
+ List *asyncplans = NIL;
+ List *syncplans = NIL;
ListCell *subpaths;
+ int nasyncplans = 0;
/*
* The subpaths list could be empty, if every child was proven empty by
/* Must insist that all children return the same tlist */
subplan = create_plan_recurse(root, subpath, CP_EXACT_TLIST);
- subplans = lappend(subplans, subplan);
+ /* Classify as async-capable or not */
+ if (is_async_capable_path(subpath))
+ {
+ asyncplans = lappend(asyncplans, subplan);
+ ++nasyncplans;
+ }
+ else
+ syncplans = lappend(syncplans, subplan);
}
/*
* parent-rel Vars it'll be asked to emit.
*/
- plan = make_append(subplans, tlist);
+ plan = make_append(list_concat(asyncplans, syncplans), nasyncplans, tlist);
copy_generic_path_info(&plan->plan, (Path *) best_path);
}
static Append *
-make_append(List *appendplans, List *tlist)
+make_append(List *appendplans, int nasyncplans, List *tlist)
{
Append *node = makeNode(Append);
Plan *plan = &node->plan;
plan->lefttree = NULL;
plan->righttree = NULL;
node->appendplans = appendplans;
+ node->nasyncplans = nasyncplans;
return node;
}
}
return true;
}
+
+/*
+ * is_async_capable_path
+ *		Check whether a given Path node is async-capable.
+ *
+ * Only a ForeignPath can qualify: it does so when its FDW provides an
+ * IsForeignPathAsyncCapable callback and that callback returns true for the
+ * path.  Every other path type is reported as not async-capable.
+ */
+static bool
+is_async_capable_path(Path *path)
+{
+	switch (nodeTag(path))
+	{
+		case T_ForeignPath:
+			{
+				FdwRoutine *fdwroutine = path->parent->fdwroutine;
+
+				Assert(fdwroutine != NULL);
+				if (fdwroutine->IsForeignPathAsyncCapable != NULL &&
+					fdwroutine->IsForeignPathAsyncCapable((ForeignPath *) path))
+					return true;
+			}
+			/* Explicit break: don't rely on falling into the default case. */
+			break;
+		default:
+			break;
+	}
+	return false;
+}
--- /dev/null
+/*--------------------------------------------------------------------
+ * execAsync.h
+ * Support functions for asynchronous query execution
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/executor/execAsync.h
+ *--------------------------------------------------------------------
+ */
+
+#ifndef EXECASYNC_H
+#define EXECASYNC_H
+
+#include "nodes/execnodes.h"
+
+extern void ExecAsyncRequest(EState *estate, PlanState *requestor,
+ int request_index, PlanState *requestee);
+extern bool ExecAsyncEventLoop(EState *estate, PlanState *requestor,
+ long timeout);
+
+extern void ExecAsyncSetRequiredEvents(EState *estate,
+ PendingAsyncRequest *areq, int num_fd_events,
+ bool wants_process_latch, bool force_reset);
+extern void ExecAsyncRequestDone(EState *estate,
+ PendingAsyncRequest *areq, Node *result);
+
+#endif /* EXECASYNC_H */
extern void ExecEndAppend(AppendState *node);
extern void ExecReScanAppend(AppendState *node);
+extern void ExecAsyncAppendResponse(EState *estate,
+ PendingAsyncRequest *areq);
+
#endif /* NODEAPPEND_H */
extern void ExecForeignScanInitializeWorker(ForeignScanState *node,
shm_toc *toc);
+extern void ExecAsyncForeignScanRequest(EState *estate,
+ PendingAsyncRequest *areq);
+extern void ExecAsyncForeignScanConfigureWait(EState *estate,
+ PendingAsyncRequest *areq, bool reinit);
+extern void ExecAsyncForeignScanNotify(EState *estate,
+ PendingAsyncRequest *areq);
+
#endif /* NODEFOREIGNSCAN_H */
RelOptInfo *rel,
RangeTblEntry *rte);
+typedef bool (*IsForeignPathAsyncCapable_function) (ForeignPath *path);
+typedef void (*ForeignAsyncRequest_function) (EState *estate,
+ PendingAsyncRequest *areq);
+typedef void (*ForeignAsyncConfigureWait_function) (EState *estate,
+ PendingAsyncRequest *areq,
+ bool reinit);
+typedef void (*ForeignAsyncNotify_function) (EState *estate,
+ PendingAsyncRequest *areq);
+
/*
* FdwRoutine is the struct returned by a foreign-data wrapper's handler
* function. It provides pointers to the callback functions needed by the
EstimateDSMForeignScan_function EstimateDSMForeignScan;
InitializeDSMForeignScan_function InitializeDSMForeignScan;
InitializeWorkerForeignScan_function InitializeWorkerForeignScan;
+
+ /* Support functions for asynchronous execution */
+ IsForeignPathAsyncCapable_function IsForeignPathAsyncCapable;
+ ForeignAsyncRequest_function ForeignAsyncRequest;
+ ForeignAsyncConfigureWait_function ForeignAsyncConfigureWait;
+ ForeignAsyncNotify_function ForeignAsyncNotify;
} FdwRoutine;
List *ri_onConflictSetWhere;
} ResultRelInfo;
+/* ----------------
+ * PendingAsyncRequest
+ *
+ * State for an asynchronous tuple request.
+ *
+ * ExecAsyncRequest sets one of these up for each request, reusing a
+ * previously allocated entry of es_pending_async when one is available.
+ * ExecAsyncRequestDone stores the result and marks the request complete.
+ * ----------------
+ */
+typedef struct PendingAsyncRequest
+{
+ int myindex; /* Index in es_pending_async, as assigned by ExecAsyncRequest. */
+ struct PlanState *requestor; /* Node that wants a tuple. */
+ struct PlanState *requestee; /* Node from which a tuple is wanted. */
+ int request_index; /* Requestor-chosen value identifying this request. */
+ int num_fd_events; /* Max number of FD events requestee needs. */
+ bool wants_process_latch; /* Requestee wants a callback when MyLatch is set. */
+ bool callback_pending; /* Event loop still owes this request a callback. */
+ bool request_complete; /* Request complete, result valid. */
+ Node *result; /* Result (NULL if no more tuples). */
+} PendingAsyncRequest;
+
/* ----------------
* EState information
*
HeapTuple *es_epqTuple; /* array of EPQ substitute tuples */
bool *es_epqTupleSet; /* true if EPQ tuple is provided */
bool *es_epqScanDone; /* true if EPQ tuple has been fetched */
+
+ /*
+ * Support for asynchronous execution.
+ *
+ * es_max_pending_async is the allocated size of es_pending_async, and
+ * es_num_pending_async is the number of entries that are currently valid.
+ * (Entries after that may point to storage that can be reused.)
+ * es_async_callback_pending is the number of PendingAsyncRequests for
+ * which callback_pending is true.
+ *
+ * es_total_fd_events is the total number of FD events needed by all
+ * pending async nodes, and es_allocated_fd_events is the number any
+ * current wait event set was allocated to handle. es_wait_event_set, if
+ * non-NULL, is a previously allocated event set that may be reusable by a
+ * future wait provided that nothing's been removed and not too many more
+ * events have been added.
+ */
+ int es_num_pending_async;
+ int es_max_pending_async;
+ int es_async_callback_pending;
+ PendingAsyncRequest **es_pending_async;
+
+ int es_total_fd_events;
+ int es_allocated_fd_events;
+ struct WaitEventSet *es_wait_event_set;
} EState;
/* ----------------
* AppendState information
- *
- * nplans how many plans are in the array
- * whichplan which plan is being executed (0 .. n-1)
* ----------------
*/
typedef struct AppendState
{
PlanState ps; /* its first field is NodeTag */
PlanState **appendplans; /* array of PlanStates for my inputs */
- int as_nplans;
- int as_whichplan;
+ int as_nplans; /* total # of children */
+ int as_nasyncplans; /* # of async-capable children */
+ int as_whichsyncplan; /* which sync plan is being executed */
+ bool as_syncdone; /* all synchronous plans done? */
+ Bitmapset *as_needrequest; /* async plans needing a new request */
+ TupleTableSlot **as_asyncresult; /* unreturned results of async plans */
+ int as_nasyncresult; /* # of valid entries in as_asyncresult */
+ int as_nasyncpending; /* # of outstanding async requests */
} AppendState;
/* ----------------
{
Plan plan;
List *appendplans;
+ int nasyncplans; /* # of async plans, always at start of list */
} Append;
/* ----------------