Create an infrastructure for parallel computation in PostgreSQL.
authorRobert Haas <[email protected]>
Fri, 30 Jan 2015 13:39:21 +0000 (08:39 -0500)
committerRobert Haas <[email protected]>
Fri, 30 Jan 2015 15:19:43 +0000 (10:19 -0500)
Robert Haas, Amit Kapila, Noah Misch, Rushabh Lathia, Jeevan Chalke.
Suggestions and review from Heikki Linnakangas, Jim Nasby, Simon Riggs,
and Andres Freund.

37 files changed:
contrib/postgres_fdw/connection.c
src/backend/access/heap/heapam.c
src/backend/access/transam/Makefile
src/backend/access/transam/README.parallel [new file with mode: 0644]
src/backend/access/transam/parallel.c [new file with mode: 0644]
src/backend/access/transam/varsup.c
src/backend/access/transam/xact.c
src/backend/commands/copy.c
src/backend/commands/sequence.c
src/backend/executor/execMain.c
src/backend/executor/functions.c
src/backend/executor/spi.c
src/backend/libpq/pqmq.c
src/backend/postmaster/bgworker.c
src/backend/storage/ipc/procarray.c
src/backend/storage/ipc/procsignal.c
src/backend/storage/ipc/standby.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/predicate.c
src/backend/tcop/postgres.c
src/backend/tcop/utility.c
src/backend/utils/adt/lockfuncs.c
src/backend/utils/fmgr/dfmgr.c
src/backend/utils/misc/guc.c
src/backend/utils/time/combocid.c
src/backend/utils/time/snapmgr.c
src/include/access/parallel.h [new file with mode: 0644]
src/include/access/xact.h
src/include/fmgr.h
src/include/libpq/pqmq.h
src/include/miscadmin.h
src/include/postmaster/bgworker.h
src/include/storage/lock.h
src/include/storage/procarray.h
src/include/storage/procsignal.h
src/include/utils/combocid.h
src/include/utils/snapmgr.h

index 4e02cb289dfe6196e110887bf980889d85eba21c..1a1e5b5eae3b6a76412fe33aea8762faa85bb918 100644 (file)
@@ -546,6 +546,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
                        switch (event)
                        {
+                               case XACT_EVENT_PARALLEL_PRE_COMMIT:
                                case XACT_EVENT_PRE_COMMIT:
                                        /* Commit all remote transactions during pre-commit */
                                        do_sql_command(entry->conn, "COMMIT TRANSACTION");
@@ -588,11 +589,13 @@ pgfdw_xact_callback(XactEvent event, void *arg)
                                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                                         errmsg("cannot prepare a transaction that modified remote tables")));
                                        break;
+                               case XACT_EVENT_PARALLEL_COMMIT:
                                case XACT_EVENT_COMMIT:
                                case XACT_EVENT_PREPARE:
                                        /* Pre-commit should have closed the open transaction */
                                        elog(ERROR, "missed cleaning up connection during pre-commit");
                                        break;
+                               case XACT_EVENT_PARALLEL_ABORT:
                                case XACT_EVENT_ABORT:
                                        /* Assume we might have lost track of prepared statements */
                                        entry->have_error = true;
index 21e9d067b6fc6b47774dce8c9e872e70ead78038..57408d3254dc229542144196d56f0b724a4125d1 100644 (file)
@@ -2234,6 +2234,17 @@ static HeapTuple
 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
                                        CommandId cid, int options)
 {
+       /*
+        * For now, parallel operations are required to be strictly read-only.
+        * Unlike heap_update() and heap_delete(), an insert should never create
+        * a combo CID, so it might be possible to relax this restriction, but
+        * not without more thought and testing.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot insert tuples during a parallel operation")));
+
        if (relation->rd_rel->relhasoids)
        {
 #ifdef NOT_USED
@@ -2641,6 +2652,16 @@ heap_delete(Relation relation, ItemPointer tid,
 
        Assert(ItemPointerIsValid(tid));
 
+       /*
+        * Forbid this during a parallel operation, lest it allocate a combocid.
+        * Other workers might need that combocid for visibility checks, and we
+        * have no provision for broadcasting it to them.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot delete tuples during a parallel operation")));
+
        block = ItemPointerGetBlockNumber(tid);
        buffer = ReadBuffer(relation, block);
        page = BufferGetPage(buffer);
@@ -3078,6 +3099,16 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 
        Assert(ItemPointerIsValid(otid));
 
+       /*
+        * Forbid this during a parallel operation, lest it allocate a combocid.
+        * Other workers might need that combocid for visibility checks, and we
+        * have no provision for broadcasting it to them.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot update tuples during a parallel operation")));
+
        /*
         * Fetch the list of attributes to be checked for HOT update.  This is
         * wasted effort if we fail to update or have to put the new tuple on a
@@ -5382,6 +5413,17 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
        uint32          oldlen;
        uint32          newlen;
 
+       /*
+        * For now, parallel operations are required to be strictly read-only.
+        * Unlike a regular update, this should never create a combo CID, so it
+        * might be possible to relax this restriction, but not without more
+        * thought and testing.  It's not clear that it would be useful, anyway.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot update tuples during a parallel operation")));
+
        buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        page = (Page) BufferGetPage(buffer);
index 9d4d5dbc9753a2829c2cbcaa33151287a139483f..94455b23f7ef42d6d6428d5711d8a4918453016e 100644 (file)
@@ -12,7 +12,7 @@ subdir = src/backend/access/transam
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = clog.o commit_ts.o multixact.o rmgr.o slru.o subtrans.o \
+OBJS = clog.o commit_ts.o multixact.o parallel.o rmgr.o slru.o subtrans.o \
        timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \
        xact.o xlog.o xlogarchive.o xlogfuncs.o \
        xloginsert.o xlogreader.o xlogutils.o
diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel
new file mode 100644 (file)
index 0000000..bcdb18b
--- /dev/null
@@ -0,0 +1,208 @@
+Overview
+========
+
+PostgreSQL provides some simple facilities to make writing parallel algorithms
+easier.  Using a data structure called a ParallelContext, you can arrange to
+launch background worker processes, initialize their state to match that of
+the backend which initiated parallelism, communicate with them via dynamic
+shared memory, and write reasonably complex code that can run either in the
+user backend or in one of the parallel workers without needing to be aware of
+where it's running.
+
+The backend which starts a parallel operation (hereafter, the initiating
+backend) starts by creating a dynamic shared memory segment which will last
+for the lifetime of the parallel operation.  This dynamic shared memory segment
+will contain (1) a shm_mq that can be used to transport errors (and other
+messages reported via elog/ereport) from the worker back to the initiating
+backend; (2) serialized representations of the initiating backend's private
+state, so that the worker can synchronize its state with that of the initiating
+backend; and (3) any other data structures which a particular user of the
+ParallelContext data structure may wish to add for its own purposes.  Once
+the initiating backend has initialized the dynamic shared memory segment, it
+asks the postmaster to launch the appropriate number of parallel workers.
+These workers then connect to the dynamic shared memory segment, initialize
+their state, and then invoke the appropriate entrypoint, as further detailed
+below.
+
+Error Reporting
+===============
+
+When started, each parallel worker begins by attaching the dynamic shared
+memory segment and locating the shm_mq to be used for error reporting; it
+redirects all of its protocol messages to this shm_mq.  Prior to this point,
+any failure of the background worker will not be reported to the initiating
+backend; from the point of view of the initiating backend, the worker simply
+failed to start.  The initiating backend must anyway be prepared to cope
+with fewer parallel workers than it originally requested, so catering to
+this case imposes no additional burden.
+
+Whenever a new message (or partial message; very large messages may wrap) is
+sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the
+initiating backend.  This causes the next CHECK_FOR_INTERRUPTS() in the
+initiating backend to read and rethrow the message.  For the most part, this
+makes error reporting in parallel mode "just work".  Of course, to work
+properly, it is important that the code the initiating backend is executing
+calls CHECK_FOR_INTERRUPTS() regularly and avoids blocking interrupt processing
+for long periods of time, but those are good things to do anyway.
+
+(A currently-unsolved problem is that some messages may get written to the
+system log twice, once in the backend where the report was originally
+generated, and again when the initiating backend rethrows the message.  If
+we decide to suppress one of these reports, it should probably be the second one;
+otherwise, if the worker is for some reason unable to propagate the message
+back to the initiating backend, the message will be lost altogether.)
+
+State Sharing
+=============
+
+It's possible to write C code which works correctly without parallelism, but
+which fails when parallelism is used.  No parallel infrastructure can
+completely eliminate this problem, because any global variable is a risk.
+There's no general mechanism for ensuring that every global variable in the
+worker will have the same value that it does in the initiating backend; even
+if we could ensure that, some function we're calling could update the variable
+after each call, and only the backend where that update is performed will see
+the new value.  Similar problems can arise with any more-complex data
+structure we might choose to use.  For example, a pseudo-random number
+generator should, given a particular seed value, produce the same predictable
+series of values every time.  But it does this by relying on some private
+state which won't automatically be shared between cooperating backends.  A
+parallel-safe PRNG would need to store its state in dynamic shared memory, and
+would require locking.  The parallelism infrastructure has no way of knowing
+whether the user intends to call code that has this sort of problem, and can't
+do anything about it anyway.
+
+Instead, we take a more pragmatic approach: we try to make as many of the
+operations that are safe outside of parallel mode work correctly in parallel
+mode as well, and we try to prohibit the rest via suitable error checks.
+The error checks are engaged via EnterParallelMode(), which should be called
+before creating a parallel context, and disarmed via ExitParallelMode(),
+which should be called after all parallel contexts have been destroyed.
+The most significant restriction imposed by parallel mode is that all
+operations must be strictly read-only; we allow no writes to the database
+and no DDL.  We might try to relax these restrictions in the future.
+
+To make as many operations as possible safe in parallel mode, we try to copy
+the most important pieces of state from the initiating backend to each parallel
+worker.  This includes:
+
+  - The authenticated user ID and current database.  Each parallel worker
+    will connect to the same database as the initiating backend, using the
+    same user ID.
+
+  - The set of libraries dynamically loaded by dfmgr.c.
+
+  - The values of all GUCs.  Accordingly, permanent changes to the value of 
+    any GUC are forbidden while in parallel mode; but temporary changes,
+    such as entering a function with non-NULL proconfig, are potentially OK.
+
+  - The current subtransaction's XID, the top-level transaction's XID, and
+    the list of XIDs considered current (that is, they are in-progress or
+    subcommitted).  This information is needed to ensure that tuple visibility
+    checks return the same results in the worker as they do in the
+    initiating backend.  See also the section Transaction Integration, below.
+
+  - The combo CID mappings.  This is needed to ensure consistent answers to
+    tuple visibility checks.  The need to synchronize this data structure is
+    a major reason why we can't support writes in parallel mode: such writes
+    might create new combo CIDs, and we have no way to let other workers
+    (or the initiating backend) know about them.
+
+  - The transaction snapshot.
+
+  - The active snapshot, which might be different from the transaction
+    snapshot.
+
+  - The currently active user ID and security context.  Note that this is
+    the fourth user ID we restore: the initial step of binding to the correct
+    database also involves restoring the authenticated user ID.  When GUC
+    values are restored, this incidentally sets SessionUserId and OuterUserId
+    to the correct values.  This final step restores CurrentUserId.
+
+Transaction Integration
+=======================
+
+Regardless of what the TransactionState stack looks like in the master, the
+parallel backend ends up with a stack of depth 1.  This stack entry is
+marked with the special transaction block state TBLOCK_PARALLEL_INPROGRESS
+so that it's not confused with an ordinary toplevel transaction.  The
+XID of this TransactionState is set to the XID of the innermost
+currently-active subtransaction in the initiating backend.  The initiating
+backend's toplevel XID, and the XIDs of all current (in-progress or
+subcommitted) XIDs are stored separately from the TransactionState stack,
+but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(),
+and TransactionIdIsCurrentTransactionId() return the same values that they
+would in the initiating backend.  We could copy the entire transaction state
+stack, but most of it would be useless: for example, you can't roll back to
+a savepoint from within a parallel worker, and there are no resources
+associated with the memory contexts or resource owners of intermediate
+subtransactions.
+
+No meaningful change to the transaction state can be made while in parallel
+mode.  No XIDs can be assigned, and no subtransactions can start or end,
+because we have no way of communicating these state changes to cooperating
+backends, or of synchronizing them.  It's clearly unworkable for the initiating
+backend to exit any transaction or subtransaction that was in progress when
+parallelism was started before all parallel workers have exited; and it's even
+more clearly crazy for a parallel worker to try to subcommit or subabort the
+current subtransaction and execute in some other transaction context that was
+present in the initiating backend.  It might be practical to allow internal
+sub-transactions (e.g. to implement a PL/pgsql EXCEPTION block) to be used in
+parallel mode, provided that they are XID-less, because other backends
+wouldn't really need to know about those transactions or do anything
+differently because of them.  Right now, we don't even allow that.
+
+Transaction commit or abort requires careful coordination between backends.
+Each backend has its own resource owners: buffer pins, catcache or relcache 
+reference counts, tuple descriptors, and so on are managed separately by each
+backend, and each backend is separately responsible for releasing such
+resources.  Generally, the commit or abort of a parallel worker is much like
+a top-transaction commit or abort, but there are a few exceptions.  Most
+importantly:
+
+  - No commit or abort record is written; the initiating backend is
+    responsible for this.
+
+  - End-of-transaction namespace processing is not done.  If a pg_temp
+    namespace needs to be cleaned up, the master is responsible for this.
+
+The master kills off all remaining workers as part of commit or abort
+processing.  It must not only kill the workers but wait for them to actually
+exit; otherwise, chaos can ensue.  For example, if the master is
+rolling back the transaction that created the relation being scanned by
+a worker, the relation could disappear while the worker is still busy
+scanning it.  That's not safe.
+
+Coding Conventions
+===================
+
+Before beginning any parallel operation, call EnterParallelMode(); after all
+parallel operations are completed, call ExitParallelMode().  To actually
+parallelize a particular operation, use a ParallelContext.  The basic coding
+pattern looks like this:
+
+       EnterParallelMode();            /* prohibit unsafe state changes */
+
+       pcxt = CreateParallelContext(entrypoint, nworkers);
+
+       /* Allow space for application-specific data here. */
+       shm_toc_estimate_chunk(&pcxt->estimator, size);
+       shm_toc_estimate_keys(&pcxt->estimator, keys);
+
+       InitializeParallelDSM(pcxt);    /* create DSM and copy state to it */
+
+       /* Store the data for which we reserved space. */
+       space = shm_toc_allocate(pcxt->toc, size);
+       shm_toc_insert(pcxt->toc, key, space);
+
+       LaunchParallelWorkers(pcxt);
+
+       /* do parallel stuff */
+
+       WaitForParallelWorkersToFinish(pcxt);
+
+       /* read any final results from dynamic shared memory */
+
+       DestroyParallelContext(pcxt);
+
+       ExitParallelMode();
diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
new file mode 100644 (file)
index 0000000..1a66acc
--- /dev/null
@@ -0,0 +1,960 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.c
+ *       Infrastructure for launching parallel workers
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/transam/parallel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "access/parallel.h"
+#include "commands/async.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "libpq/pqmq.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/sinval.h"
+#include "storage/spin.h"
+#include "utils/combocid.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+#include "utils/snapmgr.h"
+
+/*
+ * We don't want to waste a lot of memory on an error queue which, most of
+ * the time, will process only a handful of small messages.  However, it is
+ * desirable to make it large enough that a typical ErrorResponse can be sent
+ * without blocking.  That way, a worker that errors out can write the whole
+ * message into the queue and terminate without waiting for the user backend.
+ */
+#define        PARALLEL_ERROR_QUEUE_SIZE                       16384
+
+/* Magic number for parallel context TOC. */
+#define PARALLEL_MAGIC                                         0x50477c7c
+
+/*
+ * Magic numbers for parallel state sharing.  These are the keys under which
+ * InitializeParallelDSM() inserts each piece of shared state into the
+ * parallel context's table of contents.  Higher-level code should use
+ * smaller values, leaving these very large ones for use by this module.
+ */
+#define PARALLEL_KEY_FIXED                                     UINT64CONST(0xFFFFFFFFFFFF0001)
+#define PARALLEL_KEY_ERROR_QUEUE                       UINT64CONST(0xFFFFFFFFFFFF0002)
+#define PARALLEL_KEY_LIBRARY                           UINT64CONST(0xFFFFFFFFFFFF0003)
+#define PARALLEL_KEY_GUC                                       UINT64CONST(0xFFFFFFFFFFFF0004)
+#define PARALLEL_KEY_COMBO_CID                         UINT64CONST(0xFFFFFFFFFFFF0005)
+#define PARALLEL_KEY_TRANSACTION_SNAPSHOT      UINT64CONST(0xFFFFFFFFFFFF0006)
+#define PARALLEL_KEY_ACTIVE_SNAPSHOT           UINT64CONST(0xFFFFFFFFFFFF0007)
+#define PARALLEL_KEY_TRANSACTION_STATE         UINT64CONST(0xFFFFFFFFFFFF0008)
+#define PARALLEL_KEY_LOCK                                      UINT64CONST(0xFFFFFFFFFFFF0009)
+#define PARALLEL_KEY_EXTENSION_TRAMPOLINE      UINT64CONST(0xFFFFFFFFFFFF000A)
+
+/*
+ * Fixed-size parallel state.
+ *
+ * InitializeParallelDSM() allocates one of these in the dynamic shared memory
+ * segment, fills it in from the initiating backend's state, and inserts it
+ * into the table of contents under PARALLEL_KEY_FIXED.
+ */
+typedef struct FixedParallelState
+{
+       /*
+        * Fixed-size state that workers must restore.  The parallel_master_*
+        * fields record the initiating backend's PGPROC, PID, and backend ID.
+        */
+       Oid                     database_id;
+       Oid                     authenticated_user_id;
+       Oid                     current_user_id;
+       int                     sec_context;
+       PGPROC     *parallel_master_pgproc;
+       pid_t           parallel_master_pid;
+       BackendId       parallel_master_backend_id;
+
+       /* Entrypoint for parallel workers. */
+       parallel_worker_main_type       entrypoint;
+
+       /*
+        * Track whether workers have attached.  workers_expected is set to the
+        * number of workers requested for the context and workers_attached starts
+        * at zero.  NOTE(review): mutex presumably guards these counters, but the
+        * code that updates them is not visible here -- confirm.
+        */
+       slock_t         mutex;
+       int                     workers_expected;
+       int                     workers_attached;
+} FixedParallelState;
+
+/*
+ * Our parallel worker number.  We initialize this to -1, meaning that we are
+ * not a parallel worker.  In parallel workers, it will be set to a value >= 0
+ * and < the number of workers before any user code is invoked; each parallel
+ * worker will get a different parallel worker number.
+ */
+int ParallelWorkerNumber = -1;
+
+/* Is there a parallel message pending which we need to receive? */
+bool ParallelMessagePending = false;
+
+/* Are we in the midst of handling parallel messages? */
+static bool HandlingParallelMessages = false;
+
+/* List of active parallel contexts. */
+static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list);
+
+/* Private functions. */
+static void HandleParallelMessages(void);
+static void HandleParallelMessage(ParallelContext *, int, StringInfo msg);
+static void ParallelErrorContext(void *arg);
+static void ParallelMain(Datum main_arg);
+static void ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc);
+static void handle_sigterm(SIGNAL_ARGS);
+
+/*
+ * Establish a new parallel context.  This should be done after entering
+ * parallel mode, and (unless there is an error) the context should be
+ * destroyed before exiting the current subtransaction.
+ */
+ParallelContext *
+CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers)
+{
+       MemoryContext   oldcontext;
+       ParallelContext *pcxt;
+
+       /* It is unsafe to create a parallel context if not in parallel mode. */
+       Assert(IsInParallelMode());
+
+       /* Number of workers should be non-negative (zero is allowed). */
+       Assert(nworkers >= 0);
+
+       /* We might be running in a very short-lived memory context. */
+       oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+       /*
+        * Initialize a new ParallelContext.  We remember the current
+        * subtransaction ID and error context stack, start an empty shm_toc size
+        * estimate for the caller to add to, and link the context into the list
+        * of active parallel contexts.
+        */
+       pcxt = palloc0(sizeof(ParallelContext));
+       pcxt->subid = GetCurrentSubTransactionId();
+       pcxt->nworkers = nworkers;
+       pcxt->entrypoint = entrypoint;
+       pcxt->error_context_stack = error_context_stack;
+       shm_toc_initialize_estimator(&pcxt->estimator);
+       dlist_push_head(&pcxt_list, &pcxt->node);
+
+       /* Restore previous memory context. */
+       MemoryContextSwitchTo(oldcontext);
+
+       return pcxt;
+}
+
+/*
+ * Establish a new parallel context that calls a function provided by an
+ * extension.  This works around the fact that the library might get mapped
+ * at a different address in each backend.
+ */
+ParallelContext *
+CreateParallelContextForExtension(char *library_name, char *function_name,
+                                                                 int nworkers)
+{
+       MemoryContext   oldcontext;
+       ParallelContext *pcxt;
+
+       /* We might be running in a very short-lived memory context. */
+       oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+       /*
+        * Create the context.  The worker entrypoint is the trampoline function;
+        * the library and function names are copied into TopTransactionContext,
+        * and InitializeParallelDSM() later serializes them into shared memory.
+        */
+       pcxt = CreateParallelContext(ParallelExtensionTrampoline, nworkers);
+       pcxt->library_name = pstrdup(library_name);
+       pcxt->function_name = pstrdup(function_name);
+
+       /* Restore previous memory context. */
+       MemoryContextSwitchTo(oldcontext);
+
+       return pcxt;
+}
+
+/*
+ * Establish the dynamic shared memory segment for a parallel context and
+ * copy state and other bookkeeping information that will be needed by parallel
+ * workers into it.
+ */
+void
+InitializeParallelDSM(ParallelContext *pcxt)
+{
+       MemoryContext   oldcontext;
+       Size    library_len;
+       Size    guc_len;
+       Size    combocidlen;
+       Size    tsnaplen;
+       Size    asnaplen;
+       Size    tstatelen;
+       Size    lockstatelen;
+       Size    segsize;
+       int             i;
+       FixedParallelState *fps;
+       char   *libraryspace;
+       char   *gucspace;
+       char   *combocidspace;
+       char   *tsnapspace;
+       char   *asnapspace;
+       char   *tstatespace;
+       char   *lockstatespace;
+       char   *error_queue_space;
+       Snapshot        transaction_snapshot = GetTransactionSnapshot();
+       Snapshot        active_snapshot = GetActiveSnapshot();
+
+       /* We might be running in a very short-lived memory context. */
+       oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+       /* Allocate space for worker information. */
+       pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers);
+
+       /*
+        * Estimate how much space we'll need for state sharing.
+        *
+        * If you add more chunks here, you probably need more keys, too: the
+        * key count passed to shm_toc_estimate_keys() below matches the eight
+        * chunks estimated here.
+        */
+       shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState));
+       library_len = EstimateLibraryStateSpace();
+       shm_toc_estimate_chunk(&pcxt->estimator, library_len);
+       guc_len = EstimateGUCStateSpace();
+       shm_toc_estimate_chunk(&pcxt->estimator, guc_len);
+       combocidlen = EstimateComboCIDStateSpace();
+       shm_toc_estimate_chunk(&pcxt->estimator, combocidlen);
+       tsnaplen = EstimateSnapshotSpace(transaction_snapshot);
+       shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen);
+       asnaplen = EstimateSnapshotSpace(active_snapshot);
+       shm_toc_estimate_chunk(&pcxt->estimator, asnaplen);
+       tstatelen = EstimateTransactionStateSpace();
+       shm_toc_estimate_chunk(&pcxt->estimator, tstatelen);
+       lockstatelen = EstimateLockStateSpace();
+       shm_toc_estimate_chunk(&pcxt->estimator, lockstatelen);
+       shm_toc_estimate_keys(&pcxt->estimator, 8);
+
+       /* Estimate how much space we'll need for error queues. */
+       StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
+               PARALLEL_ERROR_QUEUE_SIZE,
+               "parallel error queue size not buffer-aligned");
+       shm_toc_estimate_chunk(&pcxt->estimator,
+                                                  PARALLEL_ERROR_QUEUE_SIZE * pcxt->nworkers);
+       shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+       /* Estimate how much we'll need for extension entrypoint information. */
+       if (pcxt->library_name != NULL)
+       {
+               Assert(pcxt->entrypoint == ParallelExtensionTrampoline);
+               Assert(pcxt->function_name != NULL);
+               shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name)
+                                                          + strlen(pcxt->function_name) + 2);
+               shm_toc_estimate_keys(&pcxt->estimator, 1);
+       }
+
+       /* Create DSM and initialize with new table of contents. */
+       segsize = shm_toc_estimate(&pcxt->estimator);
+       pcxt->seg = dsm_create(segsize);
+       pcxt->toc = shm_toc_create(PARALLEL_MAGIC,
+                                                          dsm_segment_address(pcxt->seg),
+                                                          segsize);
+
+       /* Initialize fixed-size state in shared memory. */
+       fps = (FixedParallelState *)
+               shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState));
+       fps->database_id = MyDatabaseId;
+       fps->authenticated_user_id = GetAuthenticatedUserId();
+       GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context);
+       fps->parallel_master_pgproc = MyProc;
+       fps->parallel_master_pid = MyProcPid;
+       fps->parallel_master_backend_id = MyBackendId;
+       fps->entrypoint = pcxt->entrypoint;
+       SpinLockInit(&fps->mutex);
+       fps->workers_expected = pcxt->nworkers;
+       fps->workers_attached = 0;
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps);
+
+       /* Serialize library state to dynamic shared memory. */
+       libraryspace = shm_toc_allocate(pcxt->toc, library_len);
+       SerializeLibraryState(library_len, libraryspace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace);
+
+       /* Serialize GUC state to dynamic shared memory. */
+       gucspace = shm_toc_allocate(pcxt->toc, guc_len);
+       SerializeGUCState(guc_len, gucspace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace);
+
+       /* Serialize combo CID state to dynamic shared memory. */
+       combocidspace = shm_toc_allocate(pcxt->toc, combocidlen);
+       SerializeComboCIDState(combocidlen, combocidspace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace);
+
+       /* Serialize transaction snapshots to dynamic shared memory. */
+       tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen);
+       SerializeSnapshot(transaction_snapshot, tsnaplen, tsnapspace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, tsnapspace);
+       asnapspace = shm_toc_allocate(pcxt->toc, asnaplen);
+       SerializeSnapshot(active_snapshot, asnaplen, asnapspace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace);
+
+       /* Serialize transaction state to dynamic shared memory. */
+       tstatespace = shm_toc_allocate(pcxt->toc, tstatelen);
+       SerializeTransactionState(tstatelen, tstatespace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace);
+
+       /* Serialize lock state to dynamic shared memory. */
+       lockstatespace = shm_toc_allocate(pcxt->toc, lockstatelen);
+       SerializeLockState(lockstatelen, lockstatespace);
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_LOCK, lockstatespace);
+
+       /*
+        * Establish error queues in dynamic shared memory.
+        *
+        * One queue of PARALLEL_ERROR_QUEUE_SIZE bytes is created per worker,
+        * laid out consecutively in a single chunk; this backend registers
+        * itself as the receiver of each queue and keeps the attached handle in
+        * the per-worker array.
+        *
+        * These queues should be used only for transmitting ErrorResponse,
+        * NoticeResponse, and NotifyResponse protocol messages.  Tuple data should
+        * be transmitted via separate (possibly larger?) queue.
+        */
+       error_queue_space =
+          shm_toc_allocate(pcxt->toc, PARALLEL_ERROR_QUEUE_SIZE * pcxt->nworkers);
+       for (i = 0; i < pcxt->nworkers; ++i)
+       {
+               shm_mq *mq;
+
+               mq = shm_mq_create(error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE,
+                                                  PARALLEL_ERROR_QUEUE_SIZE);
+               shm_mq_set_receiver(mq, MyProc);
+               pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);
+       }
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space);
+
+       /*
+        * Serialize extension entrypoint information to dynamic shared memory.
+        * The library name and the function name are stored back to back, each
+        * with its own terminating NUL byte (hence the "+ 2" in the size).
+        */
+       if (pcxt->library_name != NULL)
+       {
+               Size    lnamelen = strlen(pcxt->library_name);
+               char *extensionstate;
+
+               extensionstate = shm_toc_allocate(pcxt->toc, lnamelen
+                                                                                 + strlen(pcxt->function_name) + 2);
+               strcpy(extensionstate, pcxt->library_name);
+               strcpy(extensionstate + lnamelen + 1, pcxt->function_name);
+               shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE,
+                                          extensionstate);
+       }
+
+       /* Restore previous memory context. */
+       MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Launch parallel workers.
+ *
+ * Tries to register pcxt->nworkers dynamic background workers, each of which
+ * runs ParallelMain with the handle of pcxt's dynamic shared memory segment
+ * as its argument.  For every worker that registers successfully, the
+ * background worker handle is associated with that worker's error queue; for
+ * every worker that does not, both the handle and the error queue are
+ * forgotten, so the slot looks like a worker that has already gone away.
+ */
+void
+LaunchParallelWorkers(ParallelContext *pcxt)
+{
+       MemoryContext   oldcontext;
+       BackgroundWorker        worker;
+       int             i;
+
+       /* We might be running in a very short-lived memory context. */
+       oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+       /*
+        * Configure a worker.  Zero the whole struct first: only some of its
+        * fields are assigned below, and the registration call is handed the
+        * entire struct, so anything left unset would otherwise be whatever
+        * happened to be on the stack.
+        */
+       memset(&worker, 0, sizeof(worker));
+       snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d",
+                        MyProcPid);
+       worker.bgw_flags =
+               BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+       worker.bgw_start_time = BgWorkerStart_ConsistentState;
+       worker.bgw_restart_time = BGW_NEVER_RESTART;
+       worker.bgw_main = ParallelMain;
+       worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg));
+       worker.bgw_notify_pid = MyProcPid;
+
+       /*
+        * Start workers.
+        *
+        * The caller must be able to tolerate ending up with fewer workers than
+        * expected, so there is no need to throw an error here if registration
+        * fails.  It wouldn't help much anyway, because registering the worker
+        * in no way guarantees that it will start up and initialize successfully.
+        */
+       for (i = 0; i < pcxt->nworkers; ++i)
+       {
+               if (RegisterDynamicBackgroundWorker(&worker,
+                                                                                       &pcxt->worker[i].bgwhandle))
+                       shm_mq_set_handle(pcxt->worker[i].error_mqh,
+                                                         pcxt->worker[i].bgwhandle);
+               else
+               {
+                       /*
+                        * No worker will ever attach to this queue.  Release our
+                        * handle for it, the same way DestroyParallelContext does,
+                        * instead of just dropping the pointer; then mark the slot
+                        * as having neither a worker nor a queue.
+                        */
+                       pcxt->worker[i].bgwhandle = NULL;
+                       if (pcxt->worker[i].error_mqh != NULL)
+                               pfree(pcxt->worker[i].error_mqh);
+                       pcxt->worker[i].error_mqh = NULL;
+               }
+       }
+
+       /* Restore previous memory context. */
+       MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Block until every worker in this parallel context has exited cleanly.
+ *
+ * A worker is considered to be still running for as long as its error queue
+ * handle is set.  Between checks we sleep on our own process latch.
+ */
+void
+WaitForParallelWorkersToFinish(ParallelContext *pcxt)
+{
+       for (;;)
+       {
+               int             w = 0;
+
+               /*
+                * Servicing interrupts processes any pending parallel messages,
+                * which can change what the scan below finds; it can also throw
+                * an error propagated from a worker.
+                */
+               CHECK_FOR_INTERRUPTS();
+
+               /* Advance to the first worker whose error queue is still set. */
+               while (w < pcxt->nworkers && pcxt->worker[w].error_mqh == NULL)
+                       w++;
+
+               /* Ran off the end of the array: nobody is left, so we're done. */
+               if (w >= pcxt->nworkers)
+                       return;
+
+               WaitLatch(&MyProc->procLatch, WL_LATCH_SET, -1);
+               ResetLatch(&MyProc->procLatch);
+       }
+}
+
+/*
+ * Destroy a parallel context.
+ *
+ * If expecting a clean exit, you should use WaitForParallelWorkersToFinish()
+ * first, before calling this function.  When this function is invoked, any
+ * remaining workers are forcibly killed; the dynamic shared memory segment
+ * is unmapped; and we then wait (uninterruptibly) for the workers to exit.
+ *
+ * A context may be destroyed while it has no worker array
+ * (pcxt->worker == NULL, a state HandleParallelMessages also allows for).
+ * In that case there are no workers to kill or wait for and no array to
+ * free, so those steps are skipped.
+ *
+ * pcxt itself is freed here; the caller must not touch it afterwards.
+ */
+void
+DestroyParallelContext(ParallelContext *pcxt)
+{
+       int             i;
+
+       /*
+        * Be careful about order of operations here!  We remove the parallel
+        * context from the list before we do anything else; otherwise, if an
+        * error occurs during a subsequent step, we might try to nuke it again
+        * from AtEOXact_Parallel or AtEOSubXact_Parallel.
+        */
+       dlist_delete(&pcxt->node);
+
+       /* Kill each worker in turn, and forget their error queues. */
+       if (pcxt->worker != NULL)
+       {
+               for (i = 0; i < pcxt->nworkers; ++i)
+               {
+                       if (pcxt->worker[i].bgwhandle != NULL)
+                               TerminateBackgroundWorker(pcxt->worker[i].bgwhandle);
+                       if (pcxt->worker[i].error_mqh != NULL)
+                       {
+                               pfree(pcxt->worker[i].error_mqh);
+                               pcxt->worker[i].error_mqh = NULL;
+                       }
+               }
+       }
+
+       /*
+        * If we have allocated a shared memory segment, detach it.  This will
+        * implicitly detach the error queues, and any other shared memory queues,
+        * stored there.
+        */
+       if (pcxt->seg != NULL)
+               dsm_detach(pcxt->seg);
+
+       /* Wait until the workers actually die (nothing to do without an array). */
+       for (i = 0; pcxt->worker != NULL && i < pcxt->nworkers; ++i)
+       {
+               BgwHandleStatus status;
+
+               /* Slots without a handle have no worker left to wait for. */
+               if (pcxt->worker[i].bgwhandle == NULL)
+                       continue;
+
+               /*
+                * We can't finish transaction commit or abort until all of the
+                * workers are dead.  This means, in particular, that we can't respond
+                * to interrupts at this stage.
+                */
+               HOLD_INTERRUPTS();
+               status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle);
+               RESUME_INTERRUPTS();
+
+               /*
+                * If the postmaster kicked the bucket, we have no chance of cleaning
+                * up safely -- we won't be able to tell when our workers are actually
+                * dead.  This doesn't necessitate a PANIC since they will all abort
+                * eventually, but we can't safely continue this session.
+                */
+               if (status == BGWH_POSTMASTER_DIED)
+                       ereport(FATAL,
+                                       (errcode(ERRCODE_ADMIN_SHUTDOWN),
+                                errmsg("postmaster exited during a parallel transaction")));
+
+               /* Release memory. */
+               pfree(pcxt->worker[i].bgwhandle);
+               pcxt->worker[i].bgwhandle = NULL;
+       }
+
+       /* Free the worker array itself, if there is one. */
+       if (pcxt->worker != NULL)
+       {
+               pfree(pcxt->worker);
+               pcxt->worker = NULL;
+       }
+
+       /* Free memory. */
+       pfree(pcxt);
+}
+
+/*
+ * Report whether at least one parallel context is currently on pcxt_list.
+ */
+bool
+ParallelContextActive(void)
+{
+       if (dlist_is_empty(&pcxt_list))
+               return false;
+
+       return true;
+}
+
+/*
+ * Handle receipt of an interrupt indicating a parallel worker message.
+ *
+ * If signal_handler is true, we are being called from a signal handler and must
+ * be extremely cautious about what we do here!
+ *
+ * Unless the process is already exiting, this always records that a message
+ * is pending (InterruptPending and ParallelMessagePending).  The messages are
+ * processed right here only if interrupts may be serviced at this point and
+ * we are not already in the middle of processing parallel messages; otherwise
+ * the flags are simply left set.
+ */
+void
+HandleParallelMessageInterrupt(bool signal_handler)
+{
+       /* Preserve errno for whatever code we interrupted; restored at the end. */
+       int                     save_errno = errno;
+
+       /* Don't joggle the elbow of proc_exit */
+       if (!proc_exit_inprogress)
+       {
+               InterruptPending = true;
+               ParallelMessagePending = true;
+
+               /*
+                * If it's safe to interrupt, service the interrupt immediately.
+                * (We shouldn't be in parallel mode if waiting for the user to send
+                * a new query, but we could be waiting for a lock.)
+                */
+               if ((ImmediateInterruptOK || !signal_handler)
+                       && InterruptHoldoffCount == 0 && CritSectionCount == 0
+                       && !HandlingParallelMessages)
+               {
+                       bool notify_enabled;
+                       bool catchup_enabled;
+                       bool save_ImmediateInterruptOK;
+
+                       /*
+                        * Disable everything that might recursively interrupt us.
+                        *
+                        * If there were any possibility that disabling and re-enabling
+                        * interrupts or handling parallel messages might take a lock, we'd
+                        * need to HOLD_INTERRUPTS() as well, since taking a lock might
+                        * cause ImmediateInterruptOK to get temporarily reset to true.
+                        * But that shouldn't happen, so this is (hopefully) safe.  That's
+                        * good, because it lets us respond to query cancel and die
+                        * interrupts while we're in the midst of message-processing.
+                        */
+                       save_ImmediateInterruptOK = ImmediateInterruptOK;
+                       ImmediateInterruptOK = false;
+                       notify_enabled = DisableNotifyInterrupt();
+                       catchup_enabled = DisableCatchupInterrupt();
+                       HandlingParallelMessages = true;
+
+                       /* OK, do the work... */
+                       HandleParallelMessages();
+
+                       /*
+                        * Now re-enable whatever was enabled before, in the reverse of
+                        * the order in which it was disabled above.
+                        */
+                       HandlingParallelMessages = false;
+                       if (catchup_enabled)
+                               EnableCatchupInterrupt();
+                       if (notify_enabled)
+                               EnableNotifyInterrupt();
+                       ImmediateInterruptOK = save_ImmediateInterruptOK;
+               }
+       }
+
+       errno = save_errno;
+}
+
+/*
+ * Handle any queued protocol messages received from parallel workers.
+ *
+ * Clears ParallelMessagePending, then visits every parallel context on
+ * pcxt_list and reads each worker's error queue until it is empty or gone,
+ * passing every message read to HandleParallelMessage.  A read result other
+ * than success or would-block is reported as an ERROR.
+ */
+static void
+HandleParallelMessages(void)
+{
+       dlist_iter      iter;
+
+       ParallelMessagePending = false;
+
+       dlist_foreach(iter, &pcxt_list)
+       {
+               ParallelContext *pcxt;
+               int             i;
+               Size    nbytes;
+               void   *data;
+
+               pcxt = dlist_container(ParallelContext, node, iter.cur);
+
+               /* A context with no worker array has no queues to read. */
+               if (pcxt->worker == NULL)
+                       continue;
+
+               for (i = 0; i < pcxt->nworkers; ++i)
+               {
+                       /*
+                        * Read as many messages as we can from each worker, but stop
+                        * when either (1) the error queue goes away, which can happen if
+                        * we receive a ReadyForQuery from the worker; or (2) no more
+                        * messages can be read from the worker without blocking.
+                        */
+                       while (pcxt->worker[i].error_mqh != NULL)
+                       {
+                               shm_mq_result   res;
+
+                               CHECK_FOR_INTERRUPTS();
+
+                               res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes,
+                                                                        &data, true);
+                               if (res == SHM_MQ_WOULD_BLOCK)
+                                       break;
+                               else if (res == SHM_MQ_SUCCESS)
+                               {
+                                       StringInfoData  msg;
+
+                                       /* Copy the message into a local buffer before parsing it. */
+                                       initStringInfo(&msg);
+                                       appendBinaryStringInfo(&msg, data, nbytes);
+                                       HandleParallelMessage(pcxt, i, &msg);
+                                       pfree(msg.data);
+                               }
+                               else
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INTERNAL_ERROR), /* XXX: wrong errcode? */
+                                                        errmsg("lost connection to parallel worker")));
+                       }
+               }
+       }
+}
+
+/*
+ * Handle a single protocol message received from a single parallel worker.
+ *
+ * pcxt is the parallel context the worker belongs to, i is the worker's index
+ * in pcxt->worker, and msg holds the complete message, starting with its
+ * one-byte type code.  Recognized type codes:
+ *
+ *     'E', 'N'        error or notice: re-thrown in this process
+ *     'A'                     notify: passed along via pq_putmessage
+ *     'Z'                     ready-for-query: the worker is done; forget it
+ *
+ * Any other type code is reported as an ERROR.
+ */
+static void
+HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg)
+{
+       char    msgtype;
+
+       msgtype = pq_getmsgbyte(msg);
+
+       switch (msgtype)
+       {
+               case 'E':
+               case 'N':
+                       {
+                               ErrorData       edata;
+                               ErrorContextCallback *save_error_context_stack;
+
+                               /* Parse ErrorResponse or NoticeResponse. */
+                               pq_parse_errornotice(msg, &edata);
+
+                               /* Death of a worker isn't enough justification for suicide. */
+                               edata.elevel = Min(edata.elevel, ERROR);
+
+                               /*
+                                * Rethrow the error using the error context callbacks that
+                                * were in effect when the context was created, not the
+                                * current ones.
+                                */
+                               save_error_context_stack = error_context_stack;
+                               error_context_stack = pcxt->error_context_stack;
+                               ThrowErrorData(&edata);
+                               error_context_stack = save_error_context_stack;
+
+                               break;
+                       }
+
+               case 'A':
+                       {
+                               /* Propagate NotifyResponse. */
+                               pq_putmessage(msg->data[0], &msg->data[1], msg->len - 1);
+                               break;
+                       }
+
+               case 'Z':
+                       {
+                               /*
+                                * ReadyForQuery indicates that this worker exits cleanly.
+                                * Clearing error_mqh is what makes the callers' loops stop
+                                * reading from, and waiting on, this worker.
+                                */
+                               pfree(pcxt->worker[i].bgwhandle);
+                               pfree(pcxt->worker[i].error_mqh);
+                               pcxt->worker[i].bgwhandle = NULL;
+                               pcxt->worker[i].error_mqh = NULL;
+                               break;
+                       }
+
+               default:
+                       {
+                               elog(ERROR, "unknown message type: %c (%d bytes)",
+                                        msgtype, msg->len);
+                       }
+       }
+}
+
+/*
+ * Subtransaction-end cleanup for parallel contexts.
+ *
+ * Entering or leaving a subtransaction while parallel mode is in effect is
+ * currently forbidden, so simply destroying everything would do.  That
+ * restriction may be relaxed someday, though, so this code is written to
+ * cope with pcxt_list holding contexts from several subtransaction IDs: it
+ * destroys contexts from the head of the list for as long as they belong to
+ * mySubId, and stops at the first one that does not.
+ */
+void
+AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId)
+{
+       HandlingParallelMessages = false;
+
+       for (;;)
+       {
+               ParallelContext *head;
+
+               if (dlist_is_empty(&pcxt_list))
+                       break;
+
+               head = dlist_head_element(ParallelContext, node, &pcxt_list);
+               if (head->subid != mySubId)
+                       break;
+
+               /* A context that survives to commit was never cleaned up. */
+               if (isCommit)
+                       elog(WARNING, "leaked parallel context");
+               DestroyParallelContext(head);
+       }
+}
+
+/*
+ * Transaction-end cleanup for parallel contexts.
+ *
+ * Destroys every context still on pcxt_list, warning about each one if the
+ * transaction is committing.
+ */
+void
+AtEOXact_Parallel(bool isCommit)
+{
+       HandlingParallelMessages = false;
+
+       for (;;)
+       {
+               ParallelContext *head;
+
+               if (dlist_is_empty(&pcxt_list))
+                       break;
+
+               head = dlist_head_element(ParallelContext, node, &pcxt_list);
+
+               /* A context that survives to commit was never cleaned up. */
+               if (isCommit)
+                       elog(WARNING, "leaked parallel context");
+               DestroyParallelContext(head);
+       }
+}
+
+/*
+ * Main entrypoint for parallel workers.
+ *
+ * main_arg carries the handle of the dynamic shared memory segment set up by
+ * the backend that launched us.  We attach to that segment, claim a worker
+ * number, hook our error reporting up to the error queue reserved for that
+ * number, restore the state that was serialized into the segment, run the
+ * entrypoint recorded in the fixed state, and finally report success.
+ */
+static void
+ParallelMain(Datum main_arg)
+{
+       dsm_segment *seg;
+       shm_toc *toc;
+       FixedParallelState *fps;
+       char   *error_queue_space;
+       shm_mq *mq;
+       shm_mq_handle *mqh;
+       char   *libraryspace;
+       char   *gucspace;
+       char   *combocidspace;
+       char   *tsnapspace;
+       char   *asnapspace;
+       char   *tstatespace;
+       char   *lockstatespace;
+       ErrorContextCallback errctx;
+
+       /* Establish signal handlers. */
+       pqsignal(SIGTERM, handle_sigterm);
+       BackgroundWorkerUnblockSignals();
+
+       /* Set up a memory context and resource owner. */
+       Assert(CurrentResourceOwner == NULL);
+       CurrentResourceOwner = ResourceOwnerCreate(NULL, "parallel toplevel");
+       CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext,
+                                                                                                "parallel worker",
+                                                                                                ALLOCSET_DEFAULT_MINSIZE,
+                                                                                                ALLOCSET_DEFAULT_INITSIZE,
+                                                                                                ALLOCSET_DEFAULT_MAXSIZE);
+
+       /*
+        * Now that we have a resource owner, we can attach to the dynamic
+        * shared memory segment and read the table of contents.
+        */
+       seg = dsm_attach(DatumGetUInt32(main_arg));
+       if (seg == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("unable to map dynamic shared memory segment")));
+       toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg));
+       if (toc == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("bad magic number in dynamic shared memory segment")));
+
+       /*
+        * Determine and set our worker number.  Numbers are handed out in order
+        * of attachment, under the fixed state's spinlock, and only while fewer
+        * workers have attached than were expected.
+        */
+       fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED);
+       Assert(fps != NULL);
+       Assert(ParallelWorkerNumber == -1);
+       SpinLockAcquire(&fps->mutex);
+       if (fps->workers_attached < fps->workers_expected)
+               ParallelWorkerNumber = fps->workers_attached++;
+       SpinLockRelease(&fps->mutex);
+       if (ParallelWorkerNumber < 0)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("too many parallel workers already attached")));
+
+       /*
+        * Now that we have a worker number, we can find and attach to the error
+        * queue provided for us.  That's good, because until we do that, any
+        * errors that happen here will not be reported back to the process that
+        * requested that this worker be launched.
+        */
+       error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE);
+       mq = (shm_mq *) (error_queue_space +
+               ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE);
+       shm_mq_set_sender(mq, MyProc);
+       mqh = shm_mq_attach(mq, seg, NULL);
+       pq_redirect_to_shm_mq(mq, mqh);
+       pq_set_parallel_master(fps->parallel_master_pid,
+                                                  fps->parallel_master_backend_id);
+
+       /* Install an error-context callback. */
+       errctx.callback = ParallelErrorContext;
+       errctx.arg = NULL;
+       errctx.previous = error_context_stack;
+       error_context_stack = &errctx;
+
+       /*
+        * Hooray! Primary initialization is complete.  Now, we need to set up
+        * our backend-local state to match the original backend.
+        */
+
+       /*
+        * Load libraries that were loaded by original backend.  We want to do this
+        * before restoring GUCs, because the libraries might define custom
+        * variables.
+        */
+       libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY);
+       Assert(libraryspace != NULL);
+       RestoreLibraryState(libraryspace);
+       /* Restore database connection. */
+       BackgroundWorkerInitializeConnectionByOid(fps->database_id,
+                                                                                         fps->authenticated_user_id);
+
+       /* Restore GUC values from launching backend. */
+       gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC);
+       Assert(gucspace != NULL);
+       StartTransactionCommand();
+       RestoreGUCState(gucspace);
+       CommitTransactionCommand();
+
+       /* Handle local_preload_libraries and session_preload_libraries. */
+       process_session_preload_libraries();
+
+       /* Crank up a transaction state appropriate to a parallel worker. */
+       tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE);
+       StartParallelWorkerTransaction(tstatespace);
+
+       /* Restore combo CID state. */
+       combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID);
+       Assert(combocidspace != NULL);
+       RestoreComboCIDState(combocidspace);
+
+       /* Restore transaction snapshot. */
+       tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT);
+       Assert(tsnapspace != NULL);
+       RestoreTransactionSnapshot(RestoreSnapshot(tsnapspace),
+                                                          fps->parallel_master_pgproc);
+
+       /* Restore active snapshot. */
+       asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT);
+       Assert(asnapspace != NULL);
+       PushActiveSnapshot(RestoreSnapshot(asnapspace));
+
+       /* Restore user ID and security context. */
+       SetUserIdAndSecContext(fps->current_user_id, fps->sec_context);
+
+       /* Restore locks. */
+       lockstatespace = shm_toc_lookup(toc, PARALLEL_KEY_LOCK);
+       Assert(lockstatespace != NULL);
+       RestoreLockState(lockstatespace);
+
+       /*
+        * We've initialized all of our state now; nothing should change hereafter.
+        */
+       EnterParallelMode();
+
+       /*
+        * Time to do the real work: invoke the caller-supplied code.
+        *
+        * If you get a crash at this line, see the comments for
+        * ParallelExtensionTrampoline.
+        */
+       fps->entrypoint(seg, toc);
+
+       /* Must exit parallel mode to pop active snapshot. */
+       ExitParallelMode();
+
+       /* Must pop active snapshot so resowner.c doesn't complain. */
+       PopActiveSnapshot();
+
+       /* Shut down the parallel-worker transaction. */
+       EndParallelWorkerTransaction();
+
+       /* Report success. */
+       ReadyForQuery(DestRemote);
+}
+
+/*
+ * ParallelMain cannot safely call an entrypoint that lives in a dynamically
+ * loaded module: the module might not be loaded in every process, or might
+ * be loaded at a different address.  CreateParallelContextForExtension()
+ * therefore arranges for this function to be called instead of the
+ * extension-provided one; we look up the real entrypoint by library and
+ * function name and then call it.
+ */
+static void
+ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc)
+{
+       char   *extensionstate;
+       char   *fn_name;
+       parallel_worker_main_type real_entry;
+
+       extensionstate = shm_toc_lookup(toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE);
+       Assert(extensionstate != NULL);
+
+       /*
+        * The entry holds two strings back to back: the library name at the
+        * start, and the function name right after the library name's terminator.
+        */
+       fn_name = extensionstate + strlen(extensionstate) + 1;
+
+       real_entry = (parallel_worker_main_type)
+               load_external_function(extensionstate, fn_name, true, NULL);
+       real_entry(seg, toc);
+}
+
+/*
+ * Error-context callback installed by ParallelMain.
+ *
+ * Adds a context line saying that the message comes from a parallel worker,
+ * together with that worker's PID.  The argument is unused.
+ */
+static void
+ParallelErrorContext(void *arg)
+{
+       errcontext("parallel worker, pid %d", MyProcPid);
+}
+
+/*
+ * SIGTERM handler for parallel workers.
+ *
+ * As in a normal backend, we only set InterruptPending and ProcDiePending
+ * here and leave it to the next CHECK_FOR_INTERRUPTS() to do the right
+ * thing.  Our latch, if we have one, is set as well.  errno is left as we
+ * found it.
+ */
+static void
+handle_sigterm(SIGNAL_ARGS)
+{
+       int             errno_on_entry = errno;
+
+       if (MyProc != NULL)
+               SetLatch(&MyProc->procLatch);
+
+       /* Once proc_exit is underway, leave the interrupt flags alone. */
+       if (proc_exit_inprogress == false)
+       {
+               InterruptPending = true;
+               ProcDiePending = true;
+       }
+
+       errno = errno_on_entry;
+}
index 42ee57fe8d72f85a0cf2ba1886296d2fdefe14a8..cf3e964fc6ec510cd8b4247eee20c7e02d3cacb7 100644 (file)
@@ -49,6 +49,13 @@ GetNewTransactionId(bool isSubXact)
 {
        TransactionId xid;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for new XIDs after that point.
+        */
+       if (IsInParallelMode())
+               elog(ERROR, "cannot assign TransactionIds during a parallel operation");
+
        /*
         * During bootstrap initialization, we return the special bootstrap
         * transaction id.
index 97000ef616c9732e7b8b9748b52544df1841e0e7..bd6b386f1397e331364610fd5c989b33815de2ad 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "access/commit_ts.h"
 #include "access/multixact.h"
+#include "access/parallel.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -49,6 +50,7 @@
 #include "storage/procarray.h"
 #include "storage/sinvaladt.h"
 #include "storage/smgr.h"
+#include "utils/builtins.h"
 #include "utils/catcache.h"
 #include "utils/combocid.h"
 #include "utils/guc.h"
@@ -75,6 +77,32 @@ bool         XactDeferrable;
 
 int                    synchronous_commit = SYNCHRONOUS_COMMIT_ON;
 
+/*
+ * Only a single TransactionStateData is placed on the parallel worker's
+ * state stack, and the XID reflected there will be that of the *innermost*
+ * currently-active subtransaction in the backend that initiated parallelism.
+ * However, GetTopTransactionId() and TransactionIdIsCurrentTransactionId()
+ * need to return the same answers in the parallel worker as they would have
+ * in the user backend, so we need some additional bookkeeping.
+ *
+ * XactTopTransactionId stores the XID of our toplevel transaction, which
+ * will be the same as TopTransactionState.transactionId in an ordinary
+ * backend; but in a parallel backend, which does not have the entire
+ * transaction state, it will instead be copied from the backend that started
+ * the parallel operation.
+ *
+ * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary
+ * backend, but in a parallel backend, nParallelCurrentXids will contain the
+ * number of XIDs that need to be considered current, and ParallelCurrentXids
+ * will contain the XIDs themselves.  This includes all XIDs that were current
+ * or sub-committed in the parent at the time the parallel operation began.
+ * The XIDs are stored sorted in numerical order (not logical order) to make
+ * lookups as fast as possible.
+ */
+TransactionId  XactTopTransactionId = InvalidTransactionId;
+int                            nParallelCurrentXids = 0;
+TransactionId  *ParallelCurrentXids;
+
 /*
  * MyXactAccessedTempRel is set when a temporary relation is accessed.
  * We don't allow PREPARE TRANSACTION in that case.  (This is global
@@ -111,6 +139,7 @@ typedef enum TBlockState
        /* transaction block states */
        TBLOCK_BEGIN,                           /* starting transaction block */
        TBLOCK_INPROGRESS,                      /* live transaction */
+       TBLOCK_PARALLEL_INPROGRESS,     /* live transaction inside parallel worker */
        TBLOCK_END,                                     /* COMMIT received */
        TBLOCK_ABORT,                           /* failed xact, awaiting ROLLBACK */
        TBLOCK_ABORT_END,                       /* failed xact, ROLLBACK received */
@@ -152,6 +181,7 @@ typedef struct TransactionStateData
        bool            prevXactReadOnly;               /* entry-time xact r/o state */
        bool            startedInRecovery;              /* did we start in recovery? */
        bool            didLogXid;              /* has xid been included in WAL record? */
+       bool            parallelMode;   /* current transaction in parallel operation? */
        struct TransactionStateData *parent;            /* back link to parent */
 } TransactionStateData;
 
@@ -182,6 +212,7 @@ static TransactionStateData TopTransactionStateData = {
        false,                                          /* entry-time xact r/o state */
        false,                                          /* startedInRecovery */
        false,                                          /* didLogXid */
+       false,                                          /* parallelMode */
        NULL                                            /* link to parent state block */
 };
 
@@ -351,9 +382,9 @@ IsAbortedTransactionBlockState(void)
 TransactionId
 GetTopTransactionId(void)
 {
-       if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
+       if (!TransactionIdIsValid(XactTopTransactionId))
                AssignTransactionId(&TopTransactionStateData);
-       return TopTransactionStateData.transactionId;
+       return XactTopTransactionId;
 }
 
 /*
@@ -366,7 +397,7 @@ GetTopTransactionId(void)
 TransactionId
 GetTopTransactionIdIfAny(void)
 {
-       return TopTransactionStateData.transactionId;
+       return XactTopTransactionId;
 }
 
 /*
@@ -459,6 +490,13 @@ AssignTransactionId(TransactionState s)
        Assert(!TransactionIdIsValid(s->transactionId));
        Assert(s->state == TRANS_INPROGRESS);
 
+       /*
+        * Workers synchronize transaction state at the beginning of each
+        * parallel operation, so we can't account for new XIDs at this point.
+        */
+       if (IsInParallelMode())
+               elog(ERROR, "cannot assign XIDs during a parallel operation");
+
        /*
         * Ensure parent(s) have XIDs, so that a child always has an XID later
         * than its parent.  Musn't recurse here, or we might get a stack overflow
@@ -511,6 +549,8 @@ AssignTransactionId(TransactionState s)
         * the Xid as "running".  See GetNewTransactionId.
         */
        s->transactionId = GetNewTransactionId(isSubXact);
+       if (!isSubXact)
+               XactTopTransactionId = s->transactionId;
 
        if (isSubXact)
                SubTransSetParent(s->transactionId, s->parent->transactionId, false);
@@ -642,7 +682,16 @@ GetCurrentCommandId(bool used)
 {
        /* this is global to a transaction, not subtransaction-local */
        if (used)
+       {
+               /*
+                * Forbid setting currentCommandIdUsed in parallel mode, because we
+                * have no provision for communicating this back to the master.  We
+                * could relax this restriction when currentCommandIdUsed was already
+                * true at the start of the parallel operation.
+                */
+               Assert(!CurrentTransactionState->parallelMode);
                currentCommandIdUsed = true;
+       }
        return currentCommandId;
 }
 
@@ -735,6 +784,36 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
        if (!TransactionIdIsNormal(xid))
                return false;
 
+       /*
+        * In parallel workers, the XIDs we must consider as current are stored
+        * in ParallelCurrentXids rather than the transaction-state stack.  Note
+        * that the XIDs in this array are sorted numerically rather than
+        * according to transactionIdPrecedes order.
+        */
+       if (nParallelCurrentXids > 0)
+       {
+               int                     low,
+                                       high;
+
+               low = 0;
+               high = nParallelCurrentXids - 1;
+               while (low <= high)
+               {
+                       int                     middle;
+                       TransactionId probe;
+
+                       middle = low + (high - low) / 2;
+                       probe = ParallelCurrentXids[middle];
+                       if (probe == xid)
+                               return true;
+                       else if (probe < xid)
+                               low = middle + 1;
+                       else
+                               high = middle - 1;
+               }
+               return false;
+       }
+
        /*
         * We will return true for the Xid of the current subtransaction, any of
         * its subcommitted children, any of its parents, or any of their
@@ -788,6 +867,53 @@ TransactionStartedDuringRecovery(void)
        return CurrentTransactionState->startedInRecovery;
 }
 
+/*
+ *     EnterParallelMode - flag the current transaction level as parallel
+ */
+void
+EnterParallelMode(void)
+{
+       TransactionState cur = CurrentTransactionState;
+
+       /*
+        * Once workers have copied our transaction state it must stay frozen,
+        * and the parallel-mode flag is part of that state.  Entering parallel
+        * mode twice at the same level is therefore a caller bug.
+        */
+       Assert(!cur->parallelMode);
+
+       cur->parallelMode = true;
+}
+
+/*
+ *     ExitParallelMode - clear the current level's parallel-mode flag
+ */
+void
+ExitParallelMode(void)
+{
+       TransactionState cur = CurrentTransactionState;
+
+       /* Must be in parallel mode, with ParallelContextActive() now false. */
+       Assert(cur->parallelMode);
+       Assert(!ParallelContextActive());
+       cur->parallelMode = false;
+}
+
+/*
+ *     IsInParallelMode
+ *
+ * Report the parallelMode flag of the current transaction state: are we in
+ * a parallel operation, as either the master or a worker?  Check this to
+ * prohibit operations that change backend-local state expected to match
+ * across all workers.  Mere caches usually don't require such a restriction;
+ * state modified push/pop style (e.g. the active snapshot stack) is often fine.
+ */
+bool
+IsInParallelMode(void)
+{
+       return CurrentTransactionState->parallelMode;
+}
+
 /*
  *     CommandCounterIncrement
  */
@@ -802,6 +928,14 @@ CommandCounterIncrement(void)
         */
        if (currentCommandIdUsed)
        {
+               /*
+                * Workers synchronize transaction state at the beginning of each
+                * parallel operation, so we can't account for new commands after that
+                * point.
+                */
+               if (IsInParallelMode())
+                       elog(ERROR, "cannot start commands during a parallel operation");
+
                currentCommandId += 1;
                if (currentCommandId == InvalidCommandId)
                {
@@ -1705,6 +1839,8 @@ StartTransaction(void)
        s = &TopTransactionStateData;
        CurrentTransactionState = s;
 
+       Assert(XactTopTransactionId == InvalidTransactionId);
+
        /*
         * check the current transaction state
         */
@@ -1834,6 +1970,9 @@ CommitTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
        TransactionId latestXid;
+       bool            parallel;
+
+       parallel = (s->blockState == TBLOCK_PARALLEL_INPROGRESS);
 
        ShowTransactionState("CommitTransaction");
 
@@ -1867,7 +2006,8 @@ CommitTransaction(void)
                        break;
        }
 
-       CallXactCallbacks(XACT_EVENT_PRE_COMMIT);
+       CallXactCallbacks(parallel ? XACT_EVENT_PARALLEL_PRE_COMMIT
+                                         : XACT_EVENT_PRE_COMMIT);
 
        /*
         * The remaining actions cannot call any user-defined code, so it's safe
@@ -1876,6 +2016,10 @@ CommitTransaction(void)
         * the transaction-abort path.
         */
 
+       /* If we might have parallel workers, clean them up now. */
+       if (IsInParallelMode())
+               AtEOXact_Parallel(true);
+
        /* Shut down the deferred-trigger manager */
        AfterTriggerEndXact(true);
 
@@ -1915,9 +2059,13 @@ CommitTransaction(void)
        s->state = TRANS_COMMIT;
 
        /*
-        * Here is where we really truly commit.
+        * Unless we're in parallel mode, we need to mark our XIDs as committed
+        * in pg_clog.  This is where we durably commit.
         */
-       latestXid = RecordTransactionCommit();
+       if (parallel)
+               latestXid = InvalidTransactionId;
+       else
+               latestXid = RecordTransactionCommit();
 
        TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
 
@@ -1944,7 +2092,8 @@ CommitTransaction(void)
         * state.
         */
 
-       CallXactCallbacks(XACT_EVENT_COMMIT);
+       CallXactCallbacks(parallel ? XACT_EVENT_PARALLEL_COMMIT
+                                         : XACT_EVENT_COMMIT);
 
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_BEFORE_LOCKS,
@@ -1992,7 +2141,8 @@ CommitTransaction(void)
        AtEOXact_GUC(true, 1);
        AtEOXact_SPI(true);
        AtEOXact_on_commit_actions(true);
-       AtEOXact_Namespace(true);
+       if (!parallel)
+               AtEOXact_Namespace(true);
        AtEOXact_SMgr();
        AtEOXact_Files();
        AtEOXact_ComboCid();
@@ -2017,6 +2167,9 @@ CommitTransaction(void)
        s->nChildXids = 0;
        s->maxChildXids = 0;
 
+       XactTopTransactionId = InvalidTransactionId;
+       nParallelCurrentXids = 0;
+
        /*
         * done with commit processing, set current transaction state back to
         * default
@@ -2040,6 +2193,8 @@ PrepareTransaction(void)
        GlobalTransaction gxact;
        TimestampTz prepared_at;
 
+       Assert(!IsInParallelMode());
+
        ShowTransactionState("PrepareTransaction");
 
        /*
@@ -2284,6 +2439,9 @@ PrepareTransaction(void)
        s->nChildXids = 0;
        s->maxChildXids = 0;
 
+       XactTopTransactionId = InvalidTransactionId;
+       nParallelCurrentXids = 0;
+
        /*
         * done with 1st phase commit processing, set current transaction state
         * back to default
@@ -2302,6 +2460,7 @@ AbortTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
        TransactionId latestXid;
+       bool    parallel;
 
        /* Prevent cancel/die interrupt while cleaning up */
        HOLD_INTERRUPTS();
@@ -2350,6 +2509,7 @@ AbortTransaction(void)
        /*
         * check the current transaction state
         */
+       parallel = (s->blockState == TBLOCK_PARALLEL_INPROGRESS);
        if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE)
                elog(WARNING, "AbortTransaction while in %s state",
                         TransStateAsString(s->state));
@@ -2373,6 +2533,13 @@ AbortTransaction(void)
         */
        SetUserIdAndSecContext(s->prevUser, s->prevSecContext);
 
+       /*
+        * If we might have parallel workers, send them all termination signals,
+        * and wait for them to die.
+        */
+       if (IsInParallelMode())
+               AtEOXact_Parallel(false);
+
        /*
         * do abort processing
         */
@@ -2385,9 +2552,14 @@ AbortTransaction(void)
 
        /*
         * Advertise the fact that we aborted in pg_clog (assuming that we got as
-        * far as assigning an XID to advertise).
+        * far as assigning an XID to advertise).  But if we're inside a parallel
+        * worker, skip this; the user backend must be the one to write the abort
+        * record.
         */
-       latestXid = RecordTransactionAbort(false);
+       if (parallel)
+               latestXid = InvalidTransactionId;
+       else
+               latestXid = RecordTransactionAbort(false);
 
        TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid);
 
@@ -2405,7 +2577,10 @@ AbortTransaction(void)
         */
        if (TopTransactionResourceOwner != NULL)
        {
-               CallXactCallbacks(XACT_EVENT_ABORT);
+               if (parallel)
+                       CallXactCallbacks(XACT_EVENT_PARALLEL_ABORT);
+               else
+                       CallXactCallbacks(XACT_EVENT_ABORT);
 
                ResourceOwnerRelease(TopTransactionResourceOwner,
                                                         RESOURCE_RELEASE_BEFORE_LOCKS,
@@ -2426,7 +2601,8 @@ AbortTransaction(void)
                AtEOXact_GUC(false, 1);
                AtEOXact_SPI(false);
                AtEOXact_on_commit_actions(false);
-               AtEOXact_Namespace(false);
+               if (!parallel)
+                       AtEOXact_Namespace(false);
                AtEOXact_SMgr();
                AtEOXact_Files();
                AtEOXact_ComboCid();
@@ -2478,6 +2654,10 @@ CleanupTransaction(void)
        s->childXids = NULL;
        s->nChildXids = 0;
        s->maxChildXids = 0;
+       s->parallelMode = false;
+
+       XactTopTransactionId = InvalidTransactionId;
+       nParallelCurrentXids = 0;
 
        /*
         * done with abort processing, set current transaction state back to
@@ -2531,6 +2711,7 @@ StartTransactionCommand(void)
                        /* These cases are invalid. */
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
                case TBLOCK_SUBRELEASE:
@@ -2566,11 +2747,13 @@ CommitTransactionCommand(void)
        switch (s->blockState)
        {
                        /*
-                        * This shouldn't happen, because it means the previous
+                        * These shouldn't happen.  TBLOCK_DEFAULT means the previous
                         * StartTransactionCommand didn't set the STARTED state
-                        * appropriately.
+                        * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended
+                        * by EndParallelWorkerTransaction(), not this function.
                         */
                case TBLOCK_DEFAULT:
+               case TBLOCK_PARALLEL_INPROGRESS:
                        elog(FATAL, "CommitTransactionCommand: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
@@ -2852,6 +3035,7 @@ AbortCurrentTransaction(void)
                         * ABORT state.  We will stay in ABORT until we get a ROLLBACK.
                         */
                case TBLOCK_INPROGRESS:
+               case TBLOCK_PARALLEL_INPROGRESS:
                        AbortTransaction();
                        s->blockState = TBLOCK_ABORT;
                        /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */
@@ -3241,6 +3425,7 @@ BeginTransactionBlock(void)
                         * Already a transaction block in progress.
                         */
                case TBLOCK_INPROGRESS:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBINPROGRESS:
                case TBLOCK_ABORT:
                case TBLOCK_SUBABORT:
@@ -3418,6 +3603,16 @@ EndTransactionBlock(void)
                        result = true;
                        break;
 
+                       /*
+                        * The user issued a COMMIT that somehow ran inside a parallel
+                        * worker.  We can't cope with that.
+                        */
+               case TBLOCK_PARALLEL_INPROGRESS:
+                       ereport(FATAL,
+                                       (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                        errmsg("cannot commit during a parallel operation")));
+                       break;
+
                        /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
@@ -3511,6 +3706,16 @@ UserAbortTransactionBlock(void)
                        s->blockState = TBLOCK_ABORT_PENDING;
                        break;
 
+                       /*
+                        * The user issued an ABORT that somehow ran inside a parallel
+                        * worker.  We can't cope with that.
+                        */
+               case TBLOCK_PARALLEL_INPROGRESS:
+                       ereport(FATAL,
+                                       (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                        errmsg("cannot abort during a parallel operation")));
+                       break;
+
                        /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
@@ -3540,6 +3745,18 @@ DefineSavepoint(char *name)
 {
        TransactionState s = CurrentTransactionState;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for new subtransactions after that
+        * point.  (Note that this check will certainly error out if s->blockState
+        * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+        * below.)
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot define savepoints during a parallel operation")));
+
        switch (s->blockState)
        {
                case TBLOCK_INPROGRESS:
@@ -3560,6 +3777,7 @@ DefineSavepoint(char *name)
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
                case TBLOCK_SUBRELEASE:
@@ -3594,6 +3812,18 @@ ReleaseSavepoint(List *options)
        ListCell   *cell;
        char       *name = NULL;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for transaction state change after that
+        * point.  (Note that this check will certainly error out if s->blockState
+        * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+        * below.)
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot release savepoints during a parallel operation")));
+
        switch (s->blockState)
        {
                        /*
@@ -3617,6 +3847,7 @@ ReleaseSavepoint(List *options)
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
                case TBLOCK_SUBRELEASE:
@@ -3694,6 +3925,18 @@ RollbackToSavepoint(List *options)
        ListCell   *cell;
        char       *name = NULL;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for transaction state change after that
+        * point.  (Note that this check will certainly error out if s->blockState
+        * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+        * below.)
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot rollback to savepoints during a parallel operation")));
+
        switch (s->blockState)
        {
                        /*
@@ -3718,6 +3961,7 @@ RollbackToSavepoint(List *options)
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
                case TBLOCK_SUBRELEASE:
@@ -3806,6 +4050,20 @@ BeginInternalSubTransaction(char *name)
 {
        TransactionState s = CurrentTransactionState;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for new subtransactions after that point.
+        * We might be able to make an exception for the type of subtransaction
+        * established by this function, which is typically used in contexts where
+        * we're going to release or roll back the subtransaction before proceeding
+        * further, so that no enduring change to the transaction state occurs.
+        * For now, however, we prohibit this case along with all the others.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot start subtransactions during a parallel operation")));
+
        switch (s->blockState)
        {
                case TBLOCK_STARTED:
@@ -3828,6 +4086,7 @@ BeginInternalSubTransaction(char *name)
                        /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_SUBRELEASE:
                case TBLOCK_SUBCOMMIT:
@@ -3860,6 +4119,18 @@ ReleaseCurrentSubTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
 
+       /*
+        * Workers synchronize transaction state at the beginning of each parallel
+        * operation, so we can't account for commit of subtransactions after that
+        * point.  This should not happen anyway.  Code calling this would
+        * typically have called BeginInternalSubTransaction() first, failing
+        * there.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot commit subtransactions during a parallel operation")));
+
        if (s->blockState != TBLOCK_SUBINPROGRESS)
                elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s",
                         BlockStateAsString(s->blockState));
@@ -3882,6 +4153,14 @@ RollbackAndReleaseCurrentSubTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
 
+       /*
+        * Unlike ReleaseCurrentSubTransaction(), this is nominally permitted
+        * during parallel operations.  That's because we may be in the master,
+        * recovering from an error thrown while we were in parallel mode.  We
+        * won't reach here in a worker, because BeginInternalSubTransaction()
+        * will have failed.
+        */
+
        switch (s->blockState)
        {
                        /* Must be in a subtransaction */
@@ -3893,6 +4172,7 @@ RollbackAndReleaseCurrentSubTransaction(void)
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_INPROGRESS:
                case TBLOCK_END:
@@ -3968,6 +4248,7 @@ AbortOutOfAnyTransaction(void)
                        case TBLOCK_STARTED:
                        case TBLOCK_BEGIN:
                        case TBLOCK_INPROGRESS:
+                       case TBLOCK_PARALLEL_INPROGRESS:
                        case TBLOCK_END:
                        case TBLOCK_ABORT_PENDING:
                        case TBLOCK_PREPARE:
@@ -4059,6 +4340,7 @@ TransactionBlockStatusCode(void)
                case TBLOCK_BEGIN:
                case TBLOCK_SUBBEGIN:
                case TBLOCK_INPROGRESS:
+               case TBLOCK_PARALLEL_INPROGRESS:
                case TBLOCK_SUBINPROGRESS:
                case TBLOCK_END:
                case TBLOCK_SUBRELEASE:
@@ -4162,6 +4444,13 @@ CommitSubTransaction(void)
        CallSubXactCallbacks(SUBXACT_EVENT_PRE_COMMIT_SUB, s->subTransactionId,
                                                 s->parent->subTransactionId);
 
+       /* Exit from parallel mode, if necessary. */
+       if (IsInParallelMode())
+       {
+               AtEOSubXact_Parallel(true, s->subTransactionId);
+               s->parallelMode = false;
+       }
+
        /* Do the actual "commit", such as it is */
        s->state = TRANS_COMMIT;
 
@@ -4315,6 +4604,13 @@ AbortSubTransaction(void)
         */
        SetUserIdAndSecContext(s->prevUser, s->prevSecContext);
 
+       /* Exit from parallel mode, if necessary. */
+       if (IsInParallelMode())
+       {
+               AtEOSubXact_Parallel(false, s->subTransactionId);
+               s->parallelMode = false;
+       }
+
        /*
         * We can skip all this stuff if the subxact failed before creating a
         * ResourceOwner...
@@ -4455,6 +4751,7 @@ PushTransaction(void)
        s->blockState = TBLOCK_SUBBEGIN;
        GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext);
        s->prevXactReadOnly = XactReadOnly;
+       s->parallelMode = false;
 
        CurrentTransactionState = s;
 
@@ -4501,6 +4798,134 @@ PopTransaction(void)
        pfree(s);
 }
 
+/*
+ * EstimateTransactionStateSpace
+ *             Compute how many bytes SerializeTransactionState may need for
+ *             its output.  Overestimating slightly is harmless; we count the
+ *             XIDs at every nesting level plus any in ParallelCurrentXids.
+ */
+Size
+EstimateTransactionStateSpace(void)
+{
+       Size    slots = 3;      /* top XID, current XID, count of XIDs */
+       TransactionState level = CurrentTransactionState;
+
+       while (level != NULL)
+       {
+               if (TransactionIdIsValid(level->transactionId))
+                       slots = add_size(slots, 1);
+               slots = add_size(slots, level->nChildXids);
+               level = level->parent;
+       }
+       slots = add_size(slots, nParallelCurrentXids);
+       return mul_size(slots, sizeof(TransactionId));
+}
+
+/*
+ * SerializeTransactionState
+ *             Write out relevant details of our transaction state that will be
+ *             needed by a parallel worker.
+ *
+ * maxsize is taken to be the number of bytes available at start_address.
+ * Only the XIDs associated with this transaction are saved: slot 0 is the
+ * top-level XID and slot 1 the current XID (InvalidTransactionId if none),
+ * slot 2 is a count of how many additional XIDs follow, and those XIDs come
+ * next, one after another.  The output thus occupies (3 + count)
+ * TransactionId slots, which maxsize must cover.  We emit the XIDs in
+ * numerically sorted order for the convenience of the receiving process.
+ */
+void
+SerializeTransactionState(Size maxsize, char *start_address)
+{
+       TransactionState s;
+       Size    nxids = 0;
+       Size    i = 0;
+       TransactionId *workspace;
+       TransactionId *result = (TransactionId *) start_address;
+
+       Assert(maxsize >= 3 * sizeof(TransactionId));
+       result[0] = XactTopTransactionId;
+       result[1] = CurrentTransactionState->transactionId;
+
+       /*
+        * If we're running in a parallel worker and launching a parallel worker
+        * of our own, we can just pass along the information that was passed to
+        * us.  The buffer must hold the 3 header slots plus that XID array.
+        */
+       if (nParallelCurrentXids > 0)
+       {
+               Assert(maxsize >= (nParallelCurrentXids + 3) * sizeof(TransactionId));
+               result[2] = nParallelCurrentXids;
+               memcpy(&result[3], ParallelCurrentXids,
+                          nParallelCurrentXids * sizeof(TransactionId));
+               return;
+       }
+
+       /*
+        * OK, we need to generate a sorted list of XIDs that our workers
+        * should view as current.  First, figure out how many there are.
+        */
+       for (s = CurrentTransactionState; s != NULL; s = s->parent)
+       {
+               if (TransactionIdIsValid(s->transactionId))
+                       nxids = add_size(nxids, 1);
+               nxids = add_size(nxids, s->nChildXids);
+       }
+       Assert(maxsize >= (nxids + 3) * sizeof(TransactionId));
+
+       /* Copy them to our scratch space. */
+       workspace = palloc(nxids * sizeof(TransactionId));
+       for (s = CurrentTransactionState; s != NULL; s = s->parent)
+       {
+               if (TransactionIdIsValid(s->transactionId))
+                       workspace[i++] = s->transactionId;
+               memcpy(&workspace[i], s->childXids,
+                          s->nChildXids * sizeof(TransactionId));
+               i += s->nChildXids;
+       }
+       Assert(i == nxids);
+
+       /* Sort them. */
+       qsort(workspace, nxids, sizeof(TransactionId), xidComparator);
+
+       /* Copy data into output area. */
+       result[2] = (TransactionId) nxids;
+       memcpy(&result[3], workspace, nxids * sizeof(TransactionId));
+}
+
+/*
+ * StartParallelWorkerTransaction
+ *             Start a transaction, load the XIDs that SerializeTransactionState
+ *             wrote at tstatespace, and enter TBLOCK_PARALLEL_INPROGRESS.
+ */
+void
+StartParallelWorkerTransaction(char *tstatespace)
+{
+       TransactionId *tstate = (TransactionId *) tstatespace;
+
+       Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT);
+       StartTransaction();
+       /* [0] top XID, [1] current XID, [2] XID count, [3...] the XIDs */
+       XactTopTransactionId = tstate[0];
+       CurrentTransactionState->transactionId = tstate[1];
+       nParallelCurrentXids = (int) tstate[2];
+       ParallelCurrentXids = &tstate[3];       /* points into tstatespace; no copy */
+
+       CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS;
+}
+
+/*
+ * EndParallelWorkerTransaction
+ *             Commit a parallel worker's transaction; back to TBLOCK_DEFAULT.
+ */
+void
+EndParallelWorkerTransaction(void)
+{
+       Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS);
+       CommitTransaction();
+       CurrentTransactionState->blockState = TBLOCK_DEFAULT;
+}
+
 /*
  * ShowTransactionState
  *             Debug support
@@ -4571,6 +4996,8 @@ BlockStateAsString(TBlockState blockState)
                        return "BEGIN";
                case TBLOCK_INPROGRESS:
                        return "INPROGRESS";
+               case TBLOCK_PARALLEL_INPROGRESS:
+                       return "PARALLEL_INPROGRESS";
                case TBLOCK_END:
                        return "END";
                case TBLOCK_ABORT:
index 8cb2f13b27861b533845b85a4ea2bdd21c51ff5a..178eea90904d9b8c4cb485f51edd7ff6eba4d210 100644 (file)
@@ -917,9 +917,10 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed)
        {
                Assert(rel);
 
-               /* check read-only transaction */
+               /* check read-only transaction and parallel mode */
                if (XactReadOnly && !rel->rd_islocaltemp)
                        PreventCommandIfReadOnly("COPY FROM");
+               PreventCommandIfParallelMode("COPY FROM");
 
                cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program,
                                                           stmt->attlist, stmt->options);
index 622ccf751845498f886fc9fe0d993fa299d1ec20..d3ccdb90409a5d239cdbd4d1b74b967fffbe55bc 100644 (file)
@@ -551,6 +551,13 @@ nextval_internal(Oid relid)
        if (!seqrel->rd_islocaltemp)
                PreventCommandIfReadOnly("nextval()");
 
+       /*
+        * Forbid this during parallel operation because, to make it work,
+        * the cooperating backends would need to share the backend-local cached
+        * sequence information.  Currently, we don't support that.
+        */
+       PreventCommandIfParallelMode("nextval()");
+
        if (elm->last != elm->cached)           /* some numbers were cached */
        {
                Assert(elm->last_valid);
@@ -838,6 +845,13 @@ do_setval(Oid relid, int64 next, bool iscalled)
        if (!seqrel->rd_islocaltemp)
                PreventCommandIfReadOnly("setval()");
 
+       /*
+        * Forbid this during parallel operation because, to make it work,
+        * the cooperating backends would need to share the backend-local cached
+        * sequence information.  Currently, we don't support that.
+        */
+       PreventCommandIfParallelMode("setval()");
+
        /* lock page' buffer and read tuple */
        seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
 
index 20b3188dfdc1fd41d3adf9194cdb34c323bcfd60..b8222d90f44df730a2ae4ed3155b0e85cd3cb2da 100644 (file)
@@ -147,8 +147,20 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
        /*
         * If the transaction is read-only, we need to check if any writes are
         * planned to non-temporary tables.  EXPLAIN is considered read-only.
+        *
+        * Don't allow writes in parallel mode.  Supporting UPDATE and DELETE would
+        * require (a) storing the combocid hash in shared memory, rather than
+        * synchronizing it just once at the start of parallelism, and (b) an
+        * alternative to heap_update()'s reliance on xmax for mutual exclusion.
+        * INSERT may have no such troubles, but we forbid it to simplify the
+        * checks.
+        *
+        * We have lower-level defenses in CommandCounterIncrement and elsewhere
+        * against performing unsafe operations in parallel mode, but this gives
+        * a more user-friendly error message.
         */
-       if (XactReadOnly && !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+       if ((XactReadOnly || IsInParallelMode()) &&
+               !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
                ExecCheckXactReadOnly(queryDesc->plannedstmt);
 
        /*
@@ -691,18 +703,23 @@ ExecCheckRTEPerms(RangeTblEntry *rte)
 }
 
 /*
- * Check that the query does not imply any writes to non-temp tables.
+ * Check that the query does not imply any writes to non-temp tables;
+ * unless we're in parallel mode, in which case don't even allow writes
+ * to temp tables.
  *
  * Note: in a Hot Standby slave this would need to reject writes to temp
- * tables as well; but an HS slave can't have created any temp tables
- * in the first place, so no need to check that.
+ * tables just as we do in parallel mode; but an HS slave can't have created
+ * any temp tables in the first place, so no need to check that.
  */
 static void
 ExecCheckXactReadOnly(PlannedStmt *plannedstmt)
 {
        ListCell   *l;
 
-       /* Fail if write permissions are requested on any non-temp table */
+       /*
+        * Fail if write permissions are requested in parallel mode for any
+        * table (temp or non-temp); otherwise fail for any non-temp table.
+        */
        foreach(l, plannedstmt->rtable)
        {
                RangeTblEntry *rte = (RangeTblEntry *) lfirst(l);
@@ -713,6 +730,8 @@ ExecCheckXactReadOnly(PlannedStmt *plannedstmt)
                if ((rte->requiredPerms & (~ACL_SELECT)) == 0)
                        continue;
 
+               PreventCommandIfParallelMode(CreateCommandTag((Node *) plannedstmt));
+
                if (isTempNamespace(get_rel_namespace(rte->relid)))
                        continue;
 
index 84be37c7a39d5a8d19ddc522640b45c8ebdfdf60..40f2eec75748f541b64b26d88cf5ac8c78188eb5 100644 (file)
@@ -513,6 +513,9 @@ init_execution_state(List *queryTree_list,
                                           errmsg("%s is not allowed in a non-volatile function",
                                                          CreateCommandTag(stmt))));
 
+                       if (IsInParallelMode() && !CommandIsReadOnly(stmt))
+                               PreventCommandIfParallelMode(CreateCommandTag(stmt));
+
                        /* OK, build the execution_state for this query */
                        newes = (execution_state *) palloc(sizeof(execution_state));
                        if (preves)
index 4b86e910df827ad425c8fb1487d9720bcb00e090..3a93a04ba7020aebec4da11aab8dc2ad22745551 100644 (file)
@@ -23,6 +23,7 @@
 #include "commands/trigger.h"
 #include "executor/executor.h"
 #include "executor/spi_priv.h"
+#include "miscadmin.h"
 #include "tcop/pquery.h"
 #include "tcop/utility.h"
 #include "utils/builtins.h"
@@ -1322,13 +1323,14 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan,
        }
 
        /*
-        * If told to be read-only, we'd better check for read-only queries. This
-        * can't be done earlier because we need to look at the finished, planned
-        * queries.  (In particular, we don't want to do it between GetCachedPlan
-        * and PortalDefineQuery, because throwing an error between those steps
-        * would result in leaking our plancache refcount.)
+        * If told to be read-only, or in parallel mode, verify that this query
+        * is in fact read-only.  This can't be done earlier because we need to
+        * look at the finished, planned queries.  (In particular, we don't want
+        * to do it between GetCachedPlan and PortalDefineQuery, because throwing
+        * an error between those steps would result in leaking our plancache
+        * refcount.)
         */
-       if (read_only)
+       if (read_only || IsInParallelMode())
        {
                ListCell   *lc;
 
@@ -1337,11 +1339,16 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan,
                        Node       *pstmt = (Node *) lfirst(lc);
 
                        if (!CommandIsReadOnly(pstmt))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                               /* translator: %s is a SQL statement name */
-                                          errmsg("%s is not allowed in a non-volatile function",
-                                                         CreateCommandTag(pstmt))));
+                       {
+                               if (read_only)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                       /* translator: %s is a SQL statement name */
+                                                        errmsg("%s is not allowed in a non-volatile function",
+                                                                       CreateCommandTag(pstmt))));
+                               else
+                                       PreventCommandIfParallelMode(CreateCommandTag(pstmt));
+                       }
                }
        }
 
@@ -2129,6 +2136,9 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI,
                                           errmsg("%s is not allowed in a non-volatile function",
                                                          CreateCommandTag(stmt))));
 
+                       if (IsInParallelMode() && !CommandIsReadOnly(stmt))
+                               PreventCommandIfParallelMode(CreateCommandTag(stmt));
+
                        /*
                         * If not read-only mode, advance the command counter before each
                         * command and update the snapshot.
index 307fb60665c172ce8f8f8db0dd924f37dc7b6fc8..f12f2d582e8339f89b38cf34d10f159f83ff6a3f 100644 (file)
 #include "libpq/libpq.h"
 #include "libpq/pqformat.h"
 #include "libpq/pqmq.h"
+#include "miscadmin.h"
 #include "tcop/tcopprot.h"
 #include "utils/builtins.h"
 
 static shm_mq *pq_mq;
 static shm_mq_handle *pq_mq_handle;
 static bool pq_mq_busy = false;
+static pid_t pq_mq_parallel_master_pid = 0;    /* 0 = no master to signal */
+static BackendId pq_mq_parallel_master_backend_id = InvalidBackendId;
 
 static void mq_comm_reset(void);
 static int     mq_flush(void);
@@ -57,6 +60,18 @@ pq_redirect_to_shm_mq(shm_mq *mq, shm_mq_handle *mqh)
        FrontendProtocol = PG_PROTOCOL_LATEST;
 }
 
+/*
+ * Record the parallel master's PID and backend ID so that mq_putmessage()
+ * sends it PROCSIG_PARALLEL_MESSAGE whenever it transmits via the shm_mq.
+ */
+void
+pq_set_parallel_master(pid_t pid, BackendId backend_id)
+{
+       Assert(PqCommMethods == &PqCommMqMethods);
+       pq_mq_parallel_master_pid = pid;
+       pq_mq_parallel_master_backend_id = backend_id;
+}
+
 static void
 mq_comm_reset(void)
 {
@@ -120,7 +135,23 @@ mq_putmessage(char msgtype, const char *s, size_t len)
        iov[1].len = len;
 
        Assert(pq_mq_handle != NULL);
-       result = shm_mq_sendv(pq_mq_handle, iov, 2, false);
+
+       for (;;)
+       {
+               result = shm_mq_sendv(pq_mq_handle, iov, 2, true);
+
+               if (pq_mq_parallel_master_pid != 0)
+                       SendProcSignal(pq_mq_parallel_master_pid,
+                                                  PROCSIG_PARALLEL_MESSAGE,
+                                                  pq_mq_parallel_master_backend_id);
+
+               if (result != SHM_MQ_WOULD_BLOCK)
+                       break;
+
+               WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0);
+               CHECK_FOR_INTERRUPTS();
+               ResetLatch(&MyProc->procLatch);
+       }
 
        pq_mq_busy = false;
 
index 267b91632712b9e7cfd00bd2bc94858fbcce7495..f80141ab845342f74135d52edac70ffb7eca3314 100644 (file)
@@ -965,6 +965,63 @@ WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp)
        return status;
 }
 
+/*
+ * Wait for a background worker to stop.
+ *
+ * If the worker hasn't yet started, or is running, we wait for it to stop
+ * and then return BGWH_STOPPED.  However, if the postmaster has died, we give
+ * up and return BGWH_POSTMASTER_DIED, because it's the postmaster that
+ * notifies us when a worker's state changes.
+ *
+ * The wait loop must be left with "break", never "return": returning from
+ * inside PG_TRY() would skip PG_END_TRY(), so the error-handler stack and
+ * set_latch_on_sigusr1 would not be restored.
+ */
+BgwHandleStatus
+WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *handle)
+{
+       BgwHandleStatus status;
+       int                     rc;
+       bool            save_set_latch_on_sigusr1;
+
+       save_set_latch_on_sigusr1 = set_latch_on_sigusr1;
+       set_latch_on_sigusr1 = true;
+
+       PG_TRY();
+       {
+               for (;;)
+               {
+                       pid_t           pid;
+
+                       CHECK_FOR_INTERRUPTS();
+
+                       status = GetBackgroundWorkerPid(handle, &pid);
+                       if (status == BGWH_STOPPED)
+                               break;
+
+                       rc = WaitLatch(&MyProc->procLatch,
+                                                  WL_LATCH_SET | WL_POSTMASTER_DEATH, 0);
+
+                       if (rc & WL_POSTMASTER_DEATH)
+                       {
+                               status = BGWH_POSTMASTER_DIED;
+                               break;
+                       }
+
+                       ResetLatch(&MyProc->procLatch);
+               }
+       }
+       PG_CATCH();
+       {
+               set_latch_on_sigusr1 = save_set_latch_on_sigusr1;
+               PG_RE_THROW();
+       }
+       PG_END_TRY();
+
+       set_latch_on_sigusr1 = save_set_latch_on_sigusr1;
+       return status;
+}
+
 /*
  * Instruct the postmaster to terminate a background worker.
  *
index a1ebc72d8d55730aace019f565055794f5b44720..32701d3c7c3d263915b6337a06208fcf756c7add 100644 (file)
@@ -1685,6 +1685,50 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
        return result;
 }
 
+/*
+ * ProcArrayInstallRestoredXmin -- install restored xmin into MyPgXact->xmin
+ *
+ * This is like ProcArrayInstallImportedXmin, but the source transaction is
+ * identified by a pointer to its PGPROC rather than by an XID.
+ *
+ * Returns TRUE if xmin was installed.  Returns FALSE, leaving our xmin alone,
+ * if proc is in another database or has no normal xmin at or before xmin.
+ */
+bool
+ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
+{
+       bool            result = false;
+       TransactionId xid;
+       volatile PGXACT *pgxact;
+
+       Assert(TransactionIdIsNormal(xmin));
+       Assert(proc != NULL);
+
+       /* Get lock so source xact can't end while we're doing this */
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       pgxact = &allPgXact[proc->pgprocno];
+
+       /*
+        * Be certain that the referenced PGPROC has an advertised xmin which
+        * is no later than the one we're installing, so that the system-wide
+        * xmin can't go backwards.  Also, make sure it's running in the same
+        * database, so that the per-database xmin cannot go backwards.
+        */
+       xid = pgxact->xmin;             /* fetch just once */
+       if (proc->databaseId == MyDatabaseId &&
+               TransactionIdIsNormal(xid) &&
+               TransactionIdPrecedesOrEquals(xid, xmin))
+       {
+               MyPgXact->xmin = TransactionXmin = xmin;
+               result = true;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       return result;
+}
+
 /*
  * GetRunningTransactionData -- returns information about running transactions.
  *
index 48573bef60b4c1f456849c18671e2dd581f67f40..4945f596da841c3f4b3df03aa2627cd337a7e299 100644 (file)
@@ -17,6 +17,7 @@
 #include <signal.h>
 #include <unistd.h>
 
+#include "access/parallel.h"
 #include "commands/async.h"
 #include "miscadmin.h"
 #include "storage/latch.h"
@@ -274,6 +275,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
        if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT))
                HandleNotifyInterrupt();
 
+       if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE))
+               HandleParallelMessageInterrupt(true);
+
        if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE))
                RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE);
 
index 292bed55c6123ec87159d3b2500350c58decff98..6002d51ceecb20c3e146cdffff81fa3fc3d810c1 100644 (file)
@@ -366,7 +366,8 @@ ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
                ResolveRecoveryConflictWithVirtualXIDs(backends,
                                                                                         PROCSIG_RECOVERY_CONFLICT_LOCK);
 
-               if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
+               if (LockAcquireExtended(&locktag, AccessExclusiveLock,
+                                                               true, true, false, false)
                        != LOCKACQUIRE_NOT_AVAIL)
                        lock_acquired = true;
        }
@@ -592,7 +593,8 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
         */
        SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
 
-       if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
+       if (LockAcquireExtended(&locktag, AccessExclusiveLock,
+                                                       true, true, false, false)
                == LOCKACQUIRE_NOT_AVAIL)
                ResolveRecoveryConflictWithLock(newlock->dbOid, newlock->relOid);
 }
index 1eb2d4b68da5bf1a5ee218f8e14535139f4f4c33..9129fa494a34fc473d7ba6b897ae2bd3934e7980 100644 (file)
@@ -669,7 +669,8 @@ LockAcquire(const LOCKTAG *locktag,
                        bool sessionLock,
                        bool dontWait)
 {
-       return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, true);
+       return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, true,
+                                                          false);
 }
 
 /*
@@ -680,13 +681,20 @@ LockAcquire(const LOCKTAG *locktag,
  * caller to note that the lock table is full and then begin taking
  * extreme action to reduce the number of other lock holders before
  * retrying the action.
+ *
+ * parallelReacquire should be false except for the case of a parallel
+ * worker reacquiring locks already held by the parallel group leader.  In
+ * that case, we never log the lock acquisition since the parent has already
+ * done it; and more importantly and surprisingly, we ignore lock conflicts.
+ * See src/backend/access/transam/README.parallel for further discussion.
  */
 LockAcquireResult
 LockAcquireExtended(const LOCKTAG *locktag,
                                        LOCKMODE lockmode,
                                        bool sessionLock,
                                        bool dontWait,
-                                       bool reportMemoryError)
+                                       bool reportMemoryError,
+                                       bool parallelReacquire)
 {
        LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
        LockMethod      lockMethodTable;
@@ -797,7 +805,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
        if (lockmode >= AccessExclusiveLock &&
                locktag->locktag_type == LOCKTAG_RELATION &&
                !RecoveryInProgress() &&
-               XLogStandbyInfoActive())
+               XLogStandbyInfoActive() && !parallelReacquire)
        {
                LogAccessExclusiveLockPrepare();
                log_lock = true;
@@ -910,9 +918,12 @@ LockAcquireExtended(const LOCKTAG *locktag,
        /*
         * If lock requested conflicts with locks requested by waiters, must join
         * wait queue.  Otherwise, check for conflict with already-held locks.
-        * (That's last because most complex check.)
+        * (That's last because most complex check.)  Parallel reacquire never
+        * conflicts.
         */
-       if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+       if (parallelReacquire)
+               status = STATUS_OK;
+       else if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
                status = STATUS_FOUND;
        else
                status = LockCheckConflicts(lockMethodTable, lockmode,
@@ -3557,6 +3568,84 @@ GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
        return LockMethods[lockmethodid]->lockModeNames[mode];
 }
 
+/*
+ * Compute an upper bound on the space SerializeLockState() needs: a long
+ * for the entry count plus one LOCALLOCKTAG per local lock table entry.
+ */
+Size
+EstimateLockStateSpace(void)
+{
+       long            nentries = hash_get_num_entries(LockMethodLocalHash);
+
+       return add_size(sizeof(long), mul_size(nentries, sizeof(LOCALLOCKTAG)));
+}
+
+/*
+ * Serialize relevant heavyweight lock state into the memory beginning at
+ * start_address: a long entry count followed by that many LOCALLOCKTAGs.
+ * maxsize should be at least the value returned by EstimateLockStateSpace.
+ */
+void
+SerializeLockState(Size maxsize, char *start_address)
+{
+       char       *endptr = start_address + maxsize;
+       char       *curptr = start_address + sizeof(long);
+       HASH_SEQ_STATUS status;
+       LOCALLOCK  *locallock;
+       long            count = 0;
+
+       hash_seq_init(&status, LockMethodLocalHash);
+
+       while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+       {
+               if (locallock->nLocks == 0)
+                       continue;
+
+               /*
+                * We only copy ordinary heavyweight locks; advisory lock operations
+                * are prohibited while in parallel mode.
+                */
+               if (locallock->tag.lock.locktag_lockmethodid != DEFAULT_LOCKMETHOD)
+                       continue;
+               /* Require room for a whole tag, not merely one free byte. */
+               if (endptr - curptr < (ptrdiff_t) sizeof(LOCALLOCKTAG))
+                       elog(ERROR, "not enough space to serialize lock state");
+
+               memcpy(curptr, &locallock->tag, sizeof(LOCALLOCKTAG));
+               curptr += sizeof(LOCALLOCKTAG);
+               count++;
+       }
+
+       memcpy(start_address, &count, sizeof(long));
+}
+
+/*
+ * Retake the locks specified by the serialized lock state.
+ */
+void
+RestoreLockState(char *start_address)
+{
+       char       *tagptr = start_address + sizeof(long);
+       long            ntags;
+       long            i;
+
+       /* The leading long says how many LOCALLOCKTAGs follow it. */
+       memcpy(&ntags, start_address, sizeof(long));
+
+       for (i = 0; i < ntags; i++)
+       {
+               LOCALLOCKTAG    tag;
+
+               memcpy(&tag, tagptr, sizeof(LOCALLOCKTAG));
+               tagptr += sizeof(LOCALLOCKTAG);
+
+               /* Last argument is parallelReacquire: reacquire a leader's lock. */
+               if (LockAcquireExtended(&tag.lock, tag.mode, false, false,
+                                                               true, true) != LOCKACQUIRE_OK)
+                       elog(ERROR, "parallel worker lock not reacquired OK");
+       }
+}
+
 #ifdef LOCK_DEBUG
 /*
  * Dump all locks in the given proc's myProcLocks lists.
index b81ebeb260ab009ab093946123fa0ed3087ee959..01e03f0e8427fe14c105af38d06b58d38d6c4320 100644 (file)
@@ -1653,6 +1653,14 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot,
 
        Assert(!RecoveryInProgress());
 
+       /*
+        * Since all parts of a serializable transaction must use the same
+        * snapshot, it is too late to establish one after a parallel operation
+        * has begun.
+        */
+       if (IsInParallelMode())
+               elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
+
        proc = MyProc;
        Assert(proc != NULL);
        GET_VXID_FROM_PGPROC(vxid, *proc);
index 0ce637a28aa8b0e05b83ae25d8a2976f8ae73392..bbad0dc1bc80ac351d573bb8b0766e0e4ec9a57c 100644 (file)
@@ -36,6 +36,7 @@
 #include "rusagestub.h"
 #endif
 
+#include "access/parallel.h"
 #include "access/printtup.h"
 #include "access/xact.h"
 #include "catalog/pg_type.h"
@@ -2963,7 +2964,8 @@ ProcessInterrupts(void)
                                         errmsg("canceling statement due to user request")));
                }
        }
-       /* If we get here, do nothing (probably, QueryCancelPending was reset) */
+       if (ParallelMessagePending)
+               HandleParallelMessageInterrupt(false);
 }
 
 
index 3533cfa22d457e19c7eb8235266d965891058cc7..faa9b5526317217d1ac22f24c8500f3b70ac0c35 100644 (file)
@@ -128,14 +128,15 @@ CommandIsReadOnly(Node *parsetree)
 static void
 check_xact_readonly(Node *parsetree)
 {
-       if (!XactReadOnly)
+       /* Only perform the check if we have a reason to do so. */
+       if (!XactReadOnly && !IsInParallelMode())
                return;
 
        /*
         * Note: Commands that need to do more complicated checking are handled
         * elsewhere, in particular COPY and plannable statements do their own
-        * checking.  However they should all call PreventCommandIfReadOnly to
-        * actually throw the error.
+        * checking.  However they should all call PreventCommandIfReadOnly
+        * or PreventCommandIfParallelMode to actually throw the error.
         */
 
        switch (nodeTag(parsetree))
@@ -207,6 +208,7 @@ check_xact_readonly(Node *parsetree)
                case T_ImportForeignSchemaStmt:
                case T_SecLabelStmt:
                        PreventCommandIfReadOnly(CreateCommandTag(parsetree));
+                       PreventCommandIfParallelMode(CreateCommandTag(parsetree));
                        break;
                default:
                        /* do nothing */
@@ -231,6 +233,24 @@ PreventCommandIfReadOnly(const char *cmdname)
                                                cmdname)));
 }
 
+/*
+ * PreventCommandIfParallelMode: report an error naming cmdname when the
+ * current (sub)transaction is in parallel mode; otherwise return quietly.
+ * Its main value is one uniform message for all the callers that use it.
+ */
+void
+PreventCommandIfParallelMode(const char *cmdname)
+{
+       if (!IsInParallelMode())
+               return;
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+       /* translator: %s is name of a SQL command, eg CREATE */
+                        errmsg("cannot execute %s during a parallel operation",
+                                       cmdname)));
+}
+
 /*
  * PreventCommandDuringRecovery: throw error if RecoveryInProgress
  *
@@ -630,6 +650,7 @@ standard_ProcessUtility(Node *parsetree,
                case T_ClusterStmt:
                        /* we choose to allow this during "read only" transactions */
                        PreventCommandDuringRecovery("CLUSTER");
+                       /* forbidden in parallel mode due to CommandIsReadOnly */
                        cluster((ClusterStmt *) parsetree, isTopLevel);
                        break;
 
@@ -640,6 +661,7 @@ standard_ProcessUtility(Node *parsetree,
                                /* we choose to allow this during "read only" transactions */
                                PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ?
                                                                                         "VACUUM" : "ANALYZE");
+                               /* forbidden in parallel mode due to CommandIsReadOnly */
                                vacuum(stmt, InvalidOid, true, NULL, false, isTopLevel);
                        }
                        break;
@@ -716,6 +738,7 @@ standard_ProcessUtility(Node *parsetree,
                         * outside a transaction block is presumed to be user error.
                         */
                        RequireTransactionChain(isTopLevel, "LOCK TABLE");
+                       /* forbidden in parallel mode due to CommandIsReadOnly */
                        LockTableCommand((LockStmt *) parsetree);
                        break;
 
@@ -747,6 +770,7 @@ standard_ProcessUtility(Node *parsetree,
 
                                /* we choose to allow this during "read only" transactions */
                                PreventCommandDuringRecovery("REINDEX");
+                               /* forbidden in parallel mode due to CommandIsReadOnly */
                                switch (stmt->kind)
                                {
                                        case REINDEX_OBJECT_INDEX:
index a1967b69632f95654a045b9a0fa9eb902233af5c..21d9f73d9d8fb437e93ae5dcb3ebbc299a47d5e6 100644 (file)
@@ -420,6 +420,7 @@ pg_advisory_lock_int8(PG_FUNCTION_ARGS)
        int64           key = PG_GETARG_INT64(0);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_lock_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        (void) LockAcquire(&tag, ExclusiveLock, true, false);
@@ -437,6 +438,7 @@ pg_advisory_xact_lock_int8(PG_FUNCTION_ARGS)
        int64           key = PG_GETARG_INT64(0);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_xact_lock_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        (void) LockAcquire(&tag, ExclusiveLock, false, false);
@@ -453,6 +455,7 @@ pg_advisory_lock_shared_int8(PG_FUNCTION_ARGS)
        int64           key = PG_GETARG_INT64(0);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_lock_shared_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        (void) LockAcquire(&tag, ShareLock, true, false);
@@ -470,6 +473,7 @@ pg_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS)
        int64           key = PG_GETARG_INT64(0);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_xact_lock_shared_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        (void) LockAcquire(&tag, ShareLock, false, false);
@@ -489,6 +493,7 @@ pg_try_advisory_lock_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_lock_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockAcquire(&tag, ExclusiveLock, true, true);
@@ -509,6 +514,7 @@ pg_try_advisory_xact_lock_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_xact_lock_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockAcquire(&tag, ExclusiveLock, false, true);
@@ -528,6 +534,7 @@ pg_try_advisory_lock_shared_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_lock_shared_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockAcquire(&tag, ShareLock, true, true);
@@ -548,6 +555,7 @@ pg_try_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_xact_lock_shared_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockAcquire(&tag, ShareLock, false, true);
@@ -567,6 +575,7 @@ pg_advisory_unlock_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        bool            res;
 
+       PreventCommandIfParallelMode("pg_advisory_unlock_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockRelease(&tag, ExclusiveLock, true);
@@ -586,6 +595,7 @@ pg_advisory_unlock_shared_int8(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        bool            res;
 
+       PreventCommandIfParallelMode("pg_advisory_unlock_shared_int8()");
        SET_LOCKTAG_INT64(tag, key);
 
        res = LockRelease(&tag, ShareLock, true);
@@ -603,6 +613,7 @@ pg_advisory_lock_int4(PG_FUNCTION_ARGS)
        int32           key2 = PG_GETARG_INT32(1);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_lock_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        (void) LockAcquire(&tag, ExclusiveLock, true, false);
@@ -621,6 +632,7 @@ pg_advisory_xact_lock_int4(PG_FUNCTION_ARGS)
        int32           key2 = PG_GETARG_INT32(1);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_xact_lock_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        (void) LockAcquire(&tag, ExclusiveLock, false, false);
@@ -638,6 +650,7 @@ pg_advisory_lock_shared_int4(PG_FUNCTION_ARGS)
        int32           key2 = PG_GETARG_INT32(1);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_lock_shared_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        (void) LockAcquire(&tag, ShareLock, true, false);
@@ -656,6 +669,7 @@ pg_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS)
        int32           key2 = PG_GETARG_INT32(1);
        LOCKTAG         tag;
 
+       PreventCommandIfParallelMode("pg_advisory_xact_lock_shared_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        (void) LockAcquire(&tag, ShareLock, false, false);
@@ -676,6 +690,7 @@ pg_try_advisory_lock_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_lock_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockAcquire(&tag, ExclusiveLock, true, true);
@@ -697,6 +712,7 @@ pg_try_advisory_xact_lock_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_xact_lock_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockAcquire(&tag, ExclusiveLock, false, true);
@@ -717,6 +733,7 @@ pg_try_advisory_lock_shared_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_lock_shared_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockAcquire(&tag, ShareLock, true, true);
@@ -738,6 +755,7 @@ pg_try_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        LockAcquireResult res;
 
+       PreventCommandIfParallelMode("pg_try_advisory_xact_lock_shared_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockAcquire(&tag, ShareLock, false, true);
@@ -758,6 +776,7 @@ pg_advisory_unlock_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        bool            res;
 
+       PreventCommandIfParallelMode("pg_advisory_unlock_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockRelease(&tag, ExclusiveLock, true);
@@ -778,6 +797,7 @@ pg_advisory_unlock_shared_int4(PG_FUNCTION_ARGS)
        LOCKTAG         tag;
        bool            res;
 
+       PreventCommandIfParallelMode("pg_advisory_unlock_shared_int4()");
        SET_LOCKTAG_INT32(tag, key1, key2);
 
        res = LockRelease(&tag, ShareLock, true);
index 1b6932235cb85a8fdf0de5fa8747ec355eab04a6..affb23afdae83d5f34fa5322fe1326c1e1af416d 100644 (file)
@@ -23,6 +23,7 @@
 #endif
 #include "lib/stringinfo.h"
 #include "miscadmin.h"
+#include "storage/shmem.h"
 #include "utils/dynamic_loader.h"
 #include "utils/hsearch.h"
 
@@ -697,3 +698,56 @@ find_rendezvous_variable(const char *varName)
 
        return &hentry->varValue;
 }
+
+/*
+ * Compute the space needed to serialize the names of the loaded libraries.
+ */
+Size
+EstimateLibraryStateSpace(void)
+{
+       DynamicFileList *lib = file_list;
+       Size    total = 1;              /* one byte besides the names */
+
+       while (lib != NULL)
+       {
+               /* each filename is counted together with its terminating NUL */
+               total = add_size(total, strlen(lib->filename) + 1);
+               lib = lib->next;
+       }
+       return total;
+}
+
+/*
+ * Serialize the list of libraries we have loaded to a chunk of memory.
+ */
+void
+SerializeLibraryState(Size maxsize, char *start_address)
+{
+       DynamicFileList *lib;
+
+       for (lib = file_list; lib != NULL; lib = lib->next)
+       {
+               Size len;
+
+               len = strlcpy(start_address, lib->filename, maxsize) + 1;
+               /* Need the whole name plus at least one byte for the final NUL. */
+               if (len >= maxsize)
+                       elog(ERROR, "not enough space to serialize library state");
+               maxsize -= len;
+               start_address += len;
+       }
+       start_address[0] = '\0';
+}
+
+/*
+ * Load each library named in the serialized list; an empty name ends it.
+ */
+void
+RestoreLibraryState(char *start_address)
+{
+       char       *libname;
+
+       for (libname = start_address; *libname != '\0';
+                libname += strlen(libname) + 1)
+               internal_load_library(libname);
+}
index 95727776d3851a2d1a55ca3d3fa824a7cf492bbc..de988ba690cdf41b2ad26e99334c765fae57f562 100644 (file)
@@ -5602,6 +5602,20 @@ set_config_option(const char *name, const char *value,
                        elevel = ERROR;
        }
 
+       /*
+        * GUC_ACTION_SAVE changes are acceptable during a parallel operation,
+        * because the current worker will also pop the change.  We're probably
+        * dealing with a function having a proconfig entry.  Only the function's
+        * body should observe the change, and peer workers do not share in the
+        * execution of a function call started by this worker.
+        *
+        * Other changes might need to affect other workers, so forbid them.
+        */
+       if (IsInParallelMode() && changeVal && action != GUC_ACTION_SAVE)
+               ereport(elevel,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot set parameters during a parallel operation")));
+
        record = find_option(name, true, elevel);
        if (record == NULL)
        {
@@ -6906,6 +6920,15 @@ ExecSetVariableStmt(VariableSetStmt *stmt, bool isTopLevel)
 {
        GucAction       action = stmt->is_local ? GUC_ACTION_LOCAL : GUC_ACTION_SET;
 
+       /*
+        * Workers synchronize these parameters at the start of the parallel
+        * operation; then, we block SET during the operation.
+        */
+       if (IsInParallelMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                                errmsg("cannot set parameters during a parallel operation")));
+
        switch (stmt->kind)
        {
                case VAR_SET_VALUE:
index bfd7d0ad42677364c448cb2218f6ef97613e9f9a..cc5409b8803dfb9ee1c1716f71a461a11e6a1579 100644 (file)
@@ -44,6 +44,7 @@
 #include "miscadmin.h"
 #include "access/htup_details.h"
 #include "access/xact.h"
+#include "storage/shmem.h"
 #include "utils/combocid.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
@@ -286,3 +287,76 @@ GetRealCmax(CommandId combocid)
        Assert(combocid < usedComboCids);
        return comboCids[combocid].cmax;
 }
+
+/*
+ * Report how many bytes SerializeComboCIDState needs for the ComboCID state
+ * this backend currently holds: one int for the element count, plus one
+ * ComboCidKeyData per ComboCID in use.
+ */
+Size
+EstimateComboCIDStateSpace(void)
+{
+       /* One ComboCidKeyData (a cmin/cmax pair) per ComboCID in use. */
+       Size            keys_space = mul_size(sizeof(ComboCidKeyData), usedComboCids);
+
+       /* The element count, an int, is stored ahead of the key array. */
+       return add_size(sizeof(int), keys_space);
+}
+
+/*
+ * Serialize the ComboCID state into the memory, beginning at start_address.
+ * maxsize should be at least as large as the value returned by
+ * EstimateComboCIDStateSpace.
+ *
+ * The layout is an int holding the number of ComboCIDs, followed by that
+ * many ComboCidKeyData entries.  If maxsize is too small an ERROR is raised
+ * before anything has been written to the buffer.
+ */
+void
+SerializeComboCIDState(Size maxsize, char *start_address)
+{
+       /*
+        * If maxsize is too small, throw an error.  Do this before storing
+        * anything, so that an undersized buffer is never written to.  The
+        * estimate uses overflow-checked arithmetic.
+        */
+       if (maxsize < EstimateComboCIDStateSpace())
+               elog(ERROR, "not enough space to serialize ComboCID state");
+
+       /* First, we store the number of currently-existing ComboCIDs. */
+       * (int *) start_address = usedComboCids;
+
+       /*
+        * Now, copy the actual cmin/cmax pairs.  Skip the copy when there are
+        * none: comboCids can be NULL in that case (RestoreComboCIDState
+        * asserts as much for a backend without ComboCIDs), and handing memcpy
+        * a NULL pointer is undefined behavior even for a zero length.
+        */
+       if (usedComboCids > 0)
+               memcpy(start_address + sizeof(int), comboCids,
+                          (sizeof(ComboCidKeyData) * usedComboCids));
+}
+
+/*
+ * Rebuild this backend's ComboCID state from the image found at
+ * comboCIDstate: an int element count followed by that many cmin/cmax
+ * pairs.  The backend must not hold any ComboCIDs yet, and the result is
+ * only meaningful if the transaction state is serialized and restored too.
+ */
+void
+RestoreComboCIDState(char *comboCIDstate)
+{
+       int                     nserialized;
+       ComboCidKeyData *pairs;
+       int                     idx = 0;
+
+       Assert(!comboCids && !comboHash);
+
+       /* The count comes first; the cmin/cmax pairs follow directly. */
+       nserialized = * (int *) comboCIDstate;
+       pairs = (ComboCidKeyData *) (comboCIDstate + sizeof(int));
+
+       /*
+        * Re-create the pairs in their original order through
+        * GetComboCommandId; each one must come back with the same ComboCID
+        * as its position in the serialized array.
+        */
+       while (idx < nserialized)
+       {
+               CommandId       assigned;
+
+               assigned = GetComboCommandId(pairs[idx].cmin, pairs[idx].cmax);
+               if (assigned != idx)
+                       elog(ERROR, "unexpected command ID while restoring combo CIDs");
+               idx++;
+       }
+}
index 7cfa0cf848e5baf6aef56a3074027942f2fb468d..cb0f412435b737272108f9616f8ea7e7f7ce9a83 100644 (file)
@@ -157,6 +157,22 @@ static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
 static void SnapshotResetXmin(void);
 
+/*
+ * Snapshot fields to be serialized.
+ *
+ * Only these fields need to be sent to the cooperating backend; the
+ * remaining ones can (and must) be set by the receiver upon restore.
+ */
+typedef struct SerializedSnapshotData
+{
+       TransactionId xmin;
+       TransactionId xmax;
+       uint32          xcnt;   /* # of XIDs in the xip array stored after this struct */
+       int32           subxcnt;        /* # of SubXIDs stored after xip; 0 if omitted */
+       bool            suboverflowed;
+       bool            takenDuringRecovery;
+       CommandId       curcid;
+} SerializedSnapshotData;
 
 /*
  * GetTransactionSnapshot
@@ -188,6 +204,10 @@ GetTransactionSnapshot(void)
                Assert(pairingheap_is_empty(&RegisteredSnapshots));
                Assert(FirstXactSnapshot == NULL);
 
+               if (IsInParallelMode())
+                       elog(ERROR,
+                                "cannot take query snapshot during a parallel operation");
+
                /*
                 * In transaction-snapshot mode, the first snapshot must live until
                 * end of xact regardless of what the caller does with it, so we must
@@ -238,6 +258,14 @@ GetTransactionSnapshot(void)
 Snapshot
 GetLatestSnapshot(void)
 {
+       /*
+        * We might be able to relax this, but nothing that could otherwise work
+        * needs it.
+        */
+       if (IsInParallelMode())
+               elog(ERROR,
+                        "cannot update SecondarySnapshot during a parallel operation");
+
        /*
         * So far there are no cases requiring support for GetLatestSnapshot()
         * during logical decoding, but it wouldn't be hard to add if required.
@@ -347,7 +375,8 @@ SnapshotSetCommandId(CommandId curcid)
  * in GetTransactionSnapshot.
  */
 static void
-SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
+SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid,
+                                          PGPROC *sourceproc)
 {
        /* Caller should have checked this already */
        Assert(!FirstSnapshotSet);
@@ -394,7 +423,15 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
         * doesn't seem worth contorting the logic here to avoid two calls,
         * especially since it's not clear that predicate.c *must* do this.
         */
-       if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcexid))
+       if (sourceproc != NULL)
+       {
+               if (!ProcArrayInstallRestoredXmin(CurrentSnapshot->xmin, sourceproc))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                        errmsg("could not import the requested snapshot"),
+                          errdetail("The source transaction is not running anymore.")));
+       }
+       else if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcexid))
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("could not import the requested snapshot"),
@@ -550,11 +587,24 @@ PushCopiedSnapshot(Snapshot snapshot)
 void
 UpdateActiveSnapshotCommandId(void)
 {
+       CommandId       save_curcid, curcid;
        Assert(ActiveSnapshot != NULL);
        Assert(ActiveSnapshot->as_snap->active_count == 1);
        Assert(ActiveSnapshot->as_snap->regd_count == 0);
 
-       ActiveSnapshot->as_snap->curcid = GetCurrentCommandId(false);
+       /*
+        * Don't allow modification of the active snapshot during parallel
+        * operation.  We share the snapshot with worker backends at the start
+        * of a parallel operation, so any change to the snapshot can lead to
+        * inconsistencies.  We have other defenses against
+        * CommandCounterIncrement, but there are a few places that call this
+        * directly, so we put an additional guard here.
+        */
+       save_curcid = ActiveSnapshot->as_snap->curcid;
+       curcid = GetCurrentCommandId(false);
+       if (IsInParallelMode() && save_curcid != curcid)
+               elog(ERROR, "cannot modify commandid in active snapshot during a parallel operation");
+       ActiveSnapshot->as_snap->curcid = curcid;
 }
 
 /*
@@ -1289,7 +1339,7 @@ ImportSnapshot(const char *idstr)
                          errmsg("cannot import a snapshot from a different database")));
 
        /* OK, install the snapshot */
-       SetTransactionSnapshot(&snapshot, src_xid);
+       SetTransactionSnapshot(&snapshot, src_xid, NULL);
 }
 
 /*
@@ -1393,3 +1443,159 @@ HistoricSnapshotGetTupleCids(void)
        Assert(HistoricSnapshotActive());
        return tuplecid_data;
 }
+
+/*
+ * EstimateSnapshotSpace
+ *             Returns the size needed to store the given snapshot.
+ *
+ * Only the fields kept in SerializedSnapshotData are exported, followed by
+ * the XID array and, when it is worth keeping, the SubXID array.
+ */
+Size
+EstimateSnapshotSpace(Snapshot snap)
+{
+       Size            total;
+       bool            keep_subxip;
+
+       Assert(snap != InvalidSnapshot);
+       Assert(snap->satisfies == HeapTupleSatisfiesMVCC);
+
+       /* Fixed-size header, immediately followed by the xip array. */
+       total = add_size(sizeof(SerializedSnapshotData),
+                                        mul_size(snap->xcnt, sizeof(TransactionId)));
+
+       /* An overflowed subxip array is kept only for recovery snapshots. */
+       keep_subxip = snap->takenDuringRecovery || !snap->suboverflowed;
+       if (!keep_subxip || snap->subxcnt <= 0)
+               return total;
+
+       return add_size(total, mul_size(snap->subxcnt, sizeof(TransactionId)));
+}
+
+/*
+ * SerializeSnapshot
+ *             Dumps the serialized snapshot (extracted from given snapshot) onto the
+ *             memory location at start_address.
+ *
+ * The image consists of a SerializedSnapshotData header, then the xip array,
+ * then the subxip array if it is kept.  maxsize must be at least the value
+ * returned by EstimateSnapshotSpace for this snapshot, else an ERROR is
+ * raised before anything is written.
+ */
+void
+SerializeSnapshot(Snapshot snapshot, Size maxsize, char *start_address)
+{
+       SerializedSnapshotData *serialized_snapshot;
+
+       /* If the size is small, throw an error */
+       if (maxsize < EstimateSnapshotSpace(snapshot))
+               elog(ERROR, "not enough space to serialize given snapshot");
+
+       Assert(snapshot->xcnt >= 0);
+       Assert(snapshot->subxcnt >= 0);
+
+       serialized_snapshot = (SerializedSnapshotData *) start_address;
+
+       /* Copy all required fields */
+       serialized_snapshot->xmin = snapshot->xmin;
+       serialized_snapshot->xmax = snapshot->xmax;
+       serialized_snapshot->xcnt = snapshot->xcnt;
+       serialized_snapshot->subxcnt = snapshot->subxcnt;
+       serialized_snapshot->suboverflowed = snapshot->suboverflowed;
+       serialized_snapshot->takenDuringRecovery = snapshot->takenDuringRecovery;
+       serialized_snapshot->curcid = snapshot->curcid;
+
+       /*
+        * Ignore the SubXID array if it has overflowed, unless the snapshot
+        * was taken during recovery - in that case, top-level XIDs are in subxip
+        * as well, and we mustn't lose them.
+        */
+       if (serialized_snapshot->suboverflowed && !snapshot->takenDuringRecovery)
+               serialized_snapshot->subxcnt = 0;
+
+       /* Copy XID array */
+       if (snapshot->xcnt > 0)
+               memcpy((TransactionId *) (serialized_snapshot + 1),
+                          snapshot->xip, snapshot->xcnt * sizeof(TransactionId));
+
+       /*
+        * Copy SubXID array.  The decision must be based on the serialized
+        * count, not on snapshot->subxcnt: when the array was dropped above,
+        * EstimateSnapshotSpace reserved no room for it, so copying it anyway
+        * would write past the space the caller was asked to provide.
+        */
+       if (serialized_snapshot->subxcnt > 0)
+       {
+               Size subxipoff = sizeof(SerializedSnapshotData) +
+                       snapshot->xcnt * sizeof(TransactionId);
+
+               memcpy((TransactionId *) ((char *) serialized_snapshot + subxipoff),
+                          snapshot->subxip,
+                          serialized_snapshot->subxcnt * sizeof(TransactionId));
+       }
+}
+
+/*
+ * RestoreSnapshot
+ *             Restore a serialized snapshot from the specified address.
+ *
+ * start_address must point to an image laid out by SerializeSnapshot: a
+ * SerializedSnapshotData header followed by xcnt XIDs and then subxcnt
+ * SubXIDs.
+ *
+ * The copy is palloc'd in TopTransactionContext and has initial refcounts set
+ * to 0.  The returned snapshot has the copied flag set.
+ */
+Snapshot
+RestoreSnapshot(char *start_address)
+{
+       SerializedSnapshotData *serialized_snapshot;
+       Size            size;
+       Snapshot        snapshot;
+       TransactionId *serialized_xids;
+
+       serialized_snapshot = (SerializedSnapshotData *) start_address;
+       serialized_xids = (TransactionId *)
+               (start_address + sizeof(SerializedSnapshotData));
+
+       /* We allocate any XID arrays needed in the same palloc block. */
+       size = sizeof(SnapshotData)
+               + serialized_snapshot->xcnt * sizeof(TransactionId)
+               + serialized_snapshot->subxcnt * sizeof(TransactionId);
+
+       /* Copy all required fields */
+       snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
+       snapshot->satisfies = HeapTupleSatisfiesMVCC;
+       snapshot->xmin = serialized_snapshot->xmin;
+       snapshot->xmax = serialized_snapshot->xmax;
+       snapshot->xip = NULL;
+       snapshot->xcnt = serialized_snapshot->xcnt;
+       snapshot->subxip = NULL;
+       snapshot->subxcnt = serialized_snapshot->subxcnt;
+       snapshot->suboverflowed = serialized_snapshot->suboverflowed;
+       snapshot->takenDuringRecovery = serialized_snapshot->takenDuringRecovery;
+       snapshot->curcid = serialized_snapshot->curcid;
+
+       /* Copy XIDs, if present. */
+       if (serialized_snapshot->xcnt > 0)
+       {
+               snapshot->xip = (TransactionId *) (snapshot + 1);
+               memcpy(snapshot->xip, serialized_xids,
+                          serialized_snapshot->xcnt * sizeof(TransactionId));
+       }
+
+       /*
+        * Copy SubXIDs, if present.  They live after the xip slots in the same
+        * allocation.  Compute that position from the end of the struct rather
+        * than from snapshot->xip, which is still NULL when xcnt is zero; a
+        * snapshot with no XIDs but some SubXIDs would otherwise be copied
+        * through a NULL pointer.
+        */
+       if (serialized_snapshot->subxcnt > 0)
+       {
+               snapshot->subxip = ((TransactionId *) (snapshot + 1)) +
+                       serialized_snapshot->xcnt;
+               memcpy(snapshot->subxip, serialized_xids + serialized_snapshot->xcnt,
+                          serialized_snapshot->subxcnt * sizeof(TransactionId));
+       }
+
+       /* Set the copied flag so that the caller will set refcounts correctly. */
+       snapshot->regd_count = 0;
+       snapshot->active_count = 0;
+       snapshot->copied = true;
+
+       return snapshot;
+}
+
+/*
+ * Make a snapshot obtained from RestoreSnapshot the transaction snapshot.
+ *
+ * master_pgproc is declared as void * only so that snapmgr.h does not have
+ * to pull in the declaration of PGPROC; it is handed to
+ * SetTransactionSnapshot as the source PGPROC.
+ */
+void
+RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc)
+{
+       PGPROC     *sourceproc = (PGPROC *) master_pgproc;
+
+       /* No source XID is supplied; the source is identified by its PGPROC. */
+       SetTransactionSnapshot(snapshot, InvalidTransactionId, sourceproc);
+}
diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h
new file mode 100644 (file)
index 0000000..761ba1f
--- /dev/null
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.h
+ *       Infrastructure for launching parallel workers
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#include "lib/ilist.h"
+#include "postmaster/bgworker.h"
+#include "storage/shm_mq.h"
+#include "storage/shm_toc.h"
+#include "utils/elog.h"
+
+typedef void (*parallel_worker_main_type)(dsm_segment *seg, shm_toc *toc);
+
+typedef struct ParallelWorkerInfo
+{
+       BackgroundWorkerHandle *bgwhandle;      /* background worker handle (postmaster/bgworker.h) */
+       shm_mq_handle *error_mqh;       /* presumably the worker's error queue -- confirm in parallel.c */
+} ParallelWorkerInfo;
+
+typedef struct ParallelContext
+{
+       dlist_node node;        /* list link (lib/ilist.h) */
+       SubTransactionId subid; /* presumably the creating subtransaction -- confirm in parallel.c */
+       int nworkers;   /* cf. nworkers argument of CreateParallelContext* */
+       parallel_worker_main_type entrypoint;   /* cf. CreateParallelContext */
+       char *library_name;     /* cf. CreateParallelContextForExtension */
+       char *function_name;    /* cf. CreateParallelContextForExtension */
+       ErrorContextCallback *error_context_stack;
+       shm_toc_estimator estimator;
+       dsm_segment *seg;       /* same type as the seg argument of parallel_worker_main_type */
+       shm_toc *toc;   /* same type as the toc argument of parallel_worker_main_type */
+       ParallelWorkerInfo *worker;     /* presumably one entry per worker -- confirm in parallel.c */
+} ParallelContext;
+
+extern bool ParallelMessagePending;
+
+extern ParallelContext *CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers);
+extern ParallelContext *CreateParallelContextForExtension(char *library_name,
+                                                                 char *function_name, int nworkers);
+extern void InitializeParallelDSM(ParallelContext *);
+extern void LaunchParallelWorkers(ParallelContext *);
+extern void WaitForParallelWorkersToFinish(ParallelContext *);
+extern void DestroyParallelContext(ParallelContext *);
+extern bool ParallelContextActive(void);
+
+extern void HandleParallelMessageInterrupt(bool signal_handler);
+extern void AtEOXact_Parallel(bool isCommit);
+extern void AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId);
+
+#endif   /* PARALLEL_H */
index 8205504e6e7c4fc89f7016bf90a71daeebbd0237..8fd3772ac2989bc02bd65d42fdc87b45c1c46114 100644 (file)
@@ -77,9 +77,12 @@ extern bool MyXactAccessedTempRel;
 typedef enum
 {
        XACT_EVENT_COMMIT,
+       XACT_EVENT_PARALLEL_COMMIT,
        XACT_EVENT_ABORT,
+       XACT_EVENT_PARALLEL_ABORT,
        XACT_EVENT_PREPARE,
        XACT_EVENT_PRE_COMMIT,
+       XACT_EVENT_PARALLEL_PRE_COMMIT,
        XACT_EVENT_PRE_PREPARE
 } XactEvent;
 
@@ -241,6 +244,10 @@ extern void BeginInternalSubTransaction(char *name);
 extern void ReleaseCurrentSubTransaction(void);
 extern void RollbackAndReleaseCurrentSubTransaction(void);
 extern bool IsSubTransaction(void);
+extern Size EstimateTransactionStateSpace(void);
+extern void SerializeTransactionState(Size maxsize, char *start_address);
+extern void StartParallelWorkerTransaction(char *tstatespace);
+extern void EndParallelWorkerTransaction(void);
 extern bool IsTransactionBlock(void);
 extern bool IsTransactionOrTransactionBlock(void);
 extern char TransactionBlockStatusCode(void);
@@ -260,4 +267,8 @@ extern void xact_redo(XLogReaderState *record);
 extern void xact_desc(StringInfo buf, XLogReaderState *record);
 extern const char *xact_identify(uint8 info);
 
+extern void EnterParallelMode(void);
+extern void ExitParallelMode(void);
+extern bool IsInParallelMode(void);
+
 #endif   /* XACT_H */
index 418f6aadaa6bf4d1d820acb1d99dbd8106d01753..b9a5c40f598fcae450df48f4c3950a92a9ff7f81 100644 (file)
@@ -642,6 +642,9 @@ extern PGFunction load_external_function(char *filename, char *funcname,
 extern PGFunction lookup_external_function(void *filehandle, char *funcname);
 extern void load_file(const char *filename, bool restricted);
 extern void **find_rendezvous_variable(const char *varName);
+extern Size EstimateLibraryStateSpace(void);
+extern void SerializeLibraryState(Size maxsize, char *start_address);
+extern void RestoreLibraryState(char *start_address);
 
 /*
  * Support for aggregate functions
index 5f2815ca902c965d21983b1bb08e589d49baa62a..ad7589d4edbc0cca271cb3adebc1a92c399310b6 100644 (file)
@@ -17,6 +17,7 @@
 #include "storage/shm_mq.h"
 
 extern void    pq_redirect_to_shm_mq(shm_mq *, shm_mq_handle *);
+extern void pq_set_parallel_master(pid_t pid, BackendId backend_id);
 
 extern void pq_parse_errornotice(StringInfo str, ErrorData *edata);
 
index 83198ed60c40d2ca799286fe9dcb946b4144e18c..ed6eda2f02d2279fc0a174496755db632233df14 100644 (file)
@@ -259,6 +259,7 @@ extern void check_stack_depth(void);
 
 /* in tcop/utility.c */
 extern void PreventCommandIfReadOnly(const char *cmdname);
+extern void PreventCommandIfParallelMode(const char *cmdname);
 extern void PreventCommandDuringRecovery(const char *cmdname);
 
 /* in utils/misc/guc.c */
index a81b90badcb77d3486bfd247e5015ec2b422be48..de9180df91b4bc9bdd3eb7258f29bb25a2cb3faf 100644 (file)
@@ -112,6 +112,8 @@ extern BgwHandleStatus GetBackgroundWorkerPid(BackgroundWorkerHandle *handle,
 extern BgwHandleStatus
 WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *
                                                           handle, pid_t *pid);
+extern BgwHandleStatus
+WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *);
 
 /* Terminate a bgworker */
 extern void TerminateBackgroundWorker(BackgroundWorkerHandle *handle);
index 11009237866c3c3a870ff600e507bdd43d778835..5b61ce4c7cc3aeea9307ec6f8b4e179aa2b5ec6a 100644 (file)
@@ -503,7 +503,8 @@ extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
                                        LOCKMODE lockmode,
                                        bool sessionLock,
                                        bool dontWait,
-                                       bool report_memory_error);
+                                       bool report_memory_error,
+                                       bool parallelReacquire);
 extern void AbortStrongLockAcquire(void);
 extern bool LockRelease(const LOCKTAG *locktag,
                        LOCKMODE lockmode, bool sessionLock);
@@ -564,4 +565,9 @@ extern void VirtualXactLockTableInsert(VirtualTransactionId vxid);
 extern void VirtualXactLockTableCleanup(void);
 extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait);
 
+/* Parallel worker state sharing. */
+extern Size EstimateLockStateSpace(void);
+extern void SerializeLockState(Size maxsize, char *start_address);
+extern void RestoreLockState(char *start_address);
+
 #endif   /* LOCK_H */
index 97c6e9344e9bf8555006ac342c11c7827a8d64e2..a9b40ed944f7e8dbb526efb05eaa2c4d75746715 100644 (file)
@@ -46,6 +46,7 @@ extern Snapshot GetSnapshotData(Snapshot snapshot);
 
 extern bool ProcArrayInstallImportedXmin(TransactionId xmin,
                                                         TransactionId sourcexid);
+extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc);
 
 extern RunningTransactions GetRunningTransactionData(void);
 
index ac9d236decf5effc85e479a4c152373c46a5e7a5..af1a0cd71f25b900ed9187b5007203142b858a1d 100644 (file)
@@ -31,6 +31,7 @@ typedef enum
 {
        PROCSIG_CATCHUP_INTERRUPT,      /* sinval catchup interrupt */
        PROCSIG_NOTIFY_INTERRUPT,       /* listen/notify interrupt */
+       PROCSIG_PARALLEL_MESSAGE,       /* message from cooperating parallel backend */
 
        /* Recovery conflict reasons */
        PROCSIG_RECOVERY_CONFLICT_DATABASE,
index ce7b47c24eb96a21d671c25e3bf9446a108d717b..f2faa12623720148c135c84be5dc0f3852faf29b 100644 (file)
@@ -21,5 +21,8 @@
  */
 
 extern void AtEOXact_ComboCid(void);
+extern void RestoreComboCIDState(char *comboCIDstate);
+extern void SerializeComboCIDState(Size maxsize, char *start_address);
+extern Size EstimateComboCIDStateSpace(void);
 
 #endif   /* COMBOCID_H */
index 64d2ec1e5e110d6f528cc3843d3389eff2a10b20..5167e170c63d5f9fbe9bd2d80e4585d8776268da 100644 (file)
@@ -64,4 +64,10 @@ extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB *tuplecids)
 extern void TeardownHistoricSnapshot(bool is_error);
 extern bool HistoricSnapshotActive(void);
 
+extern Size EstimateSnapshotSpace(Snapshot snapshot);
+extern void SerializeSnapshot(Snapshot snapshot, Size maxsize,
+                                                         char *start_address);
+extern Snapshot RestoreSnapshot(char *start_address);
+extern void RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc);
+
 #endif   /* SNAPMGR_H */