Track global GTM state via cluster monitor.
author Pavan Deolasee <[email protected]>
Mon, 8 Oct 2018 07:38:50 +0000 (13:08 +0530)
committer Pavan Deolasee <[email protected]>
Tue, 9 Oct 2018 07:31:54 +0000 (13:01 +0530)
We received a report that sometimes a transaction fails to see a recently
committed row, especially when the query tries to insert a duplicate primary
key, sees another concurrent insert and then waits for the other transaction to
commit. The next statement in the transaction should see the concurrently
inserted row if the transaction is running in read committed mode. But in
practice, sometimes GTM may see the final COMMIT message of the second
transaction after it hands out a new snapshot to the first transaction and this
snapshot still shows the second transaction as running. The MVCC check then
fails to see the row.

This is quite a complex situation and a tricky one to handle in a distributed
system without some kind of logical ordering of events. For example, if a
transaction T1 sees a COMMIT of another transaction T2 anywhere in the cluster,
all subsequent actions of T1 should see T2's COMMIT. Enforcing such logical
ordering is not trivial without additional communication overhead.

So what we are now doing is to ensure that a transaction is reported as
in-progress on all nodes until the GTM sees the final COMMIT. But instead of
querying the GTM every time, we now maintain a local copy of the GTM's view of
running transactions. This local copy is updated either by the cluster-monitor
process at regular intervals or when other backends fetch a new snapshot from
the GTM. In order to ensure that the state is only moved forward, we now also
have a concept of a snapshot_id, a counter which is incremented every time the
state on the GTM changes. Since it is a 64-bit counter, we don't need to worry
about a wrap-around. A transaction is considered to be in-progress as long as
it's open on the GTM.

Accept changes in the xc_for_update test case's expected output. In fact, the
old output was wrong since we were disregarding the prepared transaction holding
the AEL on the table. This looks like a separate bug, which should be
investigated in more detail.

17 files changed:
src/backend/postmaster/clustermon.c
src/backend/storage/ipc/procarray.c
src/backend/storage/lmgr/lmgr.c
src/gtm/client/fe-protocol.c
src/gtm/common/gtm_serialize.c
src/gtm/common/gtm_serialize_debug.c
src/gtm/main/gtm_snap.c
src/gtm/main/gtm_standby.c
src/gtm/main/gtm_txn.c
src/gtm/main/main.c
src/gtm/proxy/proxy_main.c
src/include/gtm/gtm.h
src/include/gtm/gtm_c.h
src/include/gtm/gtm_txn.h
src/include/postmaster/clustermon.h
src/include/storage/procarray.h
src/test/regress/expected/xc_for_update.out

index fbe5a1b044c599c7cddfde057db1a8485fb5b8ca..d881afaba74ce275648f8572f5eb77edcf17ec59 100644 (file)
@@ -57,6 +57,7 @@ static ClusterMonitorCtlData *ClusterMonitorCtl = NULL;
 
 static void cm_sighup_handler(SIGNAL_ARGS);
 static void cm_sigterm_handler(SIGNAL_ARGS);
+static void cm_sigint_handler(SIGNAL_ARGS);
 static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin);
 static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin);
 
@@ -65,26 +66,147 @@ int                        ClusterMonitorPid = 0;
 
 #define CLUSTER_MONITOR_NAPTIME        5
 
+/*
+ * Report xmin to the GTM and fetch the global xmin information in the
+ * response.
+ */
+static void
+ClusterMonitorReportXmin(void)
+{
+       GlobalTransactionId oldestXmin;
+       GlobalTransactionId newOldestXmin;
+       GlobalTransactionId lastGlobalXmin;
+       GlobalTransactionId latestCompletedXid;
+       int                                     status;
+
+       /*
+        * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
+        * interval. Keep doing this forever
+        */
+       lastGlobalXmin = ClusterMonitorGetGlobalXmin(true);
+       LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
+       oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
+       ClusterMonitorSetReportingGlobalXmin(oldestXmin);
+       LWLockRelease(ClusterMonitorLock);
+
+       if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
+                                       &latestCompletedXid)))
+       {
+               elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
+                               "- reported RecentGlobalXmin %u, received "
+                               "RecentGlobalXmin %u, " "received latestCompletedXid %u",
+                               status, oldestXmin, newOldestXmin,
+                               latestCompletedXid);
+               if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
+                               status == GTM_ERRCODE_NODE_EXCLUDED)
+               {
+                       /*
+                        * If we haven't seen a new transaction for a very long time or
+                        * were disconnected for a while or excluded from the xmin
+                        * computation for any reason, our xmin calculation could be
+                        * well in the past, especially because it's capped by the
+                        * latestCompletedXid which may not advance on an idle server.
+                        * In such cases, use the value of latestCompletedXid as
+                        * returned by GTM and then recompute local xmin.
+                        *
+                        * If the GTM's global xmin advances even further while we are
+                        * ready with a new xmin, just repeat the entire exercise as
+                        * long as GTM keeps returning us a more current value of
+                        * latestCompletedXid and thus pushing forward our local xmin
+                        * calculation
+                        */
+                       if (GlobalTransactionIdIsValid(latestCompletedXid) &&
+                                       TransactionIdPrecedes(oldestXmin, latestCompletedXid))
+                       {
+                               SetLatestCompletedXid(latestCompletedXid);
+                               return;
+                       }
+               }
+               else if (status == GTM_ERRCODE_NODE_NOT_REGISTERED)
+               {
+                       /*
+                        * If we're not registered on the GTM, it could be because the
+                        * GTM is restarted. Just exit and let the cluster monitor be
+                        * restarted again.
+                        */
+                       elog(WARNING, "ClusterMonitor process exiting - node not "
+                                       "registered on the GTM");
+                       proc_exit(0);
+               }
+       }
+       else
+       {
+               elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %u,"
+                               "received RecentGlobalXmin %u, "
+                               "received latestCompletedXid %u", oldestXmin,
+                               newOldestXmin, latestCompletedXid);
+
+               SetLatestCompletedXid(latestCompletedXid);
+               ClusterMonitorSetReportedGlobalXmin(oldestXmin);
+               if (GlobalTransactionIdIsValid(newOldestXmin))
+                       ClusterMonitorSetGlobalXmin(newOldestXmin);
+       }
+
+       ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
+}
+
+/*
+ * Update our local view of the global transactions using a recently fetched
+ * snapshot from the GTM. The snapshot contains the information about the
+ * currently running transactions and that's what we care about.
+ *
+ * We don't want to overwrite a future state with a past state just because a
+ * backend received an older snapshot. Checking snapshot->sn_snapid serves that
+ * purpose.
+ */
+void
+ClusterMonitorSyncGlobalStateUsingSnapshot(GTM_Snapshot snapshot)
+{
+       if (snapshot == NULL ||
+               snapshot->sn_snapid < ClusterMonitorCtl->gtm_snapid)
+               return;
+
+       /* Populate shared memory state */
+       SpinLockAcquire(&ClusterMonitorCtl->mutex);
+       ClusterMonitorCtl->gtm_xmin = snapshot->sn_xmin;
+       ClusterMonitorCtl->gtm_xmax = snapshot->sn_xmax;
+       ClusterMonitorCtl->gtm_xcnt = snapshot->sn_xcnt;
+       ClusterMonitorCtl->gtm_snapid = snapshot->sn_snapid;
+       memcpy((char *) ClusterMonitorCtl->gtm_xip, (char *) snapshot->sn_xip,
+                       sizeof (GlobalTransactionId) * snapshot->sn_xcnt);
+       SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+       /* Wake up all processes waiting on our CV. */
+       ConditionVariableBroadcast(&ClusterMonitorCtl->cv);
+}
+
+/*
+ * Sync global state.
+ */
+static void
+ClusterMonitorSyncGlobalState(void)
+{
+       GTM_Snapshot snapshot = GetSnapshotGTM(InvalidGlobalTransactionId, true);
+       ClusterMonitorSyncGlobalStateUsingSnapshot(snapshot);
+}
+
 /*
  * Main loop for the cluster monitor process.
  */
 int
 ClusterMonitorInit(void)
 {
-       sigjmp_buf      local_sigjmp_buf;
        GTM_PGXCNodeType nodetype = IS_PGXC_DATANODE ?
                                                                        GTM_NODE_DATANODE :
                                                                        GTM_NODE_COORDINATOR;
-       GlobalTransactionId oldestXmin;
-       GlobalTransactionId newOldestXmin;
-       GlobalTransactionId lastGlobalXmin;
-       GlobalTransactionId latestCompletedXid;
-       int                                     status;
+       sigjmp_buf      local_sigjmp_buf;
        bool                            bootingUp = true;
        int                                     aggreesiveReportingCount = 0;
 
        am_clustermon = true;
 
+       ClusterMonitorCtl->clustermonitor_pid = MyProcPid;
+
        /* Identify myself via ps */
        init_ps_display("cluster monitor process", "", "", "");
 
@@ -100,7 +222,7 @@ ClusterMonitorInit(void)
         * tcop/postgres.c.
         */
        pqsignal(SIGHUP, cm_sighup_handler);
-       pqsignal(SIGINT, StatementCancelHandler);
+       pqsignal(SIGINT, cm_sigint_handler);
        pqsignal(SIGTERM, cm_sigterm_handler);
 
        pqsignal(SIGQUIT, quickdie);
@@ -248,76 +370,8 @@ ClusterMonitorInit(void)
                        ProcessConfigFile(PGC_SIGHUP);
                }
 
-               /*
-                * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
-                * interval. Keep doing this forever
-                */
-               lastGlobalXmin = ClusterMonitorGetGlobalXmin(true);
-               LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
-               oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
-               ClusterMonitorSetReportingGlobalXmin(oldestXmin);
-               LWLockRelease(ClusterMonitorLock);
-
-               if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
-                                               &latestCompletedXid)))
-               {
-                       elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
-                                       "- reported RecentGlobalXmin %u, received "
-                                       "RecentGlobalXmin %u, " "received latestCompletedXid %u",
-                                       status, oldestXmin, newOldestXmin,
-                                       latestCompletedXid);
-                       if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
-                               status == GTM_ERRCODE_NODE_EXCLUDED)
-                       {
-                               /*
-                                * If we haven't seen a new transaction for a very long time or
-                                * were disconncted for a while or excluded from the xmin
-                                * computation for any reason, our xmin calculation could be
-                                * well in the past, especially because its capped by the
-                                * latestCompletedXid which may not advance on an idle server.
-                                * In such cases, use the value of latestCompletedXid as
-                                * returned by GTM and then recompute local xmin.
-                                *
-                                * If the GTM's global xmin advances even further while we are
-                                * ready with a new xmin, just repeat the entire exercise as
-                                * long as GTM keeps returning us a more current value of
-                                * latestCompletedXid and thus pushing forward our local xmin
-                                * calculation
-                                */
-                               if (GlobalTransactionIdIsValid(latestCompletedXid) &&
-                                               TransactionIdPrecedes(oldestXmin, latestCompletedXid))
-                               {
-                                       SetLatestCompletedXid(latestCompletedXid);
-                                       continue;
-                               }
-                       }
-                       else if (status == GTM_ERRCODE_NODE_NOT_REGISTERED)
-                       {
-                               /*
-                                * If we're not registered on the GTM, it could be because the
-                                * GTM is restarted. Just exit and let the cluster monitor be
-                                * restarted again.
-                                */
-                               elog(WARNING, "ClusterMonitor process exiting - node not "
-                                               "registered on the GTM");
-                               proc_exit(0);
-                       }
-               }
-               else
-               {
-                       elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %u,"
-                                       "received RecentGlobalXmin %u, "
-                                       "received latestCompletedXid %u", oldestXmin,
-                                       newOldestXmin, latestCompletedXid);
-
-                       SetLatestCompletedXid(latestCompletedXid);
-                       ClusterMonitorSetReportedGlobalXmin(oldestXmin);
-                       if (GlobalTransactionIdIsValid(newOldestXmin))
-                               ClusterMonitorSetGlobalXmin(newOldestXmin);
-               }
-
-               ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
-
+               ClusterMonitorReportXmin();
+               ClusterMonitorSyncGlobalState();
        }
 
        /* Normal exit from the cluster monitor is here */
@@ -354,6 +408,17 @@ cm_sigterm_handler(SIGNAL_ARGS)
 }
 
 
+/* SIGINT: time to report */
+static void
+cm_sigint_handler(SIGNAL_ARGS)
+{
+       int                     save_errno = errno;
+
+       SetLatch(MyLatch);
+
+       errno = save_errno;
+}
+
 /*
  * IsClusterMonitor functions
  *             Return whether this is either a cluster monitor process or a worker
@@ -385,6 +450,7 @@ ClusterMonitorShmemInit(void)
                /* First time through, so initialize */
                MemSet(ClusterMonitorCtl, 0, ClusterMonitorShmemSize());
                SpinLockInit(&ClusterMonitorCtl->mutex);
+               ConditionVariableInit(&ClusterMonitorCtl->cv);
        }
 }
 
@@ -480,3 +546,61 @@ ClusterMonitorGetReportingGlobalXmin(void)
 
        return reporting_xmin;
 }
+
+/*
+ * Wake up cluster monitor process.
+ */
+void
+ClusterMonitorWakeUp(void)
+{
+       (void ) kill(ClusterMonitorCtl->clustermonitor_pid, SIGINT);
+}
+
+/*
+ * ClusterMonitorTransactionIsInProgress
+ *
+ * Check if the given transaction is in-progress anywhere in the cluster. Our
+ * local copy of the global state may not be accurate and hence this might
+ * return a slightly stale result. But the callers should be prepared to deal
+ * with that.
+ */
+bool
+ClusterMonitorTransactionIsInProgress(GlobalTransactionId gxid)
+{
+       int             i;
+       bool    status = false;
+
+       SpinLockAcquire(&ClusterMonitorCtl->mutex);
+       if (GlobalTransactionIdPrecedes(gxid, ClusterMonitorCtl->gtm_xmin))
+               status = false;
+
+       if (GlobalTransactionIdFollowsOrEquals(gxid, ClusterMonitorCtl->gtm_xmax))
+               status = true;
+
+       for (i = 0; i < ClusterMonitorCtl->gtm_xcnt; i++)
+       {
+               if (GlobalTransactionIdEquals(ClusterMonitorCtl->gtm_xip[i], gxid))
+               {
+                       status = true;
+                       break;
+               }
+       }
+       SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+       return status;
+}
+
+/*
+ * ClusterMonitorWaitForEOFTransaction
+ *
+ * Wait for the given transaction to complete cluster-wide.
+ */
+void
+ClusterMonitorWaitForEOFTransaction(GlobalTransactionId gxid)
+{
+       ConditionVariablePrepareToSleep(&ClusterMonitorCtl->cv);
+       while (ClusterMonitorTransactionIsInProgress(gxid))
+               ConditionVariableSleep(&ClusterMonitorCtl->cv,
+                               WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+       ConditionVariableCancelSleep();
+}
index 390cad612cd05844234b5dabbedaea73baa9ceae..7e9cf57dfb0a411717865ebc412d8382f41c4a46 100644 (file)
@@ -1042,33 +1042,12 @@ ProcArrayApplyXidAssignment(TransactionId topxid,
 }
 
 /*
- * TransactionIdIsInProgress -- is given transaction running in some backend
- *
- * Aside from some shortcuts such as checking RecentXmin and our own Xid,
- * there are four possibilities for finding a running transaction:
- *
- * 1. The given Xid is a main transaction Id.  We will find this out cheaply
- * by looking at the PGXACT struct for each backend.
- *
- * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
- * We can find this out cheaply too.
- *
- * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
- * if the Xid is running on the master.
- *
- * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
- * if that is running according to PGXACT or KnownAssignedXids.  This is the
- * slowest way, but sadly it has to be done always if the others failed,
- * unless we see that the cached subxact sets are complete (none have
- * overflowed).
- *
- * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
- * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
- * This buys back some concurrency (and we can't retrieve the main Xids from
- * PGXACT again anyway; see GetNewTransactionId).
+ * Real workhorse for TransactionIdIsInProgress. If check_gtm is true, then
+ * also check cluster monitor's global state to see if the transaction is
+ * complete on the GTM too.
  */
 bool
-TransactionIdIsInProgress(TransactionId xid)
+TransactionIdIsInProgressExtended(TransactionId xid, bool check_gtm)
 {
        static TransactionId *xids = NULL;
        int                     nxids = 0;
@@ -1205,6 +1184,13 @@ TransactionIdIsInProgress(TransactionId xid)
                        xids[nxids++] = pxid;
        }
 
+       if (check_gtm && ClusterMonitorTransactionIsInProgress(xid))
+       {
+               elog(LOG, "ClusterMonitor reports xid %u as in-progress", xid);
+               LWLockRelease(ProcArrayLock);
+               return true;
+       }
+
        /*
         * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
         * in the list must be treated as running.
@@ -1276,6 +1262,38 @@ TransactionIdIsInProgress(TransactionId xid)
        return false;
 }
 
+/*
+ * TransactionIdIsInProgress -- is given transaction running in some backend
+ *
+ * Aside from some shortcuts such as checking RecentXmin and our own Xid,
+ * there are four possibilities for finding a running transaction:
+ *
+ * 1. The given Xid is a main transaction Id.  We will find this out cheaply
+ * by looking at the PGXACT struct for each backend.
+ *
+ * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
+ * We can find this out cheaply too.
+ *
+ * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
+ * if the Xid is running on the master.
+ *
+ * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
+ * if that is running according to PGXACT or KnownAssignedXids.  This is the
+ * slowest way, but sadly it has to be done always if the others failed,
+ * unless we see that the cached subxact sets are complete (none have
+ * overflowed).
+ *
+ * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
+ * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
+ * This buys back some concurrency (and we can't retrieve the main Xids from
+ * PGXACT again anyway; see GetNewTransactionId).
+ */
+bool
+TransactionIdIsInProgress(TransactionId xid)
+{
+       return TransactionIdIsInProgressExtended(xid, true);
+}
+
 /*
  * TransactionIdIsActive -- is xid the top-level XID of an active backend?
  *
@@ -3508,6 +3526,8 @@ retry:
                SetGlobalSnapshotData(gtm_snapshot->sn_xmin, gtm_snapshot->sn_xmax,
                                gtm_snapshot->sn_xcnt, gtm_snapshot->sn_xip, SNAPSHOT_DIRECT);
                GetSnapshotFromGlobalSnapshot(snapshot);
+
+               ClusterMonitorSyncGlobalStateUsingSnapshot(gtm_snapshot);
        }
        LWLockRelease(ClusterMonitorLock);
 }
index 896b94558bfd3904d6e6839868a390ce40d2cf1a..e1c74616d748b30e0f9d451047d4887660fdaa76 100644 (file)
@@ -20,6 +20,7 @@
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "postmaster/clustermon.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -604,8 +605,12 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
 
                LockRelease(&tag, ShareLock, false);
 
-               if (!TransactionIdIsInProgress(xid))
+               if (!TransactionIdIsInProgressExtended(xid, false))
+               {
+                       if (ClusterMonitorTransactionIsInProgress(xid))
+                               ClusterMonitorWaitForEOFTransaction(xid);
                        break;
+               }
 
                /*
                 * If the Xid belonged to a subtransaction, then the lock would have
@@ -656,8 +661,13 @@ ConditionalXactLockTableWait(TransactionId xid)
 
                LockRelease(&tag, ShareLock, false);
 
-               if (!TransactionIdIsInProgress(xid))
-                       break;
+               if (!TransactionIdIsInProgressExtended(xid, false))
+               {
+                       if (ClusterMonitorTransactionIsInProgress(xid) && !first)
+                               return false;
+                       else
+                               break;
+               }
 
                /* See XactLockTableWait about this case */
                if (!first)
index b52249bcea521bf03ddda2d10a888a8bc352c645..8479e51b9c07653aa041a367615b7b80267d269d 100644 (file)
@@ -495,6 +495,13 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
                                break;
                        }
 
+                       if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_snapid,
+                                                  sizeof (uint64), conn))
+                       {
+                               result->gr_status = GTM_RESULT_ERROR;
+                               break;
+                       }
+
                        if (gtmpqGetnchar((char *)&result->gr_snapshot.sn_xmin,
                                                   sizeof (GlobalTransactionId), conn))
                        {
index 4ff5ad72ec2e0e8c5c73b381b474fb3ef410d53a..acef732c918d573b48f1118ad99c92db5106a552 100644 (file)
@@ -486,6 +486,10 @@ gtm_serialize_transactions(GTM_Transactions *data, char *buf, size_t buflen)
        memcpy(buf + len, &(data->gt_latestCompletedXid), sizeof(GlobalTransactionId));
        len += sizeof(GlobalTransactionId);
 
+       /* GTM_Transactions.gt_snapid */
+       memcpy(buf + len, &(data->gt_snapid), sizeof(uint64));
+       len += sizeof(uint64);
+
        /* GTM_Transactions.gt_recent_global_xmin */
        memcpy(buf + len, &(data->gt_recent_global_xmin), sizeof(GlobalTransactionId));
        len += sizeof(GlobalTransactionId);
@@ -593,6 +597,10 @@ gtm_deserialize_transactions(GTM_Transactions *data, const char *buf, size_t max
        memcpy(&(data->gt_latestCompletedXid), buf + len, sizeof(GlobalTransactionId));
        len += sizeof(GlobalTransactionId);
 
+       /* GTM_Transactions.gt_snapid */
+       memcpy(&(data->gt_snapid), buf + len, sizeof(uint64));
+       len += sizeof(uint64);
+
        /* GTM_Transactions.gt_recent_global_xmin */
        memcpy(&(data->gt_recent_global_xmin), buf + len, sizeof(GlobalTransactionId));
        len += sizeof(GlobalTransactionId);
index d688211ff4d9cf40556e2fcb66f8de344d222c7f..b5ba133f41ecee61ebf979035c61a8c89b5381ec 100644 (file)
@@ -77,6 +77,7 @@ dump_transactions_elog(GTM_Transactions *txn, int num_txn)
        elog(LOG, "  gt_xidStopLimit: %d", txn->gt_xidStopLimit);
        elog(LOG, "  gt_xidWrapLimit: %d", txn->gt_xidWrapLimit);
        elog(LOG, "  gt_latestCompletedXid: %d", txn->gt_latestCompletedXid);
+       elog(LOG, "  gt_snapid: %lu", txn->gt_snapid);
        elog(LOG, "  gt_recent_global_xmin: %d", txn->gt_recent_global_xmin);
        elog(LOG, "  gt_lastslot: %d", txn->gt_lastslot);
 
index f5500656a5e82cc200200a61e54666e37368b15b..b8c5f4e59b6317a1b7d3a3fe66c99b29664299fb 100644 (file)
 #include "gtm/libpq-int.h"
 #include "gtm/pqformat.h"
 
+void
+GTM_AdvanceSnapshotCounter(void)
+{
+       GTMTransactions.gt_snapid++;
+}
+
 /*
  * GTM_GetTransactionSnapshot
  *             Compute and store snapshot(s) for specified transactions.
@@ -210,6 +216,9 @@ GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], int txn_count, int *s
        Assert(GlobalTransactionIdIsNormal(xmax));
        GlobalTransactionIdAdvance(xmax);
 
+       /* Get the snapshot id */
+       snapshot->sn_snapid = GTMTransactions.gt_snapid;
+
        /* initialize xmin calculation with xmax */
        globalxmin = xmin = xmax;
 
@@ -427,6 +436,7 @@ ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid)
        pq_sendbytes(&buf, (char *)&gxid, sizeof (GlobalTransactionId));
        pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
        pq_sendbytes(&buf, (char *)&status, sizeof(int) * txn_count);
+       pq_sendbytes(&buf, (char *)&snapshot->sn_snapid, sizeof (uint64));
        pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
        pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
 
@@ -496,6 +506,7 @@ ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message)
        }
        pq_sendbytes(&buf, (char *)&txn_count, sizeof(txn_count));
        pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count);
+       pq_sendbytes(&buf, (char *)&snapshot->sn_snapid, sizeof (uint64));
        pq_sendbytes(&buf, (char *)&snapshot->sn_xmin, sizeof (GlobalTransactionId));
        pq_sendbytes(&buf, (char *)&snapshot->sn_xmax, sizeof (GlobalTransactionId));
        /* Read once */
index 5ae1da0c284ca9cf51e9ab51b9af2a5847342b00..523fab8065c7e2eeed6025620d38ab16cbf9768f 100644 (file)
@@ -127,6 +127,7 @@ gtm_standby_restore_gxid(void)
        GTMTransactions.gt_xidStopLimit = txn.gt_xidStopLimit;
        GTMTransactions.gt_xidWrapLimit = txn.gt_xidWrapLimit;
        GTMTransactions.gt_latestCompletedXid = txn.gt_latestCompletedXid;
+       GTMTransactions.gt_snapid = txn.gt_snapid;
        GTMTransactions.gt_recent_global_xmin = txn.gt_recent_global_xmin;
        GTMTransactions.gt_lastslot = txn.gt_lastslot;
 
index 198e2d8132aeea68f4468f1893874b96b94cd533..6aacea5687090ba30a106ff30b0121076da2379e 100644 (file)
@@ -214,6 +214,7 @@ GTM_InitTxnManager(void)
         * XXX Newest XID that is committed or aborted
         */
        GTMTransactions.gt_latestCompletedXid = FirstNormalGlobalTransactionId;
+       GTMTransactions.gt_snapid = 1;
 
        /* Initialise gt_recent_global_xmin */
        GTMTransactions.gt_recent_global_xmin = FirstNormalGlobalTransactionId;
@@ -441,6 +442,9 @@ GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count)
         */
        GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
 
+       /* Next snapshot will yield a different result. */
+       GTM_AdvanceSnapshotCounter();
+
        for (ii = 0; ii < txn_count; ii++)
        {
                if (gtm_txninfo[ii] == NULL)
@@ -498,6 +502,9 @@ GTM_RemoveAllTransInfos(uint32 client_id, int backend_id)
         */
        GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
 
+       /* Next snapshot will yield a different result. */
+       GTM_AdvanceSnapshotCounter();
+
        prev = NULL;
        cell = gtm_list_head(GTMTransactions.gt_open_transactions);
        while (cell != NULL)
index e8a8d79fa9dd29a21de4157c4ae621e29ecb4fa7..7f33360c970df8173b39cce9efcbf5cbd9616470 100644 (file)
@@ -2321,6 +2321,7 @@ GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid,
 {
        GlobalTransactionId saved_gxid = InvalidGlobalTransactionId;
        GlobalTransactionId saved_global_xmin = InvalidGlobalTransactionId;
+       uint64                          saved_snapid = 1;
 
        if (ctlf)
        {
@@ -2336,6 +2337,19 @@ GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid,
 
                        if (fscanf(ctlf, "global_xmin: %u\n", &saved_global_xmin) != 1)
                                saved_global_xmin = InvalidGlobalTransactionId;
+
+                       saved_snapid = 1;
+               }
+               else if (context && context->version == 20181008)
+               {
+                       if (fscanf(ctlf, "next_xid: %u\n", &saved_gxid) != 1)
+                               saved_gxid = InvalidGlobalTransactionId;
+
+                       if (fscanf(ctlf, "global_xmin: %u\n", &saved_global_xmin) != 1)
+                               saved_global_xmin = InvalidGlobalTransactionId;
+
+                       if (fscanf(ctlf, "snapid: %lu\n", &saved_snapid) != 1)
+                               saved_snapid = 1;
                }
                else
                {
@@ -2399,7 +2413,7 @@ GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid,
                                                 " use -f option")));
                GTMTransactions.gt_recent_global_xmin = next_gxid;
        }
-
+       GTMTransactions.gt_snapid = saved_snapid;
        GTM_SetNextGlobalTransactionId(next_gxid);
        elog(LOG, "Restoring last GXID to %u\n", next_gxid);
        elog(LOG, "Restoring global xmin to %u\n",
@@ -2429,6 +2443,7 @@ GTM_SaveTxnInfo(FILE *ctlf)
 
        fprintf(ctlf, "next_xid: %u\n", next_gxid);
        fprintf(ctlf, "global_xmin: %u\n", global_xmin);
+       fprintf(ctlf, "snapid: %lu\n", GTMTransactions.gt_snapid);
 }
 
 void
@@ -2442,6 +2457,7 @@ GTM_WriteRestorePointXid(FILE *f)
        elog(DEBUG1, "Saving transaction restoration info, backed-up gxid: %u", GTMTransactions.gt_backedUpXid);
        fprintf(f, "next_xid: %u\n", GTMTransactions.gt_backedUpXid);
        fprintf(f, "global_xmin: %u\n", GTMTransactions.gt_backedUpXid);
+       fprintf(f, "snapid: %lu\n", GTMTransactions.gt_snapid);
 }
 
 void
index ba976eb938bec9b7bdac09b757d2b4b514f799eb..1b905870041b469492a2bfbe752eea7663c7e6a2 100644 (file)
@@ -1931,6 +1931,7 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo,
                                pq_sendint(&buf, SNAPSHOT_GET_MULTI_RESULT, 4);
                                pq_sendbytes(&buf, (char *)&txn_count, sizeof (txn_count));
                                pq_sendbytes(&buf, (char *)&status, sizeof (status));
+                               pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_snapid, sizeof (uint64));
                                pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmin, sizeof (GlobalTransactionId));
                                pq_sendbytes(&buf, (char *)&res->gr_snapshot.sn_xmax, sizeof (GlobalTransactionId));
                                pq_sendint(&buf, res->gr_snapshot.sn_xcnt, sizeof (int));
index 2141d332fca1c3e341895d5c89da09e173a25d68..05c111a1d348241293922b06b5efc3429f03dfc8 100644 (file)
@@ -147,6 +147,6 @@ extern GTM_ThreadID                                         TopMostThreadID;
        ((((a) + 1) == UINT32_MAX) ? 1 : ((a) + 1))
 
 #define GTM_CONTROL_FILE               "gtm.control"
-#define GTM_CONTROL_VERSION            20160302
+#define GTM_CONTROL_VERSION            20181008
 
 #endif
index 8918cf2b244b12ad3ca3271870bc0f722dc0f98e..fc618df0b1eada3f56258e337e610112a340171d 100644 (file)
@@ -108,9 +108,10 @@ typedef enum GTM_IsolationLevel
 
 typedef struct GTM_SnapshotData
 {
+       uint64                                  sn_snapid;	/* 64-bit snapshot id/counter shipped with the snapshot (sent ahead of sn_xmin in the multi-snapshot response); NOTE(review): presumably taken from GTMTransactions.gt_snapid - confirm */
         GlobalTransactionId             sn_xmin;
         GlobalTransactionId             sn_xmax;
-       uint32                          sn_xcnt;
+       uint32                                  sn_xcnt;	/* NOTE(review): presumably the number of valid entries in sn_xip - confirm; this line is a whitespace-only re-indent */
         GlobalTransactionId             *sn_xip;
 } GTM_SnapshotData;
 
index 54fd995a6e9f9fe8c0a980dd6978ebbab1aab522..489d2e96451c3612f4db5c97abadc2089e138534 100644 (file)
@@ -124,6 +124,7 @@ typedef struct GTM_Transactions
         */
        GlobalTransactionId gt_latestCompletedXid;      /* newest XID that has committed or
                                                                                                 * aborted */
+       uint64                          gt_snapid;              /* next snapshot id to assign */
 
        GlobalTransactionId     gt_recent_global_xmin;
 
@@ -182,6 +183,7 @@ void ProcessBkupBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo mes
 /*
  * In gtm_snap.c
  */
+void GTM_AdvanceSnapshotCounter(void);
 void ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid);
 void ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message);
 void GTM_RememberDroppedSequence(GlobalTransactionId gxid, void *seq);
index 8ec1281a743936cdaf44ea7a16d051335fe70b6a..a95dc4111427f2f6865deca0368c3d9344d36d39 100644 (file)
 #define CLUSTERMON_H
 
 #include "storage/s_lock.h"
+#include "storage/condition_variable.h"
 #include "gtm/gtm_c.h"
 
 typedef struct
 {
         slock_t                         mutex;
+       ConditionVariable       cv;	/* NOTE(review): presumably what ClusterMonitorWaitForEOFTransaction() sleeps on - confirm against clustermon.c */
         GlobalTransactionId     reported_recent_global_xmin;
         GlobalTransactionId     reporting_recent_global_xmin;
         GlobalTransactionId     gtm_recent_global_xmin;
+       pid_t                           clustermonitor_pid;	/* NOTE(review): presumably the cluster monitor's PID so ClusterMonitorWakeUp() can signal it - confirm */
+       uint64                          gtm_snapid;	/* snapshot id of the locally cached GTM state below; NOTE(review): per the commit message the cache must only move forward, so updates should compare against this - confirm */
+       GlobalTransactionId     gtm_xmin;	/* cached GTM snapshot xmin (local copy of the GTM's view of running transactions) */
+       GlobalTransactionId     gtm_xmax;	/* cached GTM snapshot xmax */
+       int                                     gtm_xcnt;	/* NOTE(review): presumably count of valid entries in gtm_xip; signed here while GTM_SnapshotData.sn_xcnt is uint32 - confirm conversions */
+       GlobalTransactionId     gtm_xip[GTM_MAX_GLOBAL_TRANSACTIONS];	/* fixed-size array; NOTE(review): writers must keep gtm_xcnt <= GTM_MAX_GLOBAL_TRANSACTIONS - confirm bound is enforced */
 } ClusterMonitorCtlData;
 
 extern void ClusterMonitorShmemInit(void);
@@ -39,6 +47,10 @@ extern int   StartClusterMonitor(void);
 extern GlobalTransactionId ClusterMonitorGetGlobalXmin(bool invalid_ok);
 extern void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin);
 extern GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void);
+extern void ClusterMonitorWakeUp(void);
+extern bool ClusterMonitorTransactionIsInProgress(GlobalTransactionId gxid);
+extern void ClusterMonitorWaitForEOFTransaction(GlobalTransactionId gxid);
+extern void ClusterMonitorSyncGlobalStateUsingSnapshot(GTM_Snapshot snapshot);
 
 #ifdef EXEC_BACKEND
 extern void ClusterMonitorIAm(void);
index 4e1622b174c2e5fd2f89b206948803de364963ef..53d81aa1a1312395930cb3734f548902ad2c9503 100644 (file)
@@ -115,6 +115,8 @@ extern void SetLatestCompletedXid(TransactionId latestCompletedXid);
 extern RunningTransactions GetRunningTransactionData(void);
 
 extern bool TransactionIdIsInProgress(TransactionId xid);
+extern bool TransactionIdIsInProgressExtended(TransactionId xid,
+               bool check_gtm);
 extern bool TransactionIdIsActive(TransactionId xid);
 extern TransactionId GetOldestXmin(Relation rel, int flags);
 extern TransactionId GetOldestXminInternal(Relation rel, int flags,
index 396b80738716c5fe671dd4e3a3db4786b922bb38..4309423fac495f2f74cd59e8f37708f5fd7fcf82 100644 (file)
@@ -685,11 +685,13 @@ set statement_timeout to 1000;
 
 --  3. update a row (Should fail)
        update mytab1 set val2=33 where val = 1;
+ERROR:  canceling statement due to statement timeout
 --  4. delete a row
 --     Newly Inserted (Should pass)
        delete from mytab1 where val2=456;
 --     Previously Inserted (Should fail)
        delete from mytab1 where val=1;
+ERROR:  canceling statement due to statement timeout
 --  5. inherit form it (Should pass)
        create table chld_mytab1(d int, e int) inherits (mytab1);
 --  6. create a view on it (Should pass)
@@ -745,7 +747,7 @@ ERROR:  could not obtain lock on relation "mytab1"
          fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
        end;
@@ -755,7 +757,7 @@ ERROR:  could not obtain lock on relation "mytab1"
          fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
        end;
@@ -796,14 +798,14 @@ declare c1 cursor for select * from mytab1 for share;
 fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
 declare c2 cursor for select * from mytab1 for update;
 fetch 1 from c2;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
 end;
@@ -813,7 +815,7 @@ declare c1 cursor for select * from mytab1 for share;
 fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
 prepare transaction 'tbl_mytab1_locked';
@@ -824,8 +826,9 @@ set statement_timeout to 1000;
        select * from mytab1 order by 1 ;
  val | val2 | val3 
 -----+------+------
+   1 |   11 | 1122
    2 |   11 | 3344
-(1 row)
+(2 rows)
 
 --  2. insert a row (Should pass)
        insert into mytab1 values(123,456);
@@ -835,11 +838,13 @@ set statement_timeout to 1000;
 
 --  3. update a row (Should fail)
        update mytab1 set val2=33 where val = 1;
+ERROR:  canceling statement due to statement timeout
 --  4. delete a row
 --     Newly Inserted (Should pass)
        delete from mytab1 where val2=456;
 --     Previously Inserted (Should fail)
        delete from mytab1 where val=1;
+ERROR:  canceling statement due to statement timeout
 --  5. inherit form it (Should pass)
        create table chld_mytab1(d int, e int) inherits (mytab1);
 --  6. create a view on it (Should pass)
@@ -896,7 +901,7 @@ ERROR:  could not obtain lock on relation "mytab1"
          fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
        end;
@@ -906,7 +911,7 @@ ERROR:  could not obtain lock on relation "mytab1"
          fetch 1 from c1;
  val | val2 | val3 
 -----+------+------
-   2 |   11 | 3344
+   1 |   11 | 1122
 (1 row)
 
        end;