Improve node exclusion and node rejoining logic for calculation of global xmin.
author Pavan Deolasee <[email protected]>
Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
committer Pavan Deolasee <[email protected]>
Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)

When a node rejoins the cluster, after disconnection or restart, the logic now
accounts for the fact that the node might be running with an older xmin. GTM
sends back appropriate error codes and recent state information so that the
node can make a decision to join the cluster or fail.

Also increase the threshold for delay in reporting to 10 minutes to avoid
false positives.

src/backend/access/transam/gtm.c
src/backend/postmaster/clustermon.c
src/backend/storage/ipc/procarray.c
src/gtm/main/gtm_txn.c
src/gtm/recovery/register_common.c
src/include/gtm/register.h

index 26defd944e154a7fc7d7c2714655df967762b2d8..64f39d19ad3550f0d357f33411fade6f77524c50 100644 (file)
@@ -704,10 +704,8 @@ ReportGlobalXmin(GlobalTransactionId gxid, GlobalTransactionId *global_xmin,
        if (!conn)
                return EOF;
 
-       if (report_global_xmin(conn, PGXCNodeName,
+       report_global_xmin(conn, PGXCNodeName,
                        IS_PGXC_COORDINATOR ?  GTM_NODE_COORDINATOR : GTM_NODE_DATANODE,
-                       gxid, global_xmin, latest_completed_xid, &errcode))
-               return errcode;
-       else
-               return 0;
+                       gxid, global_xmin, latest_completed_xid, &errcode);
+       return errcode;
 }
index f2148667e2d10a54530a7dda855b645cb82c30a7..2ff09787edb600e6846b58827830b8a8d06ef8ef 100644 (file)
@@ -212,8 +212,11 @@ ClusterMonitorInit(void)
                if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
                                                &latestCompletedXid)))
                {
-                       elog(DEBUG2, "Failed to report RecentGlobalXmin to GTM - %d:%d",
-                                       status, newOldestXmin);
+                       elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
+                                       "- reported RecentGlobalXmin %d, received "
+                                       "RecentGlobalXmin %d, " "received latestCompletedXid",
+                                       status, oldestXmin, newOldestXmin,
+                                       latestCompletedXid);
                        if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
                                status == GTM_ERRCODE_NODE_EXCLUDED)
                        {
@@ -238,14 +241,17 @@ ClusterMonitorInit(void)
                                        SetLatestCompletedXid(latestCompletedXid);
                                        continue;
                                }
-                               elog(PANIC, "Global xmin computation mismatch");
                        }
                }
                else
                {
+                       elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d,"
+                                       "received RecentGlobalXmin %d, "
+                                       "received latestCompletedXid %d", oldestXmin,
+                                       newOldestXmin, latestCompletedXid);
+
                        SetLatestCompletedXid(latestCompletedXid);
                        ClusterMonitorSetReportedGlobalXmin(oldestXmin);
-                       elog(DEBUG2, "Updating global_xmin to %d", newOldestXmin);
                        if (GlobalTransactionIdIsValid(newOldestXmin))
                                ClusterMonitorSetGlobalXmin(newOldestXmin);
                }
@@ -373,15 +379,20 @@ void
 ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
 {
        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       /*
+        * Do a consistency check to ensure that we NEVER have running transactions
+        * with xmin less than what the GTM has already computed. This should
+        * never happen during normal execution, but if we have ever been
+        * excluded from the xmin calculation by the GTM while we are still
+        * running old transactions, PANIC is our best bet to avoid corruption.
+        */ 
        ProcArrayCheckXminConsistency(xmin);
 
        SpinLockAcquire(&ClusterMonitorCtl->mutex);
        ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
        SpinLockRelease(&ClusterMonitorCtl->mutex);
 
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xmin))
-               ShmemVariableCache->latestCompletedXid = xmin;
-
        LWLockRelease(ProcArrayLock);
 }
 
index 55456cd3beca27fd2f960c0b99f80aca3f750eb6..216891485b26024587f95b1370ba105543994933 100644 (file)
@@ -1302,6 +1302,9 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
        TransactionIdAdvance(result);
 #endif
 
+       elog(DEBUG1, "GetOldestXminInternal - Starting computation with"
+                       "latestCompletedXid %d + 1", result);
+
        for (index = 0; index < arrayP->numProcs; index++)
        {
                int                     pgprocno = arrayP->pgprocnos[index];
@@ -1342,7 +1345,7 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
                         */
 #ifdef XCP
 
-                       elog(DEBUG3, "proc: pid:%d, xmin: %d, xid: %d", proc->pid,
+                       elog(DEBUG1, "proc: pid:%d, xmin: %d, xid: %d", proc->pid,
                                        xmin, xid);
 
                        if (TransactionIdIsNormal(xmin) &&
index f6453532a6d4536c9eecf3dce60a9761205df115..bc678f9770d723751d5676d030449b9bfc0f89e4 100644 (file)
@@ -565,7 +565,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count,
                if (GlobalTransactionIdIsValid(gtm_txninfo->gti_gxid))
                {
                        gxid[ii] = gtm_txninfo->gti_gxid;
-                       elog(DEBUG2, "GTM_TransactionInfo has XID already assgined - %s:%d",
+                       elog(DEBUG1, "GTM_TransactionInfo has XID already assgined - %s:%d",
                                        gtm_txninfo->gti_global_session_id, gxid[ii]);
                        continue;
                }
@@ -604,7 +604,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count,
 
                GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid);
 
-               elog(DEBUG2, "Assigning new transaction ID = %s:%d",
+               elog(DEBUG1, "Assigning new transaction ID = %s:%d",
                                gtm_txninfo->gti_global_session_id, xid);
                gxid[ii] = gtm_txninfo->gti_gxid = xid;
                new_handle[*new_txn_count] = gtm_txninfo->gti_handle;
@@ -725,7 +725,7 @@ GTM_BeginTransactionMulti(GTM_IsolationLevel isolevel[],
                if (txn != InvalidTransactionHandle)
                {
                        gtm_txninfo[kk] = GTM_HandleToTransactionInfo(txn);
-                       elog(DEBUG2, "Existing transaction found: %s:%d",
+                       elog(DEBUG1, "Existing transaction found: %s:%d",
                                        gtm_txninfo[kk]->gti_global_session_id,
                                        gtm_txninfo[kk]->gti_gxid);
                        txns[kk] = txn;
index f8d2748a77e0fe9d7c072ea0496aa10897d5b398..ebca82b82887d8b12cc057fee7a5cdf26d045795 100644 (file)
@@ -941,9 +941,9 @@ GTM_InitNodeManager(void)
 }
 
 /* 
- * Set to 120 seconds, but should be a few multiple for cluster monitor naptime
+ * Set to 600 seconds; should be a few multiples of the cluster monitor naptime
  */ 
-#define GTM_REPORT_XMIN_DELAY_THRESHOLD (120 * 1000)
+#define GTM_REPORT_XMIN_DELAY_THRESHOLD (600 * 1000)
 
 GlobalTransactionId
 GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
@@ -988,20 +988,31 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
         * get opportunity to report xmin in a timely fashion, we shouldn't get
         * into this situation often.
         *
-        * The exception to this rule is that if the remote node is idle, then we
-        * actually ignore the xmin reported by it and instead calculate a new xmin
-        * for it and send it back in respone. The remote node will still done
-        * final sanity check and either accept that xmin or kill itself via PANIC
-        * mechanism.
         */
        if ((mynodeinfo->excluded) &&
-                       GlobalTransactionIdPrecedes(mynodeinfo->reported_xmin,
-                               GTM_GlobalXmin))
+                       GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin))
        {
                *errcode = GTM_ERRCODE_NODE_EXCLUDED;
+
+               /*
+                * This node is joining back the cluster after being excluded from the
+                * GTM_GlobalXmin calculation because of timeout, disconnection or node
+                * failure. In such cases, we send appropriate error back to the node
+                * and let it handle the situation. To ensure that our GTM_GlobalXmin
+                * does not keep advancing while the node is trying to join back the
+                * cluster, we temporarily set reported_xmin to the current
+                * GTM_GlobalXmin and wait to see if the node finally catches up.
+                *
+                * Note: If the node had old transactions running while it was excluded
+                * by the GTM, it will fail the consistency checks and restart itself.
+                */
+               mynodeinfo->joining = true;
+               mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent();
+               mynodeinfo->reported_xmin = GTM_GlobalXmin;
+
                GTM_RWLockRelease(&mynodeinfo->node_lock);
                elog(LOG, "GTM_ERRCODE_NODE_EXCLUDED - node_name %s, reported_xmin %d "
-                               "previously reported_xmin, GTM_GlobalXmin %d", node_name,
+                               "previously reported_xmin %d, GTM_GlobalXmin %d", node_name,
                                reported_xmin,
                                mynodeinfo->reported_xmin,
                                GTM_GlobalXmin);
@@ -1009,13 +1020,18 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
        }
 
        /*
-        * The remote node must not report a xmin which precedes the xmin it had
-        * reported in the past. If it ever happens, send an error back and let the
-        * remote node restart itself
+        * The remote node must not report a xmin which precedes the GTM_GlobalXmin
+        * we have already computed. If it ever happens, send an error back and let
+        * the remote node handle it, possibly restarting itself
         */
-       if (GlobalTransactionIdPrecedes(reported_xmin, mynodeinfo->reported_xmin))
+       if (GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin))
        {
                *errcode = GTM_ERRCODE_TOO_OLD_XMIN;
+
+               mynodeinfo->joining = true;
+               mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent();
+               mynodeinfo->reported_xmin = GTM_GlobalXmin;
+
                GTM_RWLockRelease(&mynodeinfo->node_lock);
 
                /*
@@ -1028,8 +1044,8 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                 */
                if (mynodeinfo->reported_xmin_time)
                        elog(LOG, "GTM_ERRCODE_TOO_OLD_XMIN - node_name %s, reported_xmin %d, "
-                                       "previously reported_xmin %d", node_name,
-                                       reported_xmin, mynodeinfo->reported_xmin);
+                                       "previously reported_xmin %d, GTM_GlobalXmin %d", node_name,
+                                       reported_xmin, mynodeinfo->reported_xmin, GTM_GlobalXmin);
                return InvalidGlobalTransactionId;
        }
 
@@ -1038,7 +1054,15 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                        mynodeinfo->reported_xmin);
 
        mynodeinfo->reported_xmin = reported_xmin;
-       mynodeinfo->excluded = false;
+
+       /*
+        * Node joined back, set both excluded and joining to false
+        */
+       if (mynodeinfo->excluded)
+       {
+               mynodeinfo->excluded = false;
+               mynodeinfo->joining = false;
+       }
        mynodeinfo->reported_xmin_time = current_time = GTM_TimestampGetCurrent();
 
        GTM_RWLockRelease(&mynodeinfo->node_lock);
@@ -1060,13 +1084,22 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
        {
                GTM_PGXCNodeInfo *nodeinfo = all_nodes[ii];
 
-               elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %lld",
+               elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %ld",
                                nodeinfo, nodeinfo->type, nodeinfo->excluded ? 'T' : 'F',
                                nodeinfo->reported_xmin, nodeinfo->reported_xmin_time);
 
-               if (nodeinfo->excluded)
+               /*
+                * If a node has not reported its status for
+                * GTM_REPORT_XMIN_DELAY_THRESHOLD and is not in the process of
+                * rejoining the cluster, don't include it in the GTM_GlobalXmin
+                * calculation
+                */
+               if (nodeinfo->excluded && !nodeinfo->joining)
                        continue;
 
+               /*
+                * Care only for datanodes and coordinators
+                */
                if (nodeinfo->type != GTM_NODE_COORDINATOR && nodeinfo->type !=
                                GTM_NODE_DATANODE)
                        continue;
@@ -1087,6 +1120,7 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                                                current_time, GTM_REPORT_XMIN_DELAY_THRESHOLD))
                        {
                                nodeinfo->excluded = true;
+                               nodeinfo->joining = false;
                                GTM_RWLockRelease(&nodeinfo->node_lock);
                                continue;
                        }
index 0212a9ecf834763014c3f3b7162c064d4e06c10d..a50ffe0c218d80a751065b2ff17011144b21c017 100644 (file)
@@ -62,6 +62,7 @@ typedef struct GTM_PGXCNodeInfo
                                                                                         *  Has the node timed out and be
                                                                                         * excluded from xmin computation?
                                                                                         */
+       bool                            joining;        /* Is the node joining back */
        bool                            idle;                           /* Has the node been idle since
                                                                                         * last report
                                                                                         */