Improve node exclusion and node rejoining logic for calculation of global xmin

author Pavan Deolasee <[email protected]>

Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)

committer Pavan Deolasee <[email protected]>

Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
author Pavan Deolasee <[email protected]>
Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
committer Pavan Deolasee <[email protected]>
Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c

index 26defd944e154a7fc7d7c2714655df967762b2d8..64f39d19ad3550f0d357f33411fade6f77524c50 100644 (file)
--- a/src/backend/access/transam/gtm.c
+++ b/src/backend/access/transam/gtm.c
@@ -704,10 +704,8 @@ ReportGlobalXmin(GlobalTransactionId gxid, GlobalTransactionId *global_xmin,
         if (!conn)
                 return EOF;
  
-       if (report_global_xmin(conn, PGXCNodeName,
+       report_global_xmin(conn, PGXCNodeName,
                         IS_PGXC_COORDINATOR ?  GTM_NODE_COORDINATOR : GTM_NODE_DATANODE,
-                       gxid, global_xmin, latest_completed_xid, &errcode))
-               return errcode;
-       else
-               return 0;
+                       gxid, global_xmin, latest_completed_xid, &errcode);
+       return errcode;
  }
diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c

index f2148667e2d10a54530a7dda855b645cb82c30a7..2ff09787edb600e6846b58827830b8a8d06ef8ef 100644 (file)
--- a/src/backend/postmaster/clustermon.c
+++ b/src/backend/postmaster/clustermon.c
@@ -212,8 +212,11 @@ ClusterMonitorInit(void)
                 if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
                                                 &latestCompletedXid)))
                 {
-                       elog(DEBUG2, "Failed to report RecentGlobalXmin to GTM - %d:%d",
-                                       status, newOldestXmin);
+                       elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
+                                       "- reported RecentGlobalXmin %d, received "
+                                       "RecentGlobalXmin %d, " "received latestCompletedXid",
+                                       status, oldestXmin, newOldestXmin,
+                                       latestCompletedXid);
                         if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
                                 status == GTM_ERRCODE_NODE_EXCLUDED)
                         {
@@ -238,14 +241,17 @@ ClusterMonitorInit(void)
                                         SetLatestCompletedXid(latestCompletedXid);
                                         continue;
                                 }
-                               elog(PANIC, "Global xmin computation mismatch");
                         }
                 }
                 else
                 {
+                       elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d,"
+                                       "received RecentGlobalXmin %d, "
+                                       "received latestCompletedXid %d", oldestXmin,
+                                       newOldestXmin, latestCompletedXid);
+
                         SetLatestCompletedXid(latestCompletedXid);
                         ClusterMonitorSetReportedGlobalXmin(oldestXmin);
-                       elog(DEBUG2, "Updating global_xmin to %d", newOldestXmin);
                         if (GlobalTransactionIdIsValid(newOldestXmin))
                                 ClusterMonitorSetGlobalXmin(newOldestXmin);
                 }
@@ -373,15 +379,20 @@ void
  ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
  {
         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       /*
+        * Do a consistency check to ensure that we NEVER have running transactions
+        * with xmin less than what the GTM has already computed. This should
+        * never happen during normal execution, but if we have ever been excluded
+        * from the xmin calculation by the GTM while we are still running old
+        * transactions, PANIC is our best bet to avoid corruption.
+        */ 
         ProcArrayCheckXminConsistency(xmin);
  
         SpinLockAcquire(&ClusterMonitorCtl->mutex);
         ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
         SpinLockRelease(&ClusterMonitorCtl->mutex);
  
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xmin))
-               ShmemVariableCache->latestCompletedXid = xmin;
-
         LWLockRelease(ProcArrayLock);
  }
  
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c

index 55456cd3beca27fd2f960c0b99f80aca3f750eb6..216891485b26024587f95b1370ba105543994933 100644 (file)
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1302,6 +1302,9 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
         TransactionIdAdvance(result);
  #endif
  
+       elog(DEBUG1, "GetOldestXminInternal - Starting computation with"
+                       "latestCompletedXid %d + 1", result);
+
         for (index = 0; index < arrayP->numProcs; index++)
         {
                 int                     pgprocno = arrayP->pgprocnos[index];
@@ -1342,7 +1345,7 @@ GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
                          */
  #ifdef XCP
  
-                       elog(DEBUG3, "proc: pid:%d, xmin: %d, xid: %d", proc->pid,
+                       elog(DEBUG1, "proc: pid:%d, xmin: %d, xid: %d", proc->pid,
                                         xmin, xid);
  
                         if (TransactionIdIsNormal(xmin) &&
diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c

index f6453532a6d4536c9eecf3dce60a9761205df115..bc678f9770d723751d5676d030449b9bfc0f89e4 100644 (file)
--- a/src/gtm/main/gtm_txn.c
+++ b/src/gtm/main/gtm_txn.c
@@ -565,7 +565,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count,
                 if (GlobalTransactionIdIsValid(gtm_txninfo->gti_gxid))
                 {
                         gxid[ii] = gtm_txninfo->gti_gxid;
-                       elog(DEBUG2, "GTM_TransactionInfo has XID already assgined - %s:%d",
+                       elog(DEBUG1, "GTM_TransactionInfo has XID already assgined - %s:%d",
                                         gtm_txninfo->gti_global_session_id, gxid[ii]);
                         continue;
                 }
@@ -604,7 +604,7 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count,
  
                 GlobalTransactionIdAdvance(GTMTransactions.gt_nextXid);
  
-               elog(DEBUG2, "Assigning new transaction ID = %s:%d",
+               elog(DEBUG1, "Assigning new transaction ID = %s:%d",
                                 gtm_txninfo->gti_global_session_id, xid);
                 gxid[ii] = gtm_txninfo->gti_gxid = xid;
                 new_handle[*new_txn_count] = gtm_txninfo->gti_handle;
@@ -725,7 +725,7 @@ GTM_BeginTransactionMulti(GTM_IsolationLevel isolevel[],
                 if (txn != InvalidTransactionHandle)
                 {
                         gtm_txninfo[kk] = GTM_HandleToTransactionInfo(txn);
-                       elog(DEBUG2, "Existing transaction found: %s:%d",
+                       elog(DEBUG1, "Existing transaction found: %s:%d",
                                         gtm_txninfo[kk]->gti_global_session_id,
                                         gtm_txninfo[kk]->gti_gxid);
                         txns[kk] = txn;
diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c

index f8d2748a77e0fe9d7c072ea0496aa10897d5b398..ebca82b82887d8b12cc057fee7a5cdf26d045795 100644 (file)
--- a/src/gtm/recovery/register_common.c
+++ b/src/gtm/recovery/register_common.c
@@ -941,9 +941,9 @@ GTM_InitNodeManager(void)
  }
  
  /* 
- * Set to 120 seconds, but should be a few multiple for cluster monitor naptime
+ * Set to 600 seconds, but should be a few multiples of the cluster monitor naptime
   */ 
-#define GTM_REPORT_XMIN_DELAY_THRESHOLD (120 * 1000)
+#define GTM_REPORT_XMIN_DELAY_THRESHOLD (600 * 1000)
  
  GlobalTransactionId
  GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
@@ -988,20 +988,31 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
          * get opportunity to report xmin in a timely fashion, we shouldn't get
          * into this situation often.
          *
-        * The exception to this rule is that if the remote node is idle, then we
-        * actually ignore the xmin reported by it and instead calculate a new xmin
-        * for it and send it back in respone. The remote node will still done
-        * final sanity check and either accept that xmin or kill itself via PANIC
-        * mechanism.
          */
         if ((mynodeinfo->excluded) &&
-                       GlobalTransactionIdPrecedes(mynodeinfo->reported_xmin,
-                               GTM_GlobalXmin))
+                       GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin))
         {
                 *errcode = GTM_ERRCODE_NODE_EXCLUDED;
+
+               /*
+                * This node is rejoining the cluster after being excluded from the
+                * GTM_GlobalXmin calculation because of timeout, disconnection or node
+                * failure. In such cases, we send an appropriate error back to the node
+                * and let it handle the situation. To ensure that our GTM_GlobalXmin
+                * does not keep advancing while the node is trying to rejoin the
+                * cluster, we temporarily set reported_xmin to the current
+                * GTM_GlobalXmin and wait to see if the node finally catches up.
+                *
+                * Note: If the node had old transactions running while it was excluded
+                * by the GTM, it will fail the consistency checks and restart itself.
+                */
+               mynodeinfo->joining = true;
+               mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent();
+               mynodeinfo->reported_xmin = GTM_GlobalXmin;
+
                 GTM_RWLockRelease(&mynodeinfo->node_lock);
                 elog(LOG, "GTM_ERRCODE_NODE_EXCLUDED - node_name %s, reported_xmin %d "
-                               "previously reported_xmin, GTM_GlobalXmin %d", node_name,
+                               "previously reported_xmin %d, GTM_GlobalXmin %d", node_name,
                                 reported_xmin,
                                 mynodeinfo->reported_xmin,
                                 GTM_GlobalXmin);
@@ -1009,13 +1020,18 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
         }
  
         /*
-        * The remote node must not report a xmin which precedes the xmin it had
-        * reported in the past. If it ever happens, send an error back and let the
-        * remote node restart itself
+        * The remote node must not report a xmin which precedes the GTM_GlobalXmin
+        * we have already computed. If it ever happens, send an error back and let
+        * the remote node handle it, possibly restarting itself
          */
-       if (GlobalTransactionIdPrecedes(reported_xmin, mynodeinfo->reported_xmin))
+       if (GlobalTransactionIdPrecedes(reported_xmin, GTM_GlobalXmin))
         {
                 *errcode = GTM_ERRCODE_TOO_OLD_XMIN;
+
+               mynodeinfo->joining = true;
+               mynodeinfo->reported_xmin_time = GTM_TimestampGetCurrent();
+               mynodeinfo->reported_xmin = GTM_GlobalXmin;
+
                 GTM_RWLockRelease(&mynodeinfo->node_lock);
  
                 /*
@@ -1028,8 +1044,8 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                  */
                 if (mynodeinfo->reported_xmin_time)
                         elog(LOG, "GTM_ERRCODE_TOO_OLD_XMIN - node_name %s, reported_xmin %d, "
-                                       "previously reported_xmin %d", node_name,
-                                       reported_xmin, mynodeinfo->reported_xmin);
+                                       "previously reported_xmin %d, GTM_GlobalXmin %d", node_name,
+                                       reported_xmin, mynodeinfo->reported_xmin, GTM_GlobalXmin);
                 return InvalidGlobalTransactionId;
         }
  
@@ -1038,7 +1054,15 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                         mynodeinfo->reported_xmin);
  
         mynodeinfo->reported_xmin = reported_xmin;
-       mynodeinfo->excluded = false;
+
+       /*
+        * Node joined back, set both excluded and joining to false
+        */
+       if (mynodeinfo->excluded)
+       {
+               mynodeinfo->excluded = false;
+               mynodeinfo->joining = false;
+       }
         mynodeinfo->reported_xmin_time = current_time = GTM_TimestampGetCurrent();
  
         GTM_RWLockRelease(&mynodeinfo->node_lock);
@@ -1060,13 +1084,22 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
         {
                 GTM_PGXCNodeInfo *nodeinfo = all_nodes[ii];
  
-               elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %lld",
+               elog(DEBUG1, "nodeinfo %p, type: %d, exclude %c, xmin %d, time %ld",
                                 nodeinfo, nodeinfo->type, nodeinfo->excluded ? 'T' : 'F',
                                 nodeinfo->reported_xmin, nodeinfo->reported_xmin_time);
  
-               if (nodeinfo->excluded)
+               /*
+                * If a node has not reported its status for
+                * GTM_REPORT_XMIN_DELAY_THRESHOLD and is not in the process of
+                * rejoining the cluster, don't include it in the GTM_GlobalXmin
+                * calculation
+                */
+               if (nodeinfo->excluded && !nodeinfo->joining)
                         continue;
  
+               /*
+                * Care only for datanodes and coordinators
+                */
                 if (nodeinfo->type != GTM_NODE_COORDINATOR && nodeinfo->type !=
                                 GTM_NODE_DATANODE)
                         continue;
@@ -1087,6 +1120,7 @@ GTM_HandleGlobalXmin(GTM_PGXCNodeType type, char *node_name,
                                                 current_time, GTM_REPORT_XMIN_DELAY_THRESHOLD))
                         {
                                 nodeinfo->excluded = true;
+                               nodeinfo->joining = false;
                                 GTM_RWLockRelease(&nodeinfo->node_lock);
                                 continue;
                         }
diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h

index 0212a9ecf834763014c3f3b7162c064d4e06c10d..a50ffe0c218d80a751065b2ff17011144b21c017 100644 (file)
--- a/src/include/gtm/register.h
+++ b/src/include/gtm/register.h
@@ -62,6 +62,7 @@ typedef struct GTM_PGXCNodeInfo
                                                                                          *  Has the node timed out and be
                                                                                          * excluded from xmin computation?
                                                                                          */
+       bool                            joining;        /* Is the node joining back */
         bool                            idle;                           /* Has the node been idle since
                                                                                          * last report
                                                                                          */
author	Pavan Deolasee <[email protected]>
	Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
committer	Pavan Deolasee <[email protected]>
	Thu, 11 Feb 2016 07:58:11 +0000 (13:28 +0530)
src/backend/access/transam/gtm.c		patch \| blob \| blame \| history
src/backend/postmaster/clustermon.c		patch \| blob \| blame \| history
src/backend/storage/ipc/procarray.c		patch \| blob \| blame \| history
src/gtm/main/gtm_txn.c		patch \| blob \| blame \| history
src/gtm/recovery/register_common.c		patch \| blob \| blame \| history
src/include/gtm/register.h		patch \| blob \| blame \| history