From: Pavan Deolasee <pavan.deolasee@gmail.com>
Date: Thu, 6 Sep 2018 08:12:48 +0000 (+0530)
Subject: Fix problems associated with globalXmin tracking by ClusterMonitor
X-Git-Tag: XL_10_R1BETA1~19
X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=da509766fce98e7121a7a399c1ba468fbb3a0d25;p=postgres-xl.git

Fix problems associated with globalXmin tracking by ClusterMonitor

The very first report by the cluster monitor may be discarded by the GTM if the
reporting xmin has fallen far behind GTM's view. This leads to the globalXmin
value remaining Invalid in the shared memory state, as tracked by the
ClusterMonitor. ClusterMonitor process usually naps for CLUSTER_MONITOR_NAPTIME
(default 5s) between two successive reporting. But discard that during the
bootup process and report the xmin a bit more aggressively. This should in all
likelihood set the globalXmin correctly, before the regular backends start
processing.

The other major problem with the current code was that when the globalXmin
tracked in the shared memory state is Invalid, the callers were using
FirstNormalXid as the globalXmin. This could be disastrous especially when XID
counter has wrapped around. We could accidentally remove visible rows by using
a wrong value of globalXmin. We now fix that by computing the globalXmin using
the local state (just like we would have computed globalXmin in vanilla PG).
This should ensure that we never use a wrong or a newer value for globalXmin
than what is allowed.

Accept regression diff in txid test case resulting from the fix. The new
expected output actually matches with what upstream produces.

Per report by Hengbing and investigations/fix by me.
---

diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c
index d5f5df334b..e15a0d788b 100644
--- a/src/backend/postmaster/clustermon.c
+++ b/src/backend/postmaster/clustermon.c
@@ -79,7 +79,9 @@ ClusterMonitorInit(void)
 	GlobalTransactionId newOldestXmin;
 	GlobalTransactionId lastGlobalXmin;
 	GlobalTransactionId latestCompletedXid;
-	int status;
+	int					status;
+	bool				bootingUp = true;
+	int					aggreesiveReportingCount = 0;
 
 	am_clustermon = true;
 
@@ -198,21 +200,33 @@ ClusterMonitorInit(void)
 		int			rc;
 
 		/*
-		 * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+		 * While booting up, aggressively try to report Xmin and fetch global
+		 * state from the GTM. This allows up to be set the shared memory state
+		 * before regular processing starts up. While there is no guarantee
+		 * that the regular backends won't start before we get chance to setup
+		 * the shared memory state, being aggressive reduces that window.
 		 */
-		nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
-		nap.tv_usec = 0;
-
-		/*
-		 * Wait until naptime expires or we get some type of signal (all the
-		 * signal handlers will wake us by calling SetLatch).
-		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-					   (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
-					   WAIT_EVENT_CLUSTER_MONITOR_MAIN);
-
-		ResetLatch(MyLatch);
+		if (!bootingUp)
+		{
+			/*
+			 * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+			 */
+			nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
+			nap.tv_usec = 0;
+
+			/*
+			 * Wait until naptime expires or we get some type of signal (all the
+			 * signal handlers will wake us by calling SetLatch).
+			 */
+			rc = WaitLatch(MyLatch,
+						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+						   (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
+						   WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+
+			ResetLatch(MyLatch);
+		}
+		else if (aggreesiveReportingCount++ > 5)
+			bootingUp = false;
 
 		/* Process sinval catchup interrupts that happened while sleeping */
 		ProcessCatchupInterrupt();
@@ -238,7 +252,7 @@ ClusterMonitorInit(void)
 		 * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
 		 * interval. Keep doing this forever
 		 */
-		lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+		lastGlobalXmin = ClusterMonitorGetGlobalXmin(true);
  		LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
 		oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
 		ClusterMonitorSetReportingGlobalXmin(oldestXmin);
@@ -374,15 +388,34 @@ ClusterMonitorShmemInit(void)
 	}
 }
 
+/*
+ * Get GlobalXmin from the shared memory state. If invalid_ok is true, then the
+ * caller is ready to accept an InvalidGlobalTransactionId if the value is not
+ * yet set in the shared memory. This can typically only happen when the
+ * ClusterMonitor process is starting up and hasn't yet got chance to report
+ * local state and fetch global state. Or this can happen when the server is
+ * boot-strapping and not using GTM for XID management (initdb).
+ *
+ * If invalid_ok is false and shared memory state is not yet set, then just
+ * compute the GlobalXmin using regular means and return that.
+ */
 GlobalTransactionId
-ClusterMonitorGetGlobalXmin(void)
+ClusterMonitorGetGlobalXmin(bool invalid_ok)
 {
-	GlobalTransactionId xmin;
+	GlobalTransactionId xmin = InvalidGlobalTransactionId;
+	int					retries = 0;
 
 	SpinLockAcquire(&ClusterMonitorCtl->mutex);
 	xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
 	SpinLockRelease(&ClusterMonitorCtl->mutex);
 
+	/*
+	 * If caller can't accept invalid value, then compute local GlobalXmin and
+	 * return that.
+	 */
+	if (!GlobalTransactionIdIsValid(xmin) && !invalid_ok)
+		xmin = GetOldestXminInternal(NULL, 0, true, InvalidTransactionId);
+
 	return xmin;
 }
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 125398f320..390cad612c 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1432,9 +1432,8 @@ GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
 #ifdef XCP
 	if (!computeLocal)
 	{
-		xmin = (TransactionId) ClusterMonitorGetGlobalXmin();
-		if (!TransactionIdIsValid(xmin))
-			xmin = FirstNormalTransactionId;
+		xmin = (TransactionId) ClusterMonitorGetGlobalXmin(false);
+		Assert(TransactionIdIsValid(xmin));
 		return xmin;
 	}
 #endif
@@ -1909,7 +1908,7 @@ GetSnapshotData(Snapshot snapshot, bool latest)
 		globalxmin = xmin;
 
 #ifdef XCP
-	clustermon_xmin = ClusterMonitorGetGlobalXmin();
+	clustermon_xmin = ClusterMonitorGetGlobalXmin(false);
 	if (TransactionIdPrecedes(clustermon_xmin, globalxmin))
 		globalxmin = clustermon_xmin;
 #endif
@@ -3500,9 +3499,8 @@ retry:
 		 * Set RecentGlobalXmin by copying from the shared memory state
 		 * maintained by the Clutser Monitor
 		 */
-		RecentGlobalXmin = ClusterMonitorGetGlobalXmin();
-		if (!TransactionIdIsValid(RecentGlobalXmin))
-			RecentGlobalXmin = FirstNormalTransactionId;
+		RecentGlobalXmin = ClusterMonitorGetGlobalXmin(false);
+		Assert(TransactionIdIsValid(RecentGlobalXmin));
 		/*
 		 * XXX Is it ok to set RecentGlobalDataXmin same as RecentGlobalXmin ?
 		 */
@@ -3601,9 +3599,8 @@ GetSnapshotFromGlobalSnapshot(Snapshot snapshot)
 		 * and rejoin the cluster, but if at all it sends a snapshot to us, we
 		 * should protect ourselves from using it
 		 */
-		global_xmin = ClusterMonitorGetGlobalXmin();
-		if (!TransactionIdIsValid(global_xmin))
-			global_xmin = FirstNormalTransactionId;
+		global_xmin = ClusterMonitorGetGlobalXmin(false);
+		Assert(TransactionIdIsValid(global_xmin));
 
 		if (TransactionIdPrecedes(globalSnapshot.gxmin, global_xmin))
 			elog(ERROR, "Snapshot too old - RecentGlobalXmin (%d) has already "
diff --git a/src/include/postmaster/clustermon.h b/src/include/postmaster/clustermon.h
index 8967ee5510..8ec1281a74 100644
--- a/src/include/postmaster/clustermon.h
+++ b/src/include/postmaster/clustermon.h
@@ -36,7 +36,7 @@ extern bool IsClusterMonitorProcess(void);
 /* Functions to start cluster monitor process, called from postmaster */
 int ClusterMonitorInit(void);
 extern int	StartClusterMonitor(void);
-extern GlobalTransactionId ClusterMonitorGetGlobalXmin(void);
+extern GlobalTransactionId ClusterMonitorGetGlobalXmin(bool invalid_ok);
 extern void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin);
 extern GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void);
 
diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out
index d98eecd5e2..5f8dc40253 100644
--- a/src/test/regress/expected/txid.out
+++ b/src/test/regress/expected/txid.out
@@ -293,14 +293,10 @@ SELECT txid_status(2); -- FrozenTransactionId is always committed
  committed
 (1 row)
 
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
  txid_status 
 -------------
- committed
+ 
 (1 row)
 
 COMMIT;
diff --git a/src/test/regress/sql/txid.sql b/src/test/regress/sql/txid.sql
index c5ecaeb663..420c677ab2 100644
--- a/src/test/regress/sql/txid.sql
+++ b/src/test/regress/sql/txid.sql
@@ -77,11 +77,7 @@ SELECT txid_status(:rolledback) AS rolledback;
 SELECT txid_status(:inprogress) AS inprogress;
 SELECT txid_status(1); -- BootstrapTransactionId is always committed
 SELECT txid_status(2); -- FrozenTransactionId is always committed
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
 
 COMMIT;