Fix problems associated with globalXmin tracking by ClusterMonitor

author Pavan Deolasee <[email protected]>

Thu, 6 Sep 2018 08:12:48 +0000 (13:42 +0530)

committer Pavan Deolasee <[email protected]>

Thu, 6 Sep 2018 08:31:27 +0000 (14:01 +0530)
author Pavan Deolasee <[email protected]>
Thu, 6 Sep 2018 08:12:48 +0000 (13:42 +0530)
committer Pavan Deolasee <[email protected]>
Thu, 6 Sep 2018 08:31:27 +0000 (14:01 +0530)
diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c

index d5f5df334b1bc391085b5987800f975d84a042b3..e15a0d788b48f43f70055922f22e73052a256a4c 100644 (file)
--- a/src/backend/postmaster/clustermon.c
+++ b/src/backend/postmaster/clustermon.c
@@ -79,7 +79,9 @@ ClusterMonitorInit(void)
         GlobalTransactionId newOldestXmin;
         GlobalTransactionId lastGlobalXmin;
         GlobalTransactionId latestCompletedXid;
-       int status;
+       int                                     status;
+       bool                            bootingUp = true;
+       int                                     aggreesiveReportingCount = 0;
  
         am_clustermon = true;
  
@@ -198,21 +200,33 @@ ClusterMonitorInit(void)
                 int                     rc;
  
                 /*
-                * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+                * While booting up, aggressively try to report Xmin and fetch global
+                * state from the GTM. This allows us to set the shared memory state
+                * before regular processing starts up. While there is no guarantee
+                * that the regular backends won't start before we get a chance to set up
+                * the shared memory state, being aggressive reduces that window.
                  */
-               nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
-               nap.tv_usec = 0;
-
-               /*
-                * Wait until naptime expires or we get some type of signal (all the
-                * signal handlers will wake us by calling SetLatch).
-                */
-               rc = WaitLatch(MyLatch,
-                                          WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-                                          (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
-                                          WAIT_EVENT_CLUSTER_MONITOR_MAIN);
-
-               ResetLatch(MyLatch);
+               if (!bootingUp)
+               {
+                       /*
+                        * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+                        */
+                       nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
+                       nap.tv_usec = 0;
+
+                       /*
+                        * Wait until naptime expires or we get some type of signal (all the
+                        * signal handlers will wake us by calling SetLatch).
+                        */
+                       rc = WaitLatch(MyLatch,
+                                                  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                                  (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
+                                                  WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+
+                       ResetLatch(MyLatch);
+               }
+               else if (aggreesiveReportingCount++ > 5)
+                       bootingUp = false;
  
                 /* Process sinval catchup interrupts that happened while sleeping */
                 ProcessCatchupInterrupt();
@@ -238,7 +252,7 @@ ClusterMonitorInit(void)
                  * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
                  * interval. Keep doing this forever
                  */
-               lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+               lastGlobalXmin = ClusterMonitorGetGlobalXmin(true);
                 LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
                 oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
                 ClusterMonitorSetReportingGlobalXmin(oldestXmin);
@@ -374,15 +388,34 @@ ClusterMonitorShmemInit(void)
         }
  }
  
+/*
+ * Get GlobalXmin from the shared memory state. If invalid_ok is true, then the
+ * caller is ready to accept an InvalidGlobalTransactionId if the value is not
+ * yet set in the shared memory. This can typically only happen when the
+ * ClusterMonitor process is starting up and hasn't yet had a chance to report
+ * local state and fetch global state. It can also happen when the server is
+ * bootstrapping and not using GTM for XID management (initdb).
+ *
+ * If invalid_ok is false and shared memory state is not yet set, then just
+ * compute the GlobalXmin using regular means and return that.
+ */
  GlobalTransactionId
-ClusterMonitorGetGlobalXmin(void)
+ClusterMonitorGetGlobalXmin(bool invalid_ok)
  {
-       GlobalTransactionId xmin;
+       GlobalTransactionId xmin = InvalidGlobalTransactionId;
+       int                                     retries = 0;
  
         SpinLockAcquire(&ClusterMonitorCtl->mutex);
         xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
         SpinLockRelease(&ClusterMonitorCtl->mutex);
  
+       /*
+        * If caller can't accept invalid value, then compute local GlobalXmin and
+        * return that.
+        */
+       if (!GlobalTransactionIdIsValid(xmin) && !invalid_ok)
+               xmin = GetOldestXminInternal(NULL, 0, true, InvalidTransactionId);
+
         return xmin;
  }
  
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c

index 125398f3206607ba223ff79bd9fdc3efe04a1002..390cad612cd05844234b5dabbedaea73baa9ceae 100644 (file)
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1432,9 +1432,8 @@ GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
  #ifdef XCP
         if (!computeLocal)
         {
-               xmin = (TransactionId) ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(xmin))
-                       xmin = FirstNormalTransactionId;
+               xmin = (TransactionId) ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(xmin));
                 return xmin;
         }
  #endif
@@ -1909,7 +1908,7 @@ GetSnapshotData(Snapshot snapshot, bool latest)
                 globalxmin = xmin;
  
  #ifdef XCP
-       clustermon_xmin = ClusterMonitorGetGlobalXmin();
+       clustermon_xmin = ClusterMonitorGetGlobalXmin(false);
         if (TransactionIdPrecedes(clustermon_xmin, globalxmin))
                 globalxmin = clustermon_xmin;
  #endif
@@ -3500,9 +3499,8 @@ retry:
                  * Set RecentGlobalXmin by copying from the shared memory state
                  * maintained by the Clutser Monitor
                  */
-               RecentGlobalXmin = ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(RecentGlobalXmin))
-                       RecentGlobalXmin = FirstNormalTransactionId;
+               RecentGlobalXmin = ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(RecentGlobalXmin));
                 /*
                  * XXX Is it ok to set RecentGlobalDataXmin same as RecentGlobalXmin ?
                  */
@@ -3601,9 +3599,8 @@ GetSnapshotFromGlobalSnapshot(Snapshot snapshot)
                  * and rejoin the cluster, but if at all it sends a snapshot to us, we
                  * should protect ourselves from using it
                  */
-               global_xmin = ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(global_xmin))
-                       global_xmin = FirstNormalTransactionId;
+               global_xmin = ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(global_xmin));
  
                 if (TransactionIdPrecedes(globalSnapshot.gxmin, global_xmin))
                         elog(ERROR, "Snapshot too old - RecentGlobalXmin (%d) has already "
diff --git a/src/include/postmaster/clustermon.h b/src/include/postmaster/clustermon.h

index 8967ee5510168a8d8cd198d812334fb44f689f3b..8ec1281a743936cdaf44ea7a16d051335fe70b6a 100644 (file)
--- a/src/include/postmaster/clustermon.h
+++ b/src/include/postmaster/clustermon.h
@@ -36,7 +36,7 @@ extern bool IsClusterMonitorProcess(void);
  /* Functions to start cluster monitor process, called from postmaster */
  int ClusterMonitorInit(void);
  extern int     StartClusterMonitor(void);
-extern GlobalTransactionId ClusterMonitorGetGlobalXmin(void);
+extern GlobalTransactionId ClusterMonitorGetGlobalXmin(bool invalid_ok);
  extern void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin);
  extern GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void);
  
diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out

index d98eecd5e28da7e48fbd3acf463f205e29afa858..5f8dc40253dd3b12c1465188439f9a871d185102 100644 (file)
--- a/src/test/regress/expected/txid.out
+++ b/src/test/regress/expected/txid.out
@@ -293,14 +293,10 @@ SELECT txid_status(2); -- FrozenTransactionId is always committed
   committed
  (1 row)
  
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
   txid_status 
  -------------
- committed
+ 
  (1 row)
  
  COMMIT;
diff --git a/src/test/regress/sql/txid.sql b/src/test/regress/sql/txid.sql

index c5ecaeb663888a1dd3214d8fea40e3a2c4a5f6bb..420c677ab29122a3500710a1c6694722b5b8ba98 100644 (file)
--- a/src/test/regress/sql/txid.sql
+++ b/src/test/regress/sql/txid.sql
@@ -77,11 +77,7 @@ SELECT txid_status(:rolledback) AS rolledback;
  SELECT txid_status(:inprogress) AS inprogress;
  SELECT txid_status(1); -- BootstrapTransactionId is always committed
  SELECT txid_status(2); -- FrozenTransactionId is always committed
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
  
  COMMIT;
author	Pavan Deolasee <[email protected]>
	Thu, 6 Sep 2018 08:12:48 +0000 (13:42 +0530)
committer	Pavan Deolasee <[email protected]>
	Thu, 6 Sep 2018 08:31:27 +0000 (14:01 +0530)
src/backend/postmaster/clustermon.c		patch \| blob \| blame \| history
src/backend/storage/ipc/procarray.c		patch \| blob \| blame \| history
src/include/postmaster/clustermon.h		patch \| blob \| blame \| history
src/test/regress/expected/txid.out		patch \| blob \| blame \| history
src/test/regress/sql/txid.sql		patch \| blob \| blame \| history