Fix problems associated with globalXmin tracking by ClusterMonitor
authorPavan Deolasee <[email protected]>
Thu, 6 Sep 2018 08:12:48 +0000 (13:42 +0530)
committerPavan Deolasee <[email protected]>
Thu, 6 Sep 2018 08:31:27 +0000 (14:01 +0530)
The very first report by the cluster monitor may be discarded by the GTM if the
reporting xmin has fallen far behind GTM's view. This leads to the globalXmin
value remaining Invalid in the shared memory state, as tracked by the
ClusterMonitor. ClusterMonitor process usually naps for CLUSTER_MONITOR_NAPTIME
(default 5s) between two successive reporting. But discard that during the
bootup process and report the xmin a bit more aggressively. This should in all
likelihood set the globalXmin correctly, before the regular backends start
processing.

The other major problem with the current code was that when the globalXmin
tracked in the shared memory state is Invalid, the callers were using
FirstNormalXid as the globalXmin. This could be disastrous especially when XID
counter has wrapped around. We could accidentally remove visible rows by using
a wrong value of globalXmin. We now fix that by computing the globalXmin using
the local state (just like we would have computed globalXmin in vanilla PG).
This should ensure that we never use a wrong or a newer value for globalXmin
than what is allowed.

Accept regression diff in txid test case resulting from the fix. The new
expected output actually matches with what upstream produces.

Per report by Hengbing and investigations/fix by me.

src/backend/postmaster/clustermon.c
src/backend/storage/ipc/procarray.c
src/include/postmaster/clustermon.h
src/test/regress/expected/txid.out
src/test/regress/sql/txid.sql

index d5f5df334b1bc391085b5987800f975d84a042b3..e15a0d788b48f43f70055922f22e73052a256a4c 100644 (file)
@@ -79,7 +79,9 @@ ClusterMonitorInit(void)
        GlobalTransactionId newOldestXmin;
        GlobalTransactionId lastGlobalXmin;
        GlobalTransactionId latestCompletedXid;
-       int status;
+       int                                     status;
+       bool                            bootingUp = true;
+       int                                     aggreesiveReportingCount = 0;
 
        am_clustermon = true;
 
@@ -198,21 +200,33 @@ ClusterMonitorInit(void)
                int                     rc;
 
                /*
-                * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+                * While booting up, aggressively try to report Xmin and fetch global
+                * state from the GTM. This allows up to be set the shared memory state
+                * before regular processing starts up. While there is no guarantee
+                * that the regular backends won't start before we get chance to setup
+                * the shared memory state, being aggressive reduces that window.
                 */
-               nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
-               nap.tv_usec = 0;
-
-               /*
-                * Wait until naptime expires or we get some type of signal (all the
-                * signal handlers will wake us by calling SetLatch).
-                */
-               rc = WaitLatch(MyLatch,
-                                          WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-                                          (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
-                                          WAIT_EVENT_CLUSTER_MONITOR_MAIN);
-
-               ResetLatch(MyLatch);
+               if (!bootingUp)
+               {
+                       /*
+                        * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+                        */
+                       nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
+                       nap.tv_usec = 0;
+
+                       /*
+                        * Wait until naptime expires or we get some type of signal (all the
+                        * signal handlers will wake us by calling SetLatch).
+                        */
+                       rc = WaitLatch(MyLatch,
+                                                  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                                  (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
+                                                  WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+
+                       ResetLatch(MyLatch);
+               }
+               else if (aggreesiveReportingCount++ > 5)
+                       bootingUp = false;
 
                /* Process sinval catchup interrupts that happened while sleeping */
                ProcessCatchupInterrupt();
@@ -238,7 +252,7 @@ ClusterMonitorInit(void)
                 * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
                 * interval. Keep doing this forever
                 */
-               lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+               lastGlobalXmin = ClusterMonitorGetGlobalXmin(true);
                LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
                oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
                ClusterMonitorSetReportingGlobalXmin(oldestXmin);
@@ -374,15 +388,34 @@ ClusterMonitorShmemInit(void)
        }
 }
 
+/*
+ * Get GlobalXmin from the shared memory state. If invalid_ok is true, then the
+ * caller is ready to accept an InvalidGlobalTransactionId if the value is not
+ * yet set in the shared memory. This can typically only happen when the
+ * ClusterMonitor process is starting up and hasn't yet got chance to report
+ * local state and fetch global state. Or this can happen when the server is
+ * boot-strapping and not using GTM for XID management (initdb).
+ *
+ * If invalid_ok is false and shared memory state is not yet set, then just
+ * compute the GlobalXmin using regular means and return that.
+ */
 GlobalTransactionId
-ClusterMonitorGetGlobalXmin(void)
+ClusterMonitorGetGlobalXmin(bool invalid_ok)
 {
-       GlobalTransactionId xmin;
+       GlobalTransactionId xmin = InvalidGlobalTransactionId;
+       int                                     retries = 0;
 
        SpinLockAcquire(&ClusterMonitorCtl->mutex);
        xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
        SpinLockRelease(&ClusterMonitorCtl->mutex);
 
+       /*
+        * If caller can't accept invalid value, then compute local GlobalXmin and
+        * return that.
+        */
+       if (!GlobalTransactionIdIsValid(xmin) && !invalid_ok)
+               xmin = GetOldestXminInternal(NULL, 0, true, InvalidTransactionId);
+
        return xmin;
 }
 
index 125398f3206607ba223ff79bd9fdc3efe04a1002..390cad612cd05844234b5dabbedaea73baa9ceae 100644 (file)
@@ -1432,9 +1432,8 @@ GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
 #ifdef XCP
        if (!computeLocal)
        {
-               xmin = (TransactionId) ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(xmin))
-                       xmin = FirstNormalTransactionId;
+               xmin = (TransactionId) ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(xmin));
                return xmin;
        }
 #endif
@@ -1909,7 +1908,7 @@ GetSnapshotData(Snapshot snapshot, bool latest)
                globalxmin = xmin;
 
 #ifdef XCP
-       clustermon_xmin = ClusterMonitorGetGlobalXmin();
+       clustermon_xmin = ClusterMonitorGetGlobalXmin(false);
        if (TransactionIdPrecedes(clustermon_xmin, globalxmin))
                globalxmin = clustermon_xmin;
 #endif
@@ -3500,9 +3499,8 @@ retry:
                 * Set RecentGlobalXmin by copying from the shared memory state
                 * maintained by the Clutser Monitor
                 */
-               RecentGlobalXmin = ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(RecentGlobalXmin))
-                       RecentGlobalXmin = FirstNormalTransactionId;
+               RecentGlobalXmin = ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(RecentGlobalXmin));
                /*
                 * XXX Is it ok to set RecentGlobalDataXmin same as RecentGlobalXmin ?
                 */
@@ -3601,9 +3599,8 @@ GetSnapshotFromGlobalSnapshot(Snapshot snapshot)
                 * and rejoin the cluster, but if at all it sends a snapshot to us, we
                 * should protect ourselves from using it
                 */
-               global_xmin = ClusterMonitorGetGlobalXmin();
-               if (!TransactionIdIsValid(global_xmin))
-                       global_xmin = FirstNormalTransactionId;
+               global_xmin = ClusterMonitorGetGlobalXmin(false);
+               Assert(TransactionIdIsValid(global_xmin));
 
                if (TransactionIdPrecedes(globalSnapshot.gxmin, global_xmin))
                        elog(ERROR, "Snapshot too old - RecentGlobalXmin (%d) has already "
index 8967ee5510168a8d8cd198d812334fb44f689f3b..8ec1281a743936cdaf44ea7a16d051335fe70b6a 100644 (file)
@@ -36,7 +36,7 @@ extern bool IsClusterMonitorProcess(void);
 /* Functions to start cluster monitor process, called from postmaster */
 int ClusterMonitorInit(void);
 extern int     StartClusterMonitor(void);
-extern GlobalTransactionId ClusterMonitorGetGlobalXmin(void);
+extern GlobalTransactionId ClusterMonitorGetGlobalXmin(bool invalid_ok);
 extern void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin);
 extern GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void);
 
index d98eecd5e28da7e48fbd3acf463f205e29afa858..5f8dc40253dd3b12c1465188439f9a871d185102 100644 (file)
@@ -293,14 +293,10 @@ SELECT txid_status(2); -- FrozenTransactionId is always committed
  committed
 (1 row)
 
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
  txid_status 
 -------------
- committed
 (1 row)
 
 COMMIT;
index c5ecaeb663888a1dd3214d8fea40e3a2c4a5f6bb..420c677ab29122a3500710a1c6694722b5b8ba98 100644 (file)
@@ -77,11 +77,7 @@ SELECT txid_status(:rolledback) AS rolledback;
 SELECT txid_status(:inprogress) AS inprogress;
 SELECT txid_status(1); -- BootstrapTransactionId is always committed
 SELECT txid_status(2); -- FrozenTransactionId is always committed
--- in regress testing FirstNormalTransactionId will always be behind oldestXmin
--- XXX in XL, the oldestXmin is advanced lazily and depends on the global
--- state. So the clog for FirstNormalTransactionId may very well exist and
--- txid_status gives us a correct answer
-SELECT txid_status(3);
+SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin
 
 COMMIT;