From: Pavan Deolasee Date: Thu, 6 Sep 2018 08:12:48 +0000 (+0530) Subject: Fix problems associated with globalXmin tracking by ClusterMonitor X-Git-Tag: XL_10_R1BETA1~19 X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=da509766fce98e7121a7a399c1ba468fbb3a0d25;p=postgres-xl.git Fix problems associated with globalXmin tracking by ClusterMonitor The very first report by the cluster monitor may be discarded by the GTM if the reported xmin has fallen far behind the GTM's view. This leads to the globalXmin value remaining Invalid in the shared memory state, as tracked by the ClusterMonitor. The ClusterMonitor process usually naps for CLUSTER_MONITOR_NAPTIME (default 5s) between two successive reports. We now skip that nap during the bootup process and report the xmin a bit more aggressively. This should in all likelihood set the globalXmin correctly before the regular backends start processing. The other major problem with the current code was that when the globalXmin tracked in the shared memory state is Invalid, the callers were using FirstNormalTransactionId as the globalXmin. This could be disastrous, especially when the XID counter has wrapped around. We could accidentally remove visible rows by using a wrong value of globalXmin. We now fix that by computing the globalXmin using the local state (just like we would have computed globalXmin in vanilla PG). This should ensure that we never use a wrong or a newer value for globalXmin than what is allowed. Accept regression diff in txid test case resulting from the fix. The new expected output actually matches what upstream produces. Per report by Hengbing; investigation and fix by me. 
--- diff --git a/src/backend/postmaster/clustermon.c b/src/backend/postmaster/clustermon.c index d5f5df334b..e15a0d788b 100644 --- a/src/backend/postmaster/clustermon.c +++ b/src/backend/postmaster/clustermon.c @@ -79,7 +79,9 @@ ClusterMonitorInit(void) GlobalTransactionId newOldestXmin; GlobalTransactionId lastGlobalXmin; GlobalTransactionId latestCompletedXid; - int status; + int status; + bool bootingUp = true; + int aggreesiveReportingCount = 0; am_clustermon = true; @@ -198,21 +200,33 @@ ClusterMonitorInit(void) int rc; /* - * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval + * While booting up, aggressively try to report Xmin and fetch global + * state from the GTM. This allows up to be set the shared memory state + * before regular processing starts up. While there is no guarantee + * that the regular backends won't start before we get chance to setup + * the shared memory state, being aggressive reduces that window. */ - nap.tv_sec = CLUSTER_MONITOR_NAPTIME; - nap.tv_usec = 0; - - /* - * Wait until naptime expires or we get some type of signal (all the - * signal handlers will wake us by calling SetLatch). - */ - rc = WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L), - WAIT_EVENT_CLUSTER_MONITOR_MAIN); - - ResetLatch(MyLatch); + if (!bootingUp) + { + /* + * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval + */ + nap.tv_sec = CLUSTER_MONITOR_NAPTIME; + nap.tv_usec = 0; + + /* + * Wait until naptime expires or we get some type of signal (all the + * signal handlers will wake us by calling SetLatch). 
+ */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L), + WAIT_EVENT_CLUSTER_MONITOR_MAIN); + + ResetLatch(MyLatch); + } + else if (aggreesiveReportingCount++ > 5) + bootingUp = false; /* Process sinval catchup interrupts that happened while sleeping */ ProcessCatchupInterrupt(); @@ -238,7 +252,7 @@ ClusterMonitorInit(void) * Compute RecentGlobalXmin, report it to the GTM and sleep for the set * interval. Keep doing this forever */ - lastGlobalXmin = ClusterMonitorGetGlobalXmin(); + lastGlobalXmin = ClusterMonitorGetGlobalXmin(true); LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE); oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin); ClusterMonitorSetReportingGlobalXmin(oldestXmin); @@ -374,15 +388,34 @@ ClusterMonitorShmemInit(void) } } +/* + * Get GlobalXmin from the shared memory state. If invalid_ok is true, then the + * caller is ready to accept an InvalidGlobalTransactionId if the value is not + * yet set in the shared memory. This can typically only happen when the + * ClusterMonitor process is starting up and hasn't yet got chance to report + * local state and fetch global state. Or this can happen when the server is + * boot-strapping and not using GTM for XID management (initdb). + * + * If invalid_ok is false and shared memory state is not yet set, then just + * compute the GlobalXmin using regular means and return that. + */ GlobalTransactionId -ClusterMonitorGetGlobalXmin(void) +ClusterMonitorGetGlobalXmin(bool invalid_ok) { - GlobalTransactionId xmin; + GlobalTransactionId xmin = InvalidGlobalTransactionId; + int retries = 0; SpinLockAcquire(&ClusterMonitorCtl->mutex); xmin = ClusterMonitorCtl->gtm_recent_global_xmin; SpinLockRelease(&ClusterMonitorCtl->mutex); + /* + * If caller can't accept invalid value, then compute local GlobalXmin and + * return that. 
+ */ + if (!GlobalTransactionIdIsValid(xmin) && !invalid_ok) + xmin = GetOldestXminInternal(NULL, 0, true, InvalidTransactionId); + return xmin; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 125398f320..390cad612c 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1432,9 +1432,8 @@ GetOldestXminInternal(Relation rel, int flags, bool computeLocal, #ifdef XCP if (!computeLocal) { - xmin = (TransactionId) ClusterMonitorGetGlobalXmin(); - if (!TransactionIdIsValid(xmin)) - xmin = FirstNormalTransactionId; + xmin = (TransactionId) ClusterMonitorGetGlobalXmin(false); + Assert(TransactionIdIsValid(xmin)); return xmin; } #endif @@ -1909,7 +1908,7 @@ GetSnapshotData(Snapshot snapshot, bool latest) globalxmin = xmin; #ifdef XCP - clustermon_xmin = ClusterMonitorGetGlobalXmin(); + clustermon_xmin = ClusterMonitorGetGlobalXmin(false); if (TransactionIdPrecedes(clustermon_xmin, globalxmin)) globalxmin = clustermon_xmin; #endif @@ -3500,9 +3499,8 @@ retry: * Set RecentGlobalXmin by copying from the shared memory state * maintained by the Clutser Monitor */ - RecentGlobalXmin = ClusterMonitorGetGlobalXmin(); - if (!TransactionIdIsValid(RecentGlobalXmin)) - RecentGlobalXmin = FirstNormalTransactionId; + RecentGlobalXmin = ClusterMonitorGetGlobalXmin(false); + Assert(TransactionIdIsValid(RecentGlobalXmin)); /* * XXX Is it ok to set RecentGlobalDataXmin same as RecentGlobalXmin ? 
*/ @@ -3601,9 +3599,8 @@ GetSnapshotFromGlobalSnapshot(Snapshot snapshot) * and rejoin the cluster, but if at all it sends a snapshot to us, we * should protect ourselves from using it */ - global_xmin = ClusterMonitorGetGlobalXmin(); - if (!TransactionIdIsValid(global_xmin)) - global_xmin = FirstNormalTransactionId; + global_xmin = ClusterMonitorGetGlobalXmin(false); + Assert(TransactionIdIsValid(global_xmin)); if (TransactionIdPrecedes(globalSnapshot.gxmin, global_xmin)) elog(ERROR, "Snapshot too old - RecentGlobalXmin (%d) has already " diff --git a/src/include/postmaster/clustermon.h b/src/include/postmaster/clustermon.h index 8967ee5510..8ec1281a74 100644 --- a/src/include/postmaster/clustermon.h +++ b/src/include/postmaster/clustermon.h @@ -36,7 +36,7 @@ extern bool IsClusterMonitorProcess(void); /* Functions to start cluster monitor process, called from postmaster */ int ClusterMonitorInit(void); extern int StartClusterMonitor(void); -extern GlobalTransactionId ClusterMonitorGetGlobalXmin(void); +extern GlobalTransactionId ClusterMonitorGetGlobalXmin(bool invalid_ok); extern void ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin); extern GlobalTransactionId ClusterMonitorGetReportingGlobalXmin(void); diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out index d98eecd5e2..5f8dc40253 100644 --- a/src/test/regress/expected/txid.out +++ b/src/test/regress/expected/txid.out @@ -293,14 +293,10 @@ SELECT txid_status(2); -- FrozenTransactionId is always committed committed (1 row) --- in regress testing FirstNormalTransactionId will always be behind oldestXmin --- XXX in XL, the oldestXmin is advanced lazily and depends on the global --- state. 
So the clog for FirstNormalTransactionId may very well exist and --- txid_status gives us a correct answer -SELECT txid_status(3); +SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin txid_status ------------- - committed + (1 row) COMMIT; diff --git a/src/test/regress/sql/txid.sql b/src/test/regress/sql/txid.sql index c5ecaeb663..420c677ab2 100644 --- a/src/test/regress/sql/txid.sql +++ b/src/test/regress/sql/txid.sql @@ -77,11 +77,7 @@ SELECT txid_status(:rolledback) AS rolledback; SELECT txid_status(:inprogress) AS inprogress; SELECT txid_status(1); -- BootstrapTransactionId is always committed SELECT txid_status(2); -- FrozenTransactionId is always committed --- in regress testing FirstNormalTransactionId will always be behind oldestXmin --- XXX in XL, the oldestXmin is advanced lazily and depends on the global --- state. So the clog for FirstNormalTransactionId may very well exist and --- txid_status gives us a correct answer -SELECT txid_status(3); +SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin COMMIT;