From: Amit Kapila Date: Thu, 11 Dec 2025 03:49:28 +0000 (+0000) Subject: Enhance slot synchronization API to respect promotion signal. X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=1362bc33e025fd2848ff38558f5672e2f0f0c7de;p=users%2Frhaas%2Fpostgres.git Enhance slot synchronization API to respect promotion signal. Previously, during a promotion, only the slot synchronization worker was signaled to shut down. The backend executing slot synchronization via the pg_sync_replication_slots() SQL function was not signaled, allowing it to complete its synchronization cycle before exiting. An upcoming patch improves pg_sync_replication_slots() to wait until replication slots are fully persisted before finishing. This behaviour requires the backend to exit promptly if a promotion occurs. This patch ensures that, during promotion, a signal is also sent to the backend running pg_sync_replication_slots(), allowing it to be interrupted and exit immediately. Author: Ajin Cherian Reviewed-by: Shveta Malik Reviewed-by: Chao Li Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/CAFPTHDZAA%2BgWDntpa5ucqKKba41%3DtXmoXqN3q4rpjO9cdxgQrw%40mail.gmail.com --- diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 9f92c21237..873aa003ee 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -71,11 +71,14 @@ /* * Struct for sharing information to control slot synchronization. * - * The slot sync worker's pid is needed by the startup process to shut it - * down during promotion. The startup process shuts down the slot sync worker - * and also sets stopSignaled=true to handle the race condition when the + * The 'pid' is either the slot sync worker's pid or the backend's pid running + * the SQL function pg_sync_replication_slots(). 
When the startup process sets + * 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently + * synchronizing process so that the process can immediately stop its + * synchronizing work on seeing 'stopSignaled' set. + * Setting 'stopSignaled' is also used to handle the race condition when the * postmaster has not noticed the promotion yet and thus may end up restarting - * the slot sync worker. If stopSignaled is set, the worker will exit in such a + * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a * case. The SQL function pg_sync_replication_slots() will also error out if * this flag is set. Note that we don't need to reset this variable as after * promotion the slot sync worker won't be restarted because the pmState @@ -1195,10 +1198,10 @@ ValidateSlotSyncParams(int elevel) } /* - * Re-read the config file. + * Re-read the config file for slot synchronization. * - * Exit if any of the slot sync GUCs have changed. The postmaster will - * restart it. 
+ * Exit or throw error if relevant GUCs have changed depending on whether + * called from slot sync worker or from the SQL function pg_sync_replication_slots() */ static void slotsync_reread_config(void) @@ -1209,8 +1212,11 @@ slotsync_reread_config(void) bool old_hot_standby_feedback = hot_standby_feedback; bool conninfo_changed; bool primary_slotname_changed; + bool is_slotsync_worker = AmLogicalSlotSyncWorkerProcess(); + bool parameter_changed = false; - Assert(sync_replication_slots); + if (is_slotsync_worker) + Assert(sync_replication_slots); ConfigReloadPending = false; ProcessConfigFile(PGC_SIGHUP); @@ -1222,32 +1228,60 @@ slotsync_reread_config(void) if (old_sync_replication_slots != sync_replication_slots) { - ereport(LOG, - /* translator: %s is a GUC variable name */ - errmsg("replication slot synchronization worker will shut down because \"%s\" is disabled", "sync_replication_slots")); - proc_exit(0); - } + if (is_slotsync_worker) + { + ereport(LOG, + /* translator: %s is a GUC variable name */ + errmsg("replication slot synchronization worker will stop because \"%s\" is disabled", + "sync_replication_slots")); + + proc_exit(0); + } - if (conninfo_changed || - primary_slotname_changed || - (old_hot_standby_feedback != hot_standby_feedback)) + parameter_changed = true; + } + else { - ereport(LOG, - errmsg("replication slot synchronization worker will restart because of a parameter change")); + if (conninfo_changed || + primary_slotname_changed || + (old_hot_standby_feedback != hot_standby_feedback)) + { - /* - * Reset the last-start time for this worker so that the postmaster - * can restart it without waiting for SLOTSYNC_RESTART_INTERVAL_SEC. 
- */ - SlotSyncCtx->last_start_time = 0; + if (is_slotsync_worker) + { + ereport(LOG, + errmsg("replication slot synchronization worker will restart because of a parameter change")); - proc_exit(0); + /* + * Reset the last-start time for this worker so that the + * postmaster can restart it without waiting for + * SLOTSYNC_RESTART_INTERVAL_SEC. + */ + SlotSyncCtx->last_start_time = 0; + + proc_exit(0); + } + + parameter_changed = true; + } + } + + /* + * If we have reached here with a parameter change, we must be running in + * SQL function, emit error in such a case. + */ + if (parameter_changed) + { + Assert(!is_slotsync_worker); + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("replication slot synchronization will stop because of a parameter change")); } } /* - * Interrupt handler for main loop of slot sync worker. + * Interrupt handler for process performing slot synchronization. */ static void ProcessSlotSyncInterrupts(void) @@ -1256,10 +1290,23 @@ ProcessSlotSyncInterrupts(void) if (SlotSyncCtx->stopSignaled) { - ereport(LOG, - errmsg("replication slot synchronization worker is shutting down because promotion is triggered")); + if (AmLogicalSlotSyncWorkerProcess()) + { + ereport(LOG, + errmsg("replication slot synchronization worker will stop because promotion is triggered")); - proc_exit(0); + proc_exit(0); + } + else + { + /* + * For the backend executing SQL function + * pg_sync_replication_slots(). + */ + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot synchronization will stop because promotion is triggered")); + } } if (ConfigReloadPending) @@ -1362,29 +1409,14 @@ wait_for_slot_activity(bool some_slot_updated) } /* - * Emit an error if a promotion or a concurrent sync call is in progress. + * Emit an error if a concurrent sync call is in progress. * Otherwise, advertise that a sync is in progress. 
*/ static void -check_and_set_sync_info(pid_t worker_pid) +check_and_set_sync_info(pid_t sync_process_pid) { SpinLockAcquire(&SlotSyncCtx->mutex); - /* The worker pid must not be already assigned in SlotSyncCtx */ - Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid); - - /* - * Emit an error if startup process signaled the slot sync machinery to - * stop. See comments atop SlotSyncCtxStruct. - */ - if (SlotSyncCtx->stopSignaled) - { - SpinLockRelease(&SlotSyncCtx->mutex); - ereport(ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot synchronize replication slots when standby promotion is ongoing")); - } - if (SlotSyncCtx->syncing) { SpinLockRelease(&SlotSyncCtx->mutex); @@ -1393,13 +1425,16 @@ check_and_set_sync_info(pid_t worker_pid) errmsg("cannot synchronize replication slots concurrently")); } + /* The pid must not be already assigned in SlotSyncCtx */ + Assert(SlotSyncCtx->pid == InvalidPid); + SlotSyncCtx->syncing = true; /* * Advertise the required PID so that the startup process can kill the - * slot sync worker on promotion. + * slot sync process on promotion. */ - SlotSyncCtx->pid = worker_pid; + SlotSyncCtx->pid = sync_process_pid; SpinLockRelease(&SlotSyncCtx->mutex); @@ -1414,6 +1449,7 @@ reset_syncing_flag(void) { SpinLockAcquire(&SlotSyncCtx->mutex); SlotSyncCtx->syncing = false; + SlotSyncCtx->pid = InvalidPid; SpinLockRelease(&SlotSyncCtx->mutex); syncing_slots = false; @@ -1622,7 +1658,7 @@ update_synced_slots_inactive_since(void) if (!StandbyMode) return; - /* The slot sync worker or SQL function mustn't be running by now */ + /* The slot sync worker or the SQL function mustn't be running by now */ Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing); LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); @@ -1651,16 +1687,18 @@ update_synced_slots_inactive_since(void) } /* - * Shut down the slot sync worker. + * Shut down slot synchronization. 
* - * This function sends signal to shutdown slot sync worker, if required. It - * also waits till the slot sync worker has exited or + * This function sets stopSignaled=true and wakes up the slot sync process + * (either worker or backend running the SQL function pg_sync_replication_slots()) + * so that worker can exit or the SQL function pg_sync_replication_slots() can + * finish. It also waits till the slot sync worker has exited or * pg_sync_replication_slots() has finished. */ void ShutDownSlotSync(void) { - pid_t worker_pid; + pid_t sync_process_pid; SpinLockAcquire(&SlotSyncCtx->mutex); @@ -1677,16 +1715,16 @@ ShutDownSlotSync(void) return; } - worker_pid = SlotSyncCtx->pid; + sync_process_pid = SlotSyncCtx->pid; SpinLockRelease(&SlotSyncCtx->mutex); /* - * Signal slotsync worker if it was still running. The worker will stop - * upon detecting that the stopSignaled flag is set to true. + * Signal process doing slotsync, if any. The process will stop upon + * detecting that the stopSignaled flag is set to true. */ - if (worker_pid != InvalidPid) - kill(worker_pid, SIGUSR1); + if (sync_process_pid != InvalidPid) + kill(sync_process_pid, SIGUSR1); /* Wait for slot sync to end */ for (;;) @@ -1835,7 +1873,10 @@ SyncReplicationSlots(WalReceiverConn *wrconn) { PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn)); { - check_and_set_sync_info(InvalidPid); + check_and_set_sync_info(MyProcPid); + + /* Check for interrupts and config changes */ + ProcessSlotSyncInterrupts(); validate_remote_info(wrconn);