From c43e37ccd0778e72ea525f09b2d761d3793ae2f5 Mon Sep 17 00:00:00 2001 From: alterego655 <824662526@qq.com> Date: Thu, 30 Oct 2025 23:56:29 +0800 Subject: [PATCH v2] Fix unconditional walreceiver shutdown during stream-archive transition Commit 3635a0a introduced an unconditional walreceiver shutdown when switching from streaming to archive WAL sources. This causes problems during timeline divergence, when walreceiver enters the WALRCV_WAITING state but remains alive. The unconditional shutdown breaks monitoring: walreceiver gets repeatedly terminated and respawned, causing pg_stat_wal_receiver.status to show 'streaming' instead of 'waiting', masking the underlying replication problem. In the worst case with synchronous replication, this can lead to unwritable clusters when the standby reports false readiness. Fix by making the shutdown conditional: only terminate walreceiver when WalRcvStreaming() reports that it is actively streaming (WALRCV_STREAMING, WALRCV_STARTING, or WALRCV_RESTARTING). When it is in the WALRCV_WAITING state, just reset the InstallXLogFileSegmentActive flag to allow archive restoration without killing the process. --- src/backend/access/transam/xlog.c | 27 ++++++++++++++++++---- src/test/recovery/t/004_timeline_switch.pl | 6 +++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3f8de31b386..ba74254f277 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -12837,8 +12837,17 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * Before we leave XLOG_FROM_STREAM state, make sure that * walreceiver is not active, so that it won't overwrite * WAL that we restore from archive. + * + * If walreceiver is actively streaming (or attempting to + * connect), we must shut it down. However, if it's already + * in WAITING state (e.g., due to timeline divergence), we + * only need to reset the install flag to allow archive + * restoration.
*/ - XLogShutdownWalRcv(); + if (WalRcvStreaming()) + XLogShutdownWalRcv(); + else + ResetInstallXLogFileSegmentActive(); /* * Before we sleep, re-scan for possible new timelines if @@ -13191,15 +13200,25 @@ StartupRequestWalReceiverRestart(void) } } +/* + * Reset the InstallXLogFileSegmentActive flag without shutting down + * walreceiver. + */ +void +ResetInstallXLogFileSegmentActive(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + /* Thin wrapper around ShutdownWalRcv(). */ static void XLogShutdownWalRcv(void) { ShutdownWalRcv(); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - XLogCtl->InstallXLogFileSegmentActive = false; - LWLockRelease(ControlFileLock); + ResetInstallXLogFileSegmentActive(); } /* diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl index c101980e9e2..a813caa77e0 100644 --- a/src/test/recovery/t/004_timeline_switch.pl +++ b/src/test/recovery/t/004_timeline_switch.pl @@ -71,6 +71,12 @@ my $result = $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int"); is($result, qq(2000), 'check content of standby 2'); +# Check the logs, WAL receiver should not have been stopped. There is no need +# to rely on a position in the logs: a new log file is used on node restart. +ok( !$node_standby_2->log_contains( + "FATAL: .* terminating walreceiver process due to administrator command"), + 'WAL receiver should not be stopped across timeline jumps'); + # Ensure that a standby is able to follow a primary on a newer timeline # when WAL archiving is enabled. -- 2.51.0