From 193a1a538f0823321478ad5d8b566c2e3aa8b0a4 Mon Sep 17 00:00:00 2001 From: alterego655 <824662526@qq.com> Date: Thu, 30 Oct 2025 21:45:31 +0800 Subject: [PATCH v2] Fix unconditional walreceiver shutdown during stream-archive transition Commit 3635a0a introduced unconditional walreceiver shutdown when switching from streaming to archive WAL sources. This causes problems during timeline divergence, when walreceiver enters WALRCV_WAITING state but remains alive. The unconditional shutdown breaks monitoring: walreceiver gets repeatedly terminated and respawned, causing pg_stat_wal_receiver.status to show 'streaming' instead of 'waiting', masking the underlying replication problem. In the worst case, with synchronous replication, this can lead to unwritable clusters when the standby reports false readiness. Fix by making the shutdown conditional: only terminate walreceiver when WalRcvStreaming() reports it as active (WALRCV_STREAMING, WALRCV_STARTING, or WALRCV_RESTARTING). Otherwise (e.g. when it is in WALRCV_WAITING state), just reset the InstallXLogFileSegmentActive flag to allow archive restoration without killing the process. --- src/backend/access/transam/xlog.c | 16 +++++++++++++--- src/backend/access/transam/xlogrecovery.c | 11 ++++++++++- src/include/access/xlog.h | 1 + src/test/recovery/t/004_timeline_switch.pl | 5 +++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 37edd30f32d..022c0270e4d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9516,15 +9516,25 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) LWLockRelease(ControlFileLock); } +/* + * Reset the InstallXLogFileSegmentActive flag without shutting down + * walreceiver.
+ */ +void +ResetInstallXLogFileSegmentActive(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + /* Thin wrapper around ShutdownWalRcv(). */ void XLogShutdownWalRcv(void) { ShutdownWalRcv(); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - XLogCtl->InstallXLogFileSegmentActive = false; - LWLockRelease(ControlFileLock); + ResetInstallXLogFileSegmentActive(); } /* Enable WAL file recycling and preallocation. */ diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 262c010fc3c..12d00eab017 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -3686,8 +3686,17 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * Before we leave XLOG_FROM_STREAM state, make sure that * walreceiver is not active, so that it won't overwrite * WAL that we restore from archive. + * + * If walreceiver is actively streaming (or attempting to + * connect), we must shut it down. However, if it's already + * in WAITING state (e.g., due to timeline divergence), we + * only need to reset the install flag to allow archive + * restoration. */ - XLogShutdownWalRcv(); + if (WalRcvStreaming()) + XLogShutdownWalRcv(); + else + ResetInstallXLogFileSegmentActive(); /* * Before we sleep, re-scan for possible new timelines if diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d313099c027..e4208a005b5 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -270,6 +270,7 @@ extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli); extern void SetInstallXLogFileSegmentActive(void); extern bool IsInstallXLogFileSegmentActive(void); extern void XLogShutdownWalRcv(void); +extern void ResetInstallXLogFileSegmentActive(void); /* * Routines to start, stop, and get status of a base backup.
diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl index 9c8334cf278..8baa4f2d5fe 100644 --- a/src/test/recovery/t/004_timeline_switch.pl +++ b/src/test/recovery/t/004_timeline_switch.pl @@ -66,6 +66,11 @@ my $result = $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int"); is($result, qq(2000), 'check content of standby 2'); +# Check the logs: the WAL receiver should not have been stopped. There is no +# need to rely on a log position: a new log file is used on node restart. +ok( !$node_standby_2->log_contains( + "FATAL: .* terminating walreceiver process due to administrator command"), + 'WAL receiver should not be stopped across timeline jumps'); # Ensure that a standby is able to follow a primary on a newer timeline # when WAL archiving is enabled. -- 2.51.0