From 6f32238f119236322dfb16262a88c3a9c5141413 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 15 Dec 2022 14:11:43 -0800 Subject: [PATCH v1 1/1] handle race condition when restarting wal receivers --- src/backend/postmaster/postmaster.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index eac3450774..9d6f58e0b3 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1601,9 +1601,9 @@ checkControlFile(void) * * In normal conditions we wait at most one minute, to ensure that the other * background tasks handled by ServerLoop get done even when no requests are - * arriving. However, if there are background workers waiting to be started, - * we don't actually sleep so that they are quickly serviced. Other exception - * cases are as shown in the code. + * arriving. However, if there are background workers or a WAL receiver + * waiting to be started, we make sure they are quickly serviced. Other + * exception cases are as shown in the code. */ static void DetermineSleepTime(struct timeval *timeout) @@ -1611,11 +1611,12 @@ DetermineSleepTime(struct timeval *timeout) TimestampTz next_wakeup = 0; /* - * Normal case: either there are no background workers at all, or we're in - * a shutdown sequence (during which we ignore bgworkers altogether). + * Normal case: either there are no background workers and no WAL receiver + * at all, or we're in a shutdown sequence (during which we ignore + * bgworkers and the WAL receiver altogether). */ if (Shutdown > NoShutdown || - (!StartWorkerNeeded && !HaveCrashedWorker)) + (!StartWorkerNeeded && !HaveCrashedWorker && !WalReceiverRequested)) { if (AbortStartTime != 0) { @@ -1674,6 +1675,21 @@ DetermineSleepTime(struct timeval *timeout) } } + /* + * If WalReceiverRequested is set, we're probably waiting on a SIGCHLD to + * arrive to clear WalReceiverPID before starting the new WAL receiver. We + * don't expect that to take long, so limit the sleep to 100ms so that we + * start the new WAL receiver promptly. + */ + if (WalReceiverRequested) + { + TimestampTz walrcv_wakeup; + + walrcv_wakeup = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), 100); + if (next_wakeup == 0 || walrcv_wakeup < next_wakeup) + next_wakeup = walrcv_wakeup; + } + if (next_wakeup != 0) { long secs; -- 2.25.1