From 00234c83cedcb684f4b6e12d22608f929f4d3c0b Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Thu, 18 May 2023 14:15:05 +0000 Subject: [PATCH v4] Optimize walsender wake up logic with Conditional Variables WalSndWakeup() currently loops through all the walsenders slots, with a spinlock acquisition and release for every iteration, just to wake up only the waiting walsenders. WalSndWakeup() gets costlier even when there's a single walsender available on the standby (i.e., a cascading replica or a logical decoding client). This wasn't a problem before e101dfac3a53c. But, for allowing logical decoding on standby, we needed to wake up logical walsenders after every WAL record is applied on the standby. This really made WalSndWakeup() costly, causing performance regression. To solve this, we use condition variable (CV) to efficiently wake up walsenders in WalSndWakeup(). Every walsender prepares to sleep on a shared memory CV. Note that it just prepares to sleep on the CV (i.e., adds itself to the CV's waitlist), but does not actually wait on the CV (IOW, it never calls ConditionVariableSleep()). It still uses WaitEventSetWait() for waiting, because CV infrastructure doesn't handle FeBe socket events currently. The processes (startup process, walreceiver etc.) wanting to wake up walsenders use ConditionVariableBroadcast(), which in turn calls SetLatch(), helping walsenders come out of WaitEventSetWait(). This approach is simple and efficient because it makes WalSndWakeup() life easy. A desirable future improvement would be to add support for CVs into WaitEventSetWait(). And, we use separate shared memory CVs for physical and logical walsenders for selective wake ups, see WalSndWakeup() for more details. Reported-by: Andres Freund Suggested-by: Andres Freund Author: Bharath Rupireddy Reviewed-by: Drouvot, Bertrand Reviewed-by: Zhijie Hou Discussion: https://www.postgresql.org/message-id/20230509190247.3rrplhdgem6su6cg%40awork3.anarazel.de --- src/backend/replication/walsender.c | 72 ++++++++++++++------- src/include/replication/walsender_private.h | 7 ++ 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 45b8b3684f..57ef3dbb67 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3309,6 +3309,9 @@ WalSndShmemInit(void) SpinLockInit(&walsnd->mutex); } + + ConditionVariableInit(&WalSndCtl->physicalWALSndCV); + ConditionVariableInit(&WalSndCtl->logicalWALSndCV); } } @@ -3330,31 +3333,17 @@ WalSndShmemInit(void) void WalSndWakeup(bool physical, bool logical) { - int i; - - for (i = 0; i < max_wal_senders; i++) - { - Latch *latch; - ReplicationKind kind; - WalSnd *walsnd = &WalSndCtl->walsnds[i]; - - /* - * Get latch pointer with spinlock held, for the unlikely case that - * pointer reads aren't atomic (as they're 8 bytes). While at it, also - * get kind. - */ - SpinLockAcquire(&walsnd->mutex); - latch = walsnd->latch; - kind = walsnd->kind; - SpinLockRelease(&walsnd->mutex); - - if (latch == NULL) - continue; + /* + * Let's wake up all the waiting physical and/or logical walsenders using + * their respective condition variables (CVs). Note that every walsender + * would have prepared to sleep on the CV (i.e., added itself to the CV's + * waitlist) in WalSndWait before actually waiting. + */ + if (physical) + ConditionVariableBroadcast(&WalSndCtl->physicalWALSndCV); - if ((physical && kind == REPLICATION_KIND_PHYSICAL) || - (logical && kind == REPLICATION_KIND_LOGICAL)) - SetLatch(latch); - } + if (logical) + ConditionVariableBroadcast(&WalSndCtl->logicalWALSndCV); } /* @@ -3368,9 +3357,44 @@ WalSndWait(uint32 socket_events, long timeout, uint32 wait_event) WaitEvent event; ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, socket_events, NULL); + + /* + * We use condition variable (CV) to efficiently wake up walsenders in + * WalSndWakeup(). + * + * Every walsender prepares to sleep on a shared memory CV. Note that it + * just prepares to sleep on the CV (i.e., adds itself to the CV's + * waitlist), but does not actually wait on the CV (IOW, it never calls + * ConditionVariableSleep()). It still uses WaitEventSetWait() for waiting, + * because CV infrastructure doesn't handle FeBe socket events currently. + * The processes (startup process, walreceiver etc.) wanting to wake up + * walsenders use ConditionVariableBroadcast(), which in turn calls + * SetLatch(), helping walsenders come out of WaitEventSetWait(). + * + * This approach is simple and efficient because, one doesn't have to loop + * through all the walsenders slots, with a spinlock acquisition and + * release for every iteration, just to wake up only the waiting + * walsenders. It makes WalSndWakeup() callers' life easy. + * + * XXX: A desirable future improvement would be to add support for CVs into + * WaitEventSetWait(). + * + * And, we use separate shared memory CVs for physical and logical + * walsenders for selective wake ups, see WalSndWakeup() for more details. + */ + if (MyWalSnd->kind == REPLICATION_KIND_PHYSICAL) + ConditionVariablePrepareToSleep(&WalSndCtl->physicalWALSndCV); + else if (MyWalSnd->kind == REPLICATION_KIND_LOGICAL) + ConditionVariablePrepareToSleep(&WalSndCtl->logicalWALSndCV); + if (WaitEventSetWait(FeBeWaitSet, timeout, &event, 1, wait_event) == 1 && (event.events & WL_POSTMASTER_DEATH)) + { + ConditionVariableCancelSleep(); proc_exit(1); + } + + ConditionVariableCancelSleep(); } /* diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h index ff25aa70a8..4b33a96d25 100644 --- a/src/include/replication/walsender_private.h +++ b/src/include/replication/walsender_private.h @@ -17,6 +17,7 @@ #include "nodes/nodes.h" #include "nodes/replnodes.h" #include "replication/syncrep.h" +#include "storage/condition_variable.h" #include "storage/latch.h" #include "storage/shmem.h" #include "storage/spin.h" @@ -108,6 +109,12 @@ typedef struct */ bool sync_standbys_defined; + /* Used to wake up physical walsenders */ + ConditionVariable physicalWALSndCV; + + /* Used to wake up logical walsenders */ + ConditionVariable logicalWALSndCV; + WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]; } WalSndCtlData; -- 2.34.1