From 0282b2b315f2a3a8685eb5ce1e48ce8b57548821 Mon Sep 17 00:00:00 2001 From: Nisha Moond Date: Tue, 25 Feb 2025 17:33:01 +0530 Subject: [PATCH v1] Fix race condition in ReplicationSlotAcquire() indroduced by commit f41d846 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit f41d846, a process could acquire and use a replication slot that had just been invalidated, leading to unexpected failures. This occured because the invalidation check was performed before setting the slot’s active_pid. The fix ensures the check happens only after acquiring slot, i.e., after setting s->active_pid = MyProcPid, preventing a race condition with InvalidatePossiblyObsoleteSlot() and ensuring that invalid slots are not mistakenly used. --- src/backend/replication/slot.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index d089085..af388ad 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -580,19 +580,6 @@ retry: name))); } - /* Invalid slots can't be modified or used before accessing the WAL. */ - if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE) - { - LWLockRelease(ReplicationSlotControlLock); - - ereport(ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("can no longer access replication slot \"%s\"", - NameStr(s->data.name)), - errdetail("This replication slot has been invalidated due to \"%s\".", - GetSlotInvalidationCauseName(s->data.invalidated))); - } - /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. @@ -650,13 +637,32 @@ retry: else if (!nowait) ConditionVariableCancelSleep(); /* no sleep needed after all */ - /* Let everybody know we've modified this slot */ - ConditionVariableBroadcast(&s->active_cv); - /* We made this slot active, so it's ours now. */ MyReplicationSlot = s; /* + * Is the slot invalid? Check the invalidated flag only after acquiring + * the slot, i.e., after setting s->active_pid as MyProcPid, to prevent a + * race condition with InvalidatePossiblyObsoleteSlot(). + * + * If this check is performed before acquiring the slot or without holding + * a spinlock, the checkpointer process may invalidate the slot right + * after the check but before slot's active_pid is set. This could result + * in the process mistakenly acquiring and using an invalidated slot, + * leading to unexpected behavior or failures later. + */ + if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer access replication slot \"%s\"", + NameStr(s->data.name)), + errdetail("This replication slot has been invalidated due to \"%s\".", + GetSlotInvalidationCauseName(s->data.invalidated))); + + /* Let everybody know we've modified this slot */ + ConditionVariableBroadcast(&s->active_cv); + + /* * The call to pgstat_acquire_replslot() protects against stats for a * different slot, from before a restart or such, being present during * pgstat_report_replslot(). -- 1.8.3.1