From b09ca362cd5c748c5c7861aef60b52031629c174 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 31 Jan 2023 10:58:26 +0900 Subject: [PATCH v2] Fix a race condition of updating procArray->replication_slot_xmin. Previously, ReplicationSlotsComputeRequiredXmin() computed the oldest xmin across all slots while not holding ProcArrayLock if already_locked is false, and acquires the ProcArrayLock just before updating the replication slot xmin. Therefore, if a process calls ReplicationSlotsComputeRequiredXmin() with already_locked being false and another process updates the replication slot xmin before the process acquiring the lock, the slot xmin was overwritten with an old value. In the reported failure, a walsender for an apply worker computes InvalidTransaction as the oldest xmin and overwrote a valid replication slot xmin value computed by a walsender for a tablesync worker with this value. Then the walsender for a tablesync worker ended up computing the transaction id by GetOldestSafeDecodingTransactionId() without considering replication slot xmin. That led to an error ""cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u", which was an assertion failure prior to 240e0dbacd3. This commit changes ReplicationSlotsComputeRequiredXmin() so that it computes the oldest xmin while holding ProcArrayLock in exclusive mode. We keep already_locked parameter in ProcArraySetReplicationSlotXmin() on backbranches to not break ABI compatibility. --- src/backend/replication/slot.c | 6 ++++++ src/backend/storage/ipc/procarray.c | 11 ++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 037a347cba..b39c31d85b 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -771,6 +771,9 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked) Assert(ReplicationSlotCtl != NULL); + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) @@ -810,6 +813,9 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked) LWLockRelease(ReplicationSlotControlLock); ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); + + if (!already_locked) + LWLockRelease(ProcArrayLock); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 33fe9c06a3..198b238304 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -3942,22 +3942,19 @@ TerminateOtherDBBackends(Oid databaseId) * Install limits to future computations of the xmin horizon to prevent vacuum * and HOT pruning from removing affected rows still needed by clients with * replication slots. + * + * NB: the caller must hold ProcArrayLock in an exclusive mode regardless of + * already_locked which is unused now but kept for ABI compatibility. */ void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked) { - Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); - - if (!already_locked) - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); procArray->replication_slot_xmin = xmin; procArray->replication_slot_catalog_xmin = catalog_xmin; - if (!already_locked) - LWLockRelease(ProcArrayLock); - elog(DEBUG1, "xmin required by slots: data %u, catalog %u", xmin, catalog_xmin); } -- 2.31.1