From 8b266d6f15c9c6db9460a177f2d6382c1124c41a Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Mon, 18 Aug 2025 12:18:29 +0000 Subject: [PATCH v8] Fix DROP SUBSCRIPTION deadlock with walsender process The DROP SUBSCRIPTION command caused a deadlock with the walsender process because of a lock conflict on the pg_subscription catalog. This happened when the command was trying to drop a replication slot by connecting to a newly created database with uninitialized caches. The DROP SUBSCRIPTION command would first acquire an AccessExclusiveLock on pg_subscription to prevent new workers from starting. However, the connection to the new database needed to acquire an AccessShareLock on the same catalog during its initialization. This created a deadlock where DROP SUBSCRIPTION held its lock and was waiting for the new connection to complete, while the new connection was simultaneously blocked from starting because it couldn't acquire its required lock. This commit resolves the deadlock by having DROP SUBSCRIPTION acquire a less restrictive RowExclusiveLock on the catalog instead. To address the original concern of orphaned workers, a new check is implemented. The replication worker now takes a shared object lock on the subscription itself. If a worker starts for a subscription that no longer exists, it immediately detects this condition and exits. This ensures that no orphan workers are created without the need for an overly broad AccessExclusiveLock on the system catalog. --- src/backend/commands/subscriptioncmds.c | 8 ++++--- src/backend/replication/logical/worker.c | 7 ++++++ src/test/subscription/t/100_bugs.pl | 30 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 334717c0e96..cb9867c96d4 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -1341,10 +1341,12 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) List *rstates; /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription + * The launcher may concurrently start a new worker for this subscription. + * During initialization, the worker checks for subscription validity and + * exits if the subscription has already been dropped. See + * InitializeLogRepWorker. */ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, RowExclusiveLock); tup = SearchSysCache2(SUBSCRIPTIONNAME, MyDatabaseId, CStringGetDatum(stmt->subname)); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 155bf7c44ee..2c4bc62341f 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -3638,6 +3638,13 @@ ApplyWorkerMain(Datum main_arg) StartTransactionCommand(); oldctx = MemoryContextSwitchTo(ApplyContext); + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. After the initialization, the worker will + * be terminated gracefully if the subscription is dropped. + */ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0, + AccessShareLock); MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); if (!MySubscription) { diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index 28ca4affbb9..061eb2a32c1 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -418,4 +418,34 @@ is($result, qq(2|f $node_publisher->stop('fast'); $node_subscriber->stop('fast'); +# BUG #18988 +# The bug happened due to a self-deadlock between the DROP SUBSCRIPTION +# command and the walsender process for accessing pg_subscription. This +# occurred when DROP SUBSCRIPTION attempted to remove a replication slot by +# connecting to a newly created database whose caches are not yet +# initialized. +# +# The bug is fixed by reducing the lock-level during DROP SUBSCRIPTION. +$node_publisher->start(); + +$publisher_connstr = $node_publisher->connstr . ' dbname=regress_db'; +$node_publisher->safe_psql( + 'postgres', qq( + CREATE DATABASE regress_db; + CREATE SUBSCRIPTION regress_sub1 CONNECTION '$publisher_connstr' PUBLICATION regress_pub WITH (connect=false); +)); + +my ($ret, $stdout, $stderr) = + $node_publisher->psql('postgres', q{DROP SUBSCRIPTION regress_sub1}); + +isnt($ret, 0, "replication slot does not exist: exit code not 0"); +like( + $stderr, + qr/ERROR: could not drop replication slot "regress_sub1" on publisher/, + "could not drop replication slot: error message"); + +$node_publisher->safe_psql('postgres', "DROP DATABASE regress_db"); + +$node_publisher->stop('fast'); + done_testing(); -- 2.51.0.rc1.167.g924127e9c0-goog