From ca245e5934f2225fa3e71b40792bab2a2d41620b Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Tue, 5 Aug 2025 05:40:12 +0000 Subject: [PATCH v1] Fix drop subscription deadlock with create database The DROP SUBSCRIPTION command previously acquired an AccessExclusiveLock on the pg_subscription catalog to prevent the launcher from starting new workers. This created a deadlock because new database connections require an AccessShareLock on the same catalog to build their cache. This commit resolves the deadlock by downgrading the lock on the pg_subscription catalog to AccessShareLock during DROP SUBSCRIPTION. To prevent the launcher from starting a new worker for the subscription being dropped, the launcher now acquires a shared object lock on the subscription in AccessShareLock mode before starting a worker; DROP SUBSCRIPTION already holds that lock in exclusive mode. --- src/backend/commands/subscriptioncmds.c | 6 +--- src/backend/replication/logical/launcher.c | 32 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 9467f58a23d..05478e53530 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -1571,11 +1571,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) List *rstates; bool must_use_password; - /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription - */ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, AccessShareLock); tup = SearchSysCache2(SUBSCRIPTIONNAME, MyDatabaseId, CStringGetDatum(stmt->subname)); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 294d0d74d8c..f3c48e9b4f0 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -35,6 +35,7 @@ #include "replication/walreceiver.h" 
#include "replication/worker_internal.h" #include "storage/ipc.h" +#include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" #include "tcop/tcopprot.h" @@ -42,6 +43,7 @@ #include "utils/memutils.h" #include "utils/pg_lsn.h" #include "utils/snapmgr.h" +#include "utils/syscache.h" /* max sleep time between cycles (3min) */ #define DEFAULT_NAPTIME_PER_CYCLE 180000L @@ -1147,11 +1149,9 @@ ApplyLauncherMain(Datum main_arg) pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); - /* - * Establish connection to nailed catalogs (we only ever access - * pg_subscription). - */ - BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + /* Connect to postgres, as we need to look up the syscache. */ + BackgroundWorkerInitializeConnection("postgres", NULL, 0); /* Enter main loop */ for (;;) @@ -1209,6 +1209,25 @@ ApplyLauncherMain(Datum main_arg) if (last_start == 0 || (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval) { + HeapTuple tup; + + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. + */ + StartTransactionCommand(); + LockSharedObject(SubscriptionRelationId, sub->oid, 0, + AccessShareLock); + tup = SearchSysCache1(SUBSCRIPTIONOID, + ObjectIdGetDatum(sub->oid)); + if (!HeapTupleIsValid(tup)) + { + UnlockSharedObject(SubscriptionRelationId, sub->oid, 0, + AccessShareLock); + CommitTransactionCommand(); + continue; + } + ApplyLauncherSetWorkerStartTime(sub->oid, now); if (!logicalrep_worker_launch(WORKERTYPE_APPLY, sub->dbid, sub->oid, sub->name, @@ -1225,6 +1244,9 @@ ApplyLauncherMain(Datum main_arg) wait_time = Min(wait_time, wal_retrieve_retry_interval); } + UnlockSharedObject(SubscriptionRelationId, sub->oid, 0, + AccessShareLock); + CommitTransactionCommand(); } else { -- 2.50.1.565.gc32cd1483b-goog