From 42044b5745213b8a9ebba487e5e8b7ac23d1e3e7 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 10 Jun 2024 15:49:07 +0200 Subject: [PATCH v2 1/2] Fix creation of partition descriptor during concurrent detach When a partition is being detached in concurrent mode, it is possible for find_inheritance_children_extended() to return that partition in the list, and immediately after that receive an invalidation message that sets its relpartbound to NULL just before we read it. (This can happen because table_open() reads invalidation messages.) Currently we raise an error ERROR: missing relpartbound for relation %u about the situation, but that's bogus. We can cope better by retrying the find_inheritance_children_extended call to get a new list, which will no longer have the partition being detached. Noticed while investigating bug #18377. Backpatch to 14, where DETACH CONCURRENTLY appeared. Discussion: https://postgr.es/m/202405201616.y4ht2qe5ihoy@alvherre.pgsql --- src/backend/partitioning/partdesc.c | 54 ++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c index 47c97566e6..b8c196536e 100644 --- a/src/backend/partitioning/partdesc.c +++ b/src/backend/partitioning/partdesc.c @@ -24,6 +24,7 @@ #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/hsearch.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/partcache.h" @@ -144,16 +145,19 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached) ListCell *cell; int i, nparts; + bool retried = false; PartitionKey key = RelationGetPartitionKey(rel); MemoryContext new_pdcxt; MemoryContext oldcxt; int *mapping; +retry: + /* * Get partition oids from pg_inherits. This uses a single snapshot to * fetch the list of children, so while more children may be getting added - * concurrently, whatever this function returns will be accurate as of - * some well-defined point in time. + * or removed concurrently, whatever this function returns will be + * accurate as of some well-defined point in time. */ detached_exist = false; detached_xmin = InvalidTransactionId; @@ -196,18 +200,27 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached) } /* - * The system cache may be out of date; if so, we may find no pg_class - * tuple or an old one where relpartbound is NULL. In that case, try - * the table directly. We can't just AcceptInvalidationMessages() and - * retry the system cache lookup because it's possible that a - * concurrent ATTACH PARTITION operation has removed itself from the - * ProcArray but not yet added invalidation messages to the shared - * queue; InvalidateSystemCaches() would work, but seems excessive. + * Two problems are possible here. First, a concurrent ATTACH + * PARTITION might be in the process of adding a new partition, but + * the syscache doesn't have it, or its copy of it does not yet have + * its relpartbound set. We cannot just AcceptInvalidationMessages(), + * because the other process might have already removed itself from + * the ProcArray but not yet added its invalidation messages to the + * shared queue. We solve this problem by reading pg_class directly + * for the desired tuple. * - * Note that this algorithm assumes that PartitionBoundSpec we manage - * to fetch is the right one -- so this is only good enough for - * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or - * some hypothetical operation that changes the partition bounds. + * The other problem is that DETACH CONCURRENTLY is in the process of + * removing a partition, which happens in two steps: first it marks it + * as "detach pending", commits, then unsets relpartbound. If + * find_inheritance_children_extended included that partition but we + * below we see that DETACH CONCURRENTLY has reset relpartbound for + * it, we'd see an inconsistent view. (The inconsistency is seen + * because table_open below reads invalidation messages.) We protect + * against this by doing the find_inheritance_children_extended again. + * We only need to do it once, because only one DETACH CONCURRENTLY + * can run at one time, and it has a wait-for-snapshots phase which + * would wait for us, so we cannot be affected by more than one of + * them. */ if (boundspec == NULL) { @@ -231,6 +244,21 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached) boundspec = stringToNode(TextDatumGetCString(datum)); systable_endscan(scan); table_close(pg_class, AccessShareLock); + + /* + * If we still don't get a relpartbound value, then it must be + * because of DETACH CONCURRENTLY. Restart from above. + * + * Note that the memory context is a short-lived one, so we don't + * need to worry about leaking the part of the partition + * descriptor we've already built. + */ + if (!boundspec && !retried) + { + AcceptInvalidationMessages(); + retried = true; + goto retry; + } } /* Sanity checks. */ -- 2.39.2