From c4986fd4978206c870ed04846a5b37e71e9ff668 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Sun, 27 Jul 2025 11:37:55 +0500 Subject: [PATCH v8-PG17] Avoid edge case 2 in multixacts --- src/backend/access/transam/multixact.c | 94 +++++++++++++++++--------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b7b47ef076a..7eaefd55903 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -274,12 +274,6 @@ typedef struct MultiXactStateData /* support for members anti-wraparound measures */ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. - */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -918,10 +912,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int i; LWLock *lock; LWLock *prevlock = NULL; + MultiXactId next = multi + 1; + int next_pageno; pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* + * We must also fill next offset to keep current multi readable + * Handle wraparound as GetMultiXactIdMembers() does it. + */ + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); @@ -936,19 +940,56 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + /* + * We might have filled this offset previosuly. + * Cross-check for correctness. + */ + Assert((*offptr == 0) || (*offptr == offset)); + *offptr = offset; MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + if (next_pageno == pageno) + { + offptr[1] = offset + nmembers; + } + else + { + int next_slotno; + MultiXactOffset *next_offptr; + int next_entryno = MultiXactIdToOffsetEntry(next); + Assert(next_entryno == 0); /* This is an overflow-only branch */ + + /* Swap the lock for a lock of next page */ + LWLockRelease(lock); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + if (SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, next_pageno)) + { + /* Just read a next page */ + next_slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + } + else + { + /* + * We have to create a new page. + * SimpleLruWritePage is already prepared to deal + * with creating a new segment file. We do not need to handle + * race conditions, because this code is only executed in redo + * and we hold appropriate lock of MultiXactOffsetCtl. + */ + next_slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno); + SimpleLruWritePage(MultiXactOffsetCtl, next_slotno); + } + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[next_slotno]; + next_offptr[next_entryno] = offset + nmembers; + MultiXactMemberCtl->shared->page_dirty[next_slotno] = true; + } + /* Release MultiXactOffset SLRU lock. */ LWLockRelease(lock); - /* - * If anybody was waiting to know the offset of this multixact ID we just - * wrote, they can read it now, so wake them up. - */ - ConditionVariableBroadcast(&MultiXactState->nextoff_cv); - prev_pageno = -1; for (i = 0; i < nmembers; i++, offset++) @@ -1307,7 +1348,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1386,7 +1426,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * 1. This multixact may be the latest one created, in which case there is * no next one to look at. In this case the nextOffset value we just * saved is the correct endpoint. + * TODO: how does it work on Standby? MultiXactState->nextMXact does not seem to be up-to date. + * nextMXact and nextOffset are in sync, so nothing bad can happen, but nextMXact seems mostly random. * + * THIS IS NOT POSSIBLE ANYMORE, KEEP IT FOR HISTORIC REASONS. * 2. The next multixact may still be in process of being filled in: that * is, another process may have done GetNewMultiXactId but not yet written * the offset entry for that ID. In that scenario, it is guaranteed that @@ -1412,7 +1455,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ -retry: pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1476,14 +1518,10 @@ retry: if (nextMXOffset == 0) { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(lock); - CHECK_FOR_INTERRUPTS(); - - ConditionVariableSleep(&MultiXactState->nextoff_cv, - WAIT_EVENT_MULTIXACT_CREATION); - slept = true; - goto retry; + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %d has invalid next offset", + multi))); } length = nextMXOffset - offset; @@ -1492,12 +1530,6 @@ retry: LWLockRelease(lock); lock = NULL; - /* - * If we slept above, clean up state; it's no longer needed. - */ - if (slept) - ConditionVariableCancelSleep(); - ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); truelength = 0; @@ -1987,7 +2019,6 @@ MultiXactShmemInit(void) /* Make sure we zero out the per-backend state */ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); - ConditionVariableInit(&MultiXactState->nextoff_cv); } else Assert(found); @@ -2198,8 +2229,7 @@ TrimMultiXact(void) * TrimCLOG() for background. Unlike CLOG, some WAL record covers every * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * nonzero offset values. */ entryno = MultiXactIdToOffsetEntry(nextMXact); if (entryno != 0) -- 2.39.5 (Apple Git-154)