From 7fec26347c80d42f0243f0d3328b38c69105a41f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Apr 2025 00:16:17 +0300 Subject: [PATCH v6 09/12] Use CSN snapshots during Hot Standby Replace the known-assigned-XIDs mechanism with a CSN log. The CSN log (pg_csn) tracks the commit LSN of each transaction, when replaying the WAL on a standby. It's only used on the standby, and is initialized from scratch at server startup like pg_subtrans. Based on 0001-CSN-base-snapshot.patch from https://www.postgresql.org/message-id/2020081009525213277261%40highgo.ca. This patch has a long lineage, various CSN patches have been posted with parts from Stas Kelvich, Movead Li, Ants Aasma, Heikki Linnakangas, Alexander Kuzmenkov --- contrib/pg_visibility/pg_visibility.c | 1 + src/backend/access/rmgrdesc/xactdesc.c | 26 - src/backend/access/transam/Makefile | 1 + src/backend/access/transam/csn_log.c | 469 +++++ src/backend/access/transam/meson.build | 1 + src/backend/access/transam/transam.c | 3 + src/backend/access/transam/twophase.c | 34 +- src/backend/access/transam/varsup.c | 1 + src/backend/access/transam/xact.c | 138 +- src/backend/access/transam/xlog.c | 118 +- src/backend/access/transam/xlogrecovery.c | 13 +- src/backend/access/transam/xlogutils.c | 2 +- src/backend/backup/basebackup.c | 3 + src/backend/postmaster/startup.c | 2 +- src/backend/replication/logical/decode.c | 8 - src/backend/replication/logical/snapbuild.c | 2 +- src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/ipc/procarray.c | 1538 ++--------------- src/backend/storage/ipc/standby.c | 102 +- src/backend/storage/lmgr/lwlock.c | 2 + .../utils/activity/wait_event_names.txt | 1 + src/backend/utils/probes.d | 2 + src/backend/utils/time/snapmgr.c | 34 +- src/bin/initdb/initdb.c | 3 +- src/bin/pg_rewind/filemap.c | 3 + src/include/access/csn_log.h | 30 + src/include/access/transam.h | 3 + src/include/access/twophase.h | 3 +- src/include/access/xact.h | 12 +- src/include/access/xlogutils.h | 33 +- 
src/include/storage/lwlock.h | 2 + src/include/storage/procarray.h | 13 +- src/include/utils/snapshot.h | 8 + 33 files changed, 821 insertions(+), 1793 deletions(-) create mode 100644 src/backend/access/transam/csn_log.c create mode 100644 src/include/access/csn_log.h diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index d79ef35006b..c5c7a4dd2c3 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -607,6 +607,7 @@ collect_visibility_data(Oid relid, bool include_pd) * now perform minimal checking on a standby by always using nextXid, this * approach is better than nothing and will at least catch extremely broken * cases where a xid is in the future. + * XXX KnownAssignedXids is gone. * 3. Ignore walsender xmin, because it could go backward if some replication * connections don't use replication slots. * diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 715cc1f7bad..56f7bd81780 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -422,17 +422,6 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI timestamptz_to_str(parsed.origin_timestamp)); } -static void -xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) -{ - int i; - - appendStringInfoString(buf, "subxacts:"); - - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", xlrec->xsub[i]); -} - void xact_desc(StringInfo buf, XLogReaderState *record) { @@ -460,18 +449,6 @@ xact_desc(StringInfo buf, XLogReaderState *record) xact_desc_prepare(buf, XLogRecGetInfo(record), xlrec, XLogRecGetOrigin(record)); } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; - - /* - * Note that we ignore the WAL record's xid, since we're more - * interested in the top-level xid that issued the record and which - * xids are being reported here. 
- */ - appendStringInfo(buf, "xtop %u: ", xlrec->xtop); - xact_desc_assignment(buf, xlrec); - } else if (info == XLOG_XACT_INVALIDATIONS) { xl_xact_invals *xlrec = (xl_xact_invals *) rec; @@ -503,9 +480,6 @@ xact_identify(uint8 info) case XLOG_XACT_ABORT_PREPARED: id = "ABORT_PREPARED"; break; - case XLOG_XACT_ASSIGNMENT: - id = "ASSIGNMENT"; - break; case XLOG_XACT_INVALIDATIONS: id = "INVALIDATION"; break; diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 661c55a9db7..2520d77c7c8 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ clog.o \ commit_ts.o \ + csn_log.o \ generic_xlog.o \ multixact.o \ parallel.o \ diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c new file mode 100644 index 00000000000..40673c8579f --- /dev/null +++ b/src/backend/access/transam/csn_log.c @@ -0,0 +1,469 @@ +/*----------------------------------------------------------------------------- + * + * csn_log.c + * Track commit record LSNs of finished transactions + * + * This module provides an SLRU to store the LSN of the commit record of each + * transaction. CSN stands for Commit Sequence Number, and in principle we + * could use a separate counter that is incremented at every commit. For + * simplicity, though, we use the commit record's LSN as the sequence number. + * + * Like pg_subtrans, this mapping needs to be kept only for XIDs greater than + * oldestXmin, and doesn't need to be preserved over crashes. Also, this is + * only needed in hot standby mode, and immediately after exiting hot standby + * mode, until all old snapshots taken during standby mode are gone.
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/csn_log.c + * + *----------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/csn_log.h" +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/snapmgr.h" + +/* + * Defines for CSNLog page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CSNLog page numbering also wraps around at + * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE, and CSNLog segment numbering at + * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCSNLog (see CSNLogPagePrecedes). + */ + +/* We store the commit CSN (an XLogRecPtr) for each xid */ +#define CSN_LOG_XACTS_PER_PAGE (BLCKSZ / sizeof(XLogRecPtr)) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) CSN_LOG_XACTS_PER_PAGE) +#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSN_LOG_XACTS_PER_PAGE) + +#define PgIndexToTransactionId(pageno, idx) (CSN_LOG_XACTS_PER_PAGE * (pageno) + (idx)) + + + +/* + * Link to shared-memory data structures for CSNLog control + */ +static SlruCtlData CSNLogCtlData; +#define CsnlogCtl (&CSNLogCtlData) + +static int ZeroCSNLogPage(int pageno); +static bool CSNLogPagePrecedes(int64 page1, int64 page2); +static void CSNLogSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, + XLogRecPtr csn, int pageno); +static void CSNLogSetCSNInSlot(TransactionId xid, XLogRecPtr csn, + int slotno); + + +/* + * Record commit LSN of a transaction and its subtransaction tree.
+ * + * xid is a single xid to set status for. This will typically be the top level + * transaction ID for a top level commit. + * + * subxids is an array of xids of length nsubxids, in logical XID order, + * representing subtransactions in the tree of XIDs. In various cases nsubxids + * may be zero. + * + * commitLsn is the LSN of the commit record. This is currently never called + * for aborted transactions. + */ +void +CSNLogSetCSN(TransactionId xid, int nsubxids, TransactionId *subxids, + XLogRecPtr commitLsn) +{ + int pageno; + int i = 0; + int offset = 0; + + Assert(TransactionIdIsValid(xid)); + + pageno = TransactionIdToPage(xid); /* get page of parent */ + for (;;) + { + int num_on_page = 0; + + while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno) + { + num_on_page++; + i++; + } + + CSNLogSetPageStatus(xid, + num_on_page, subxids + offset, + commitLsn, pageno); + if (i >= nsubxids) + break; + + offset = i; + pageno = TransactionIdToPage(subxids[offset]); + xid = InvalidTransactionId; + } +} + +/* + * Record the final state of transaction entries in the CSN log for all + * entries on a single page. Atomic only on this page. + * + * Otherwise API is same as CSNLogSetCSN() + */ +static void +CSNLogSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids, + XLogRecPtr commitLsn, int pageno) +{ + int slotno; + int i; + LWLock *lock; + + lock = SimpleLruGetBankLock(CsnlogCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid); + + /* Subtransactions first, if needed ... */ + for (i = 0; i < nsubxids; i++) + { + Assert(CsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + CSNLogSetCSNInSlot(subxids[i], commitLsn, slotno); + } + + /* ... 
then the main transaction */ + if (TransactionIdIsValid(xid)) + CSNLogSetCSNInSlot(xid, commitLsn, slotno); + + CsnlogCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(lock); +} + +/* + * Sets the commit status of a single transaction. + */ +static void +CSNLogSetCSNInSlot(TransactionId xid, XLogRecPtr csn, int slotno) +{ + int entryno = TransactionIdToPgIndex(xid); + XLogRecPtr *ptr; + + ptr = (XLogRecPtr *) (CsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr)); + + *ptr = csn; +} + +/* + * Interrogate the state of a transaction in the log. + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; TransactionIdGetXidCSN() in csn_snapshot.c is the + * intended caller. + */ +XLogRecPtr +CSNLogGetCSNByXid(TransactionId xid) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToPgIndex(xid); + int slotno; + XLogRecPtr *ptr; + XLogRecPtr xid_csn; + + Assert(TransactionIdIsValid(xid)); + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid); + ptr = (XLogRecPtr *) (CsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr)); + xid_csn = *ptr; + + LWLockRelease(SimpleLruGetBankLock(CsnlogCtl, pageno)); + + return xid_csn; +} + +/* + * Number of shared CSNLog buffers. + */ +static Size +CSNLogShmemBuffers(void) +{ + return Min(32, Max(16, NBuffers / 512)); +} + +/* + * Reserve shared memory for CsnlogCtl. + */ +Size +CSNLogShmemSize(void) +{ + /* FIXME: skip if not InHotStandby? */ + return SimpleLruShmemSize(CSNLogShmemBuffers(), 0); +} + +/* + * Initialization of shared memory for CSNLog. 
+ */ +void +CSNLogShmemInit(void) +{ + CsnlogCtl->PagePrecedes = CSNLogPagePrecedes; + SimpleLruInit(CsnlogCtl, "CSNLog Ctl", CSNLogShmemBuffers(), 0, + "pg_csn", LWTRANCHE_CSN_LOG_BUFFER, + LWTRANCHE_CSN_LOG_SLRU, SYNC_HANDLER_NONE, false); + SlruPagePrecedesUnitTests(CsnlogCtl, CSN_LOG_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates the initial + * CSNLog segment. The pg_csn directory is assumed to have been + * created by initdb, and CSNLogShmemInit must have been called already. + * + * Note: it's not really necessary to create the initial segment now, + * since slru.c would create it on first write anyway. But we may as well + * do it to be sure the directory is set up correctly. + */ +void +BootStrapCSNLog(void) +{ + int slotno; + LWLock *lock; + + lock = SimpleLruGetBankLock(CsnlogCtl, 0); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the first page of the commit log */ + slotno = ZeroCSNLogPage(0); + + /* Make sure it's written out */ + SimpleLruWritePage(CsnlogCtl, slotno); + Assert(!CsnlogCtl->shared->page_dirty[slotno]); + + LWLockRelease(lock); +} + +/* + * Initialize (or reinitialize) a page of CSNLog to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCSNLogPage(int pageno) +{ + return SimpleLruZeroPage(CsnlogCtl, pageno); +} + +/* + * Initialize a page of CSNLog based on pg_xact. 
+ * + * All committed and aborted transactions are stamped with 'csn' + */ +static void +InitCSNLogPage(int pageno, TransactionId *xid, TransactionId nextXid, XLogRecPtr csn) +{ + XLogRecPtr dummy; + int slotno; + + slotno = ZeroCSNLogPage(pageno); + + while (TransactionIdPrecedes(*xid, nextXid) && TransactionIdToPage(*xid) == pageno) + { + XidStatus status = TransactionIdGetStatus(*xid, &dummy); + + if (status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED) + CSNLogSetCSNInSlot(*xid, csn, slotno); + + TransactionIdAdvance(*xid); + } + CsnlogCtl->shared->page_dirty[slotno] = true; /* keep the CSNs stamped above; zeroing the page again would discard them */ +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized TransamVariables->nextXid, and after + * initializing the CLOG. + * + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid + * if there are none. + * + * All transactions that have already completed are marked with 'csn'. ('csn' + * is supposed to be an "older than anything we'll ever need to compare with") + */ +void +StartupCSNLog(TransactionId oldestActiveXID, XLogRecPtr csn) +{ + TransactionId xid; + FullTransactionId nextXid; + int startPage; + int endPage; + LWLock *prevlock = NULL; + LWLock *lock; + + /* + * Since we don't expect pg_csn to be valid across crashes, we initialize + * the currently-active page(s) to zeroes during startup. Whenever we + * advance into a new page, ExtendCSNLog will likewise zero the new page + * without regard to whatever was previously on disk.
+ */ + startPage = TransactionIdToPage(oldestActiveXID); + nextXid = TransamVariables->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); + + Assert(TransactionIdIsValid(oldestActiveXID)); + Assert(FullTransactionIdIsValid(nextXid)); + + xid = oldestActiveXID; + for (;;) + { + lock = SimpleLruGetBankLock(CsnlogCtl, startPage); + if (prevlock != lock) + { + if (prevlock) + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } + + InitCSNLogPage(startPage, &xid, XidFromFullTransactionId(nextXid), csn); + if (startPage == endPage) + break; + + startPage++; + /* must account for wraparound */ + if (startPage > TransactionIdToPage(MaxTransactionId)) + startPage = 0; + } + + LWLockRelease(lock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownCSNLog(void) +{ + /* + * Flush dirty CSNLog pages to disk. + * + * This is not actually necessary from a correctness point of view. We do + * it merely as a debugging aid. + */ + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(false); + SimpleLruWriteAll(CsnlogCtl, false); + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(false); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCSNLog(void) +{ + /* + * Flush dirty CSNLog pages to disk. + * + * This is not actually necessary from a correctness point of view. We do + * it merely to improve the odds that writing of dirty pages is done by + * the checkpoint process and not by backends. + */ + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true); + SimpleLruWriteAll(CsnlogCtl, true); + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true); +} + +/* + * Make sure that CSNLog has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. 
We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty clog or xlog page to make room + * in shared memory. + */ +void +ExtendCSNLog(TransactionId newestXact) +{ + int64 pageno; + LWLock *lock; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToPgIndex(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + lock = SimpleLruGetBankLock(CsnlogCtl, pageno); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Zero the page in shared memory; no XLOG entry is made for it here */ + ZeroCSNLogPage(pageno); + + LWLockRelease(lock); +} + +/* + * Remove all CSNLog segments before the one holding the passed + * transaction ID. + * + * This is normally called during checkpoint, with oldestXact being the + * oldest TransactionXmin of any running transaction. + */ +void +TruncateCSNLog(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. We step + * back one transaction to avoid passing a cutoff page that hasn't been + * created yet in the rare case that oldestXact would be the first item on + * a page and oldestXact == next XID. In that case, if we didn't subtract + * one, we'd trigger SimpleLruTruncate's wraparound detection. + */ + TransactionIdRetreat(oldestXact); + cutoffPage = TransactionIdToPage(oldestXact); + + SimpleLruTruncate(CsnlogCtl, cutoffPage); +} + +/* + * Decide which of two CSNLog page numbers is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes() and SubTransPagePrecedes().
+ */ +static bool +CSNLogPagePrecedes(int64 page1, int64 page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * CSN_LOG_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * CSN_LOG_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + CSN_LOG_XACTS_PER_PAGE - 1)); +} diff --git a/src/backend/access/transam/meson.build b/src/backend/access/transam/meson.build index e8ae9b13c8e..e2a3419fc22 100644 --- a/src/backend/access/transam/meson.build +++ b/src/backend/access/transam/meson.build @@ -2,6 +2,7 @@ backend_sources += files( 'clog.c', + 'csn_log.c', 'commit_ts.c', 'generic_xlog.c', 'multixact.c', diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 9a39451a29a..b4c42c0f156 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -377,6 +377,9 @@ TransactionIdLatest(TransactionId mainxid, * Also, because we group transactions on the same clog page to conserve * storage, we might return the LSN of a later transaction that falls into * the same group. + * + * XXX: Now that we have the CSN-log, should we use that during recovery? Or + * rename this function to reduce confusion. */ XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 73a80559194..2330632e569 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -77,6 +77,7 @@ #include #include "access/commit_ts.h" +#include "access/csn_log.h" #include "access/htup_details.h" #include "access/subtrans.h" #include "access/transam.h" @@ -1943,20 +1944,13 @@ restoreTwoPhaseData(void) * Our other responsibility is to determine and return the oldest valid XID * among the prepared xacts (if none, return TransamVariables->nextXid). 
* This is needed to synchronize pg_subtrans startup properly. - * - * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all - * top-level xids is stored in *xids_p. The number of entries in the array - * is returned in *nxids_p. */ TransactionId -PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) +PrescanPreparedTransactions(void) { FullTransactionId nextXid = TransamVariables->nextXid; TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId result = origNextXid; - TransactionId *xids = NULL; - int nxids = 0; - int allocsize = 0; int i; LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); @@ -1984,34 +1978,10 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) if (TransactionIdPrecedes(xid, result)) result = xid; - if (xids_p) - { - if (nxids == allocsize) - { - if (nxids == 0) - { - allocsize = 10; - xids = palloc(allocsize * sizeof(TransactionId)); - } - else - { - allocsize = allocsize * 2; - xids = repalloc(xids, allocsize * sizeof(TransactionId)); - } - } - xids[nxids++] = xid; - } - pfree(buf); } LWLockRelease(TwoPhaseStateLock); - if (xids_p) - { - *xids_p = xids; - *nxids_p = nxids; - } - return result; } diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index fe895787cb7..a495f1d7899 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -15,6 +15,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csn_log.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b885513f765..5250a158145 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -21,6 +21,7 @@ #include #include "access/commit_ts.h" +#include "access/csn_log.h" #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" @@ -210,7 +211,6 @@ typedef struct 
TransactionStateData int prevSecContext; /* previous SecurityRestrictionContext */ bool prevXactReadOnly; /* entry-time xact r/o state */ bool startedInRecovery; /* did we start in recovery? */ - bool didLogXid; /* has xid been included in WAL record? */ int parallelModeLevel; /* Enter/ExitParallelMode counter */ bool parallelChildXact; /* is any parent transaction parallel? */ bool chain; /* start a new block after this one */ @@ -250,13 +250,6 @@ static TransactionStateData TopTransactionStateData = { .topXidLogged = false, }; -/* - * unreportedXids holds XIDs of all subtransactions that have not yet been - * reported in an XLOG_XACT_ASSIGNMENT record. - */ -static int nUnreportedXids; -static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; - static TransactionState CurrentTransactionState = &TopTransactionStateData; /* @@ -532,18 +525,6 @@ GetCurrentFullTransactionIdIfAny(void) return CurrentTransactionState->fullTransactionId; } -/* - * MarkCurrentTransactionIdLoggedIfAny - * - * Remember that the current xid - if it is assigned - now has been wal logged. - */ -void -MarkCurrentTransactionIdLoggedIfAny(void) -{ - if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId)) - CurrentTransactionState->didLogXid = true; -} - /* * IsSubxactTopXidLogPending * @@ -636,7 +617,6 @@ AssignTransactionId(TransactionState s) { bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; - bool log_unknown_top = false; /* Assert that caller didn't screw up */ Assert(!FullTransactionIdIsValid(s->fullTransactionId)); @@ -680,20 +660,6 @@ AssignTransactionId(TransactionState s) pfree(parents); } - /* - * When wal_level=logical, guarantee that a subtransaction's xid can only - * be seen in the WAL stream if its toplevel xid has been logged before. - * If necessary we log an xact_assignment record with fewer than - * PGPROC_MAX_CACHED_SUBXIDS. 
Note that it is fine if didLogXid isn't set - * for a transaction even though it appears in a WAL record, we just might - * superfluously log something. That can happen when an xid is included - * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in - * xl_standby_locks. - */ - if (isSubXact && XLogLogicalInfoActive() && - !TopTransactionStateData.didLogXid) - log_unknown_top = true; - /* * Generate a new FullTransactionId and record its xid in PGPROC and * pg_subtrans. @@ -729,59 +695,6 @@ AssignTransactionId(TransactionState s) XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId)); CurrentResourceOwner = currentOwner; - - /* - * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each - * top-level transaction we issue a WAL record for the assignment. We - * include the top-level xid and all the subxids that have not yet been - * reported using XLOG_XACT_ASSIGNMENT records. - * - * This is required to limit the amount of shared memory required in a hot - * standby server to keep track of in-progress XIDs. See notes for - * RecordKnownAssignedTransactionIds(). - * - * We don't keep track of the immediate parent of each subxid, only the - * top-level transaction that each subxact belongs to. This is correct in - * recovery only because aborted subtransactions are separately WAL - * logged. - * - * This is correct even for the case where several levels above us didn't - * have an xid assigned as we recursed up to them beforehand. 
- */ - if (isSubXact && XLogStandbyInfoActive()) - { - unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId); - nUnreportedXids++; - - /* - * ensure this test matches similar one in - * RecoverPreparedTransactions() - */ - if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || - log_unknown_top) - { - xl_xact_assignment xlrec; - - /* - * xtop is always set by now because we recurse up transaction - * stack to the highest unassigned xid and then come back down - */ - xlrec.xtop = GetTopTransactionId(); - Assert(TransactionIdIsValid(xlrec.xtop)); - xlrec.nsubxacts = nUnreportedXids; - - XLogBeginInsert(); - XLogRegisterData(&xlrec, MinSizeOfXactAssignment); - XLogRegisterData(unreportedXids, - nUnreportedXids * sizeof(TransactionId)); - - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); - - nUnreportedXids = 0; - /* mark top, not current xact as having been logged */ - TopTransactionStateData.didLogXid = true; - } - } } /* @@ -1481,11 +1394,11 @@ RecordTransactionCommit(void) * temp tables will be lost anyway, unlogged tables will be truncated and * HOT pruning will be done again later. (Given the foregoing, you might * think that it would be unnecessary to emit the XLOG record at all in - * this case, but we don't currently try to do that. It would certainly - * cause problems at least in Hot Standby mode, where the - * KnownAssignedXids machinery requires tracking every XID assignment. It - * might be OK to skip it only when wal_level < replica, but for now we - * don't.) + * this case, but we don't currently try to do that. It might cause + * inefficiencies in Hot Standby mode, if nothing else, where the + * commit/abort records allow advancing the xmin horizon for new + * snapshots. It might be OK to skip it only when wal_level < replica, but + * for now we don't.) 
* * However, if we're doing cleanup of any non-temp rels or committing any * command that wanted to force sync commit, then we must flush XLOG @@ -1953,13 +1866,6 @@ AtSubAbort_childXids(void) s->childXids = NULL; s->nChildXids = 0; s->maxChildXids = 0; - - /* - * We could prune the unreportedXids array here. But we don't bother. That - * would potentially reduce number of XLOG_XACT_ASSIGNMENT records but it - * would likely introduce more CPU time into the more common paths, so we - * choose not to do that. - */ } /* ---------------------------------------------------------------- @@ -2142,12 +2048,6 @@ StartTransaction(void) currentCommandId = FirstCommandId; currentCommandIdUsed = false; - /* - * initialize reported xid accounting - */ - nUnreportedXids = 0; - s->didLogXid = false; - /* * must initialize resource-management stuff first */ @@ -6154,7 +6054,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts, commit_time, origin_id); - if (standbyState == STANDBY_DISABLED) + if (!InHotStandby) { /* * Mark the transaction committed in pg_xact. @@ -6174,6 +6074,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, */ RecordKnownAssignedTransactionIds(max_xid); + /* + * Mark the CSNLOG first. The transaction won't become visible to new + * snapshots until the call to ProcArrayRecoveryEndTransaction(). + */ + CSNLogSetCSN(xid, parsed->nsubxacts, parsed->subxacts, lsn); + /* * Mark the transaction committed in pg_xact. We use async commit * protocol during recovery to provide information on database @@ -6186,9 +6092,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn); /* - * We must mark clog before we update the ProcArray. + * Make the commit visible to new snapshots in the ProcArray. 
*/ - ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + ProcArrayRecoveryEndTransaction(max_xid, lsn); /* * Send any cache invalidations attached to the commit. We must @@ -6294,7 +6200,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, parsed->subxacts); AdvanceNextFullTransactionIdPastXid(max_xid); - if (standbyState == STANDBY_DISABLED) + if (!InHotStandby) { /* Mark the transaction aborted in pg_xact, no need for async stuff */ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); @@ -6312,13 +6218,15 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, */ RecordKnownAssignedTransactionIds(max_xid); + /* Note: we don't need to update the CSN log on abort. */ + /* Mark the transaction aborted in pg_xact, no need for async stuff */ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); /* * We must update the ProcArray after we have marked clog. */ - ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + ProcArrayRecoveryEndTransaction(max_xid, lsn); /* * There are no invalidation messages to send or undo. 
@@ -6426,14 +6334,6 @@ xact_redo(XLogReaderState *record) XLogRecGetOrigin(record)); LWLockRelease(TwoPhaseStateLock); } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); - - if (standbyState >= STANDBY_INITIALIZED) - ProcArrayApplyXidAssignment(xlrec->xtop, - xlrec->nsubxacts, xlrec->xsub); - } else if (info == XLOG_XACT_INVALIDATIONS) { /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index fc30a52d496..cbeac223e1c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -48,6 +48,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csn_log.h" #include "access/heaptoast.h" #include "access/multixact.h" #include "access/rewriteheap.h" @@ -951,8 +952,6 @@ XLogInsertRecord(XLogRecData *rdata, END_CRIT_SECTION(); - MarkCurrentTransactionIdLoggedIfAny(); - /* * Mark top transaction id is logged (if needed) so that we should not try * to log it again with the next WAL record in the current subtransaction. 
@@ -5230,6 +5229,7 @@ BootStrapXLOG(uint32 data_checksum_version) /* Bootstrap the commit log, too */ BootStrapCLOG(); + BootStrapCSNLog(); BootStrapCommitTs(); BootStrapSUBTRANS(); BootStrapMultiXact(); @@ -5831,16 +5831,16 @@ StartupXLOG(void) */ if (ArchiveRecoveryRequested && EnableHotStandby) { - TransactionId *xids; - int nxids; + FullTransactionId latestCompletedXid; ereport(DEBUG1, (errmsg_internal("initializing for hot standby"))); + InHotStandby = true; InitRecoveryTransactionEnvironment(); if (wasShutdown) - oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanPreparedTransactions(); else oldestActiveXID = checkPoint.oldestActiveXid; Assert(TransactionIdIsValid(oldestActiveXID)); @@ -5855,39 +5855,17 @@ StartupXLOG(void) */ StartupSUBTRANS(oldestActiveXID); - /* - * If we're beginning at a shutdown checkpoint, we know that - * nothing was running on the primary at this point. So fake-up an - * empty running-xacts record and use that here and now. Recover - * additional standby state for prepared transactions. - */ - if (wasShutdown) - { - RunningTransactionsData running; - TransactionId latestCompletedXid; + latestCompletedXid = checkPoint.nextXid; + FullTransactionIdRetreat(&latestCompletedXid); + TransamVariables->latestCompletedXid = latestCompletedXid; - /* Update pg_subtrans entries for any prepared transactions */ - StandbyRecoverPreparedTransactions(); + StartupCSNLog(oldestActiveXID, RedoRecPtr); - /* - * Construct a RunningTransactions snapshot representing a - * shut down server, with only prepared transactions still - * alive. We're never overflowed at this point because all - * subxids are listed with their parent prepared transactions. 
- */ - running.xcnt = nxids; - running.subxcnt = 0; - running.subxid_status = SUBXIDS_IN_SUBTRANS; - running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); - running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); - TransactionIdRetreat(latestCompletedXid); - Assert(TransactionIdIsNormal(latestCompletedXid)); - running.latestCompletedXid = latestCompletedXid; - running.xids = xids; - - ProcArrayApplyRecoveryInfo(&running); - } + ProcArrayUpdateOldestRunningXid(oldestActiveXID); + + /* Update pg_subtrans entries for any prepared transactions */ + if (wasShutdown) + StandbyRecoverPreparedTransactions(); } /* @@ -5971,7 +5949,7 @@ StartupXLOG(void) * This information is not quite needed yet, but it is positioned here so * as potential problems are detected before any on-disk change is done. */ - oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + oldestActiveXID = PrescanPreparedTransactions(); /* * Allow ordinary WAL segment creation before possibly switching to a new @@ -6137,9 +6115,18 @@ StartupXLOG(void) * Start up subtrans, if not already done for hot standby. (commit * timestamps are started below, if necessary.) */ - if (standbyState == STANDBY_DISABLED) + if (!InHotStandby) + { StartupSUBTRANS(oldestActiveXID); + /* + * TODO: we don't need to update CSN log from now on, but it's still + * required by snapshots that were taken before recovery ended. We + * just let it be, but it would be nice to truncate it to 0 after all + * the snapshots are gone. + */ + } + /* * Perform end of recovery actions for any SLRUs that need it. */ @@ -6225,12 +6212,12 @@ StartupXLOG(void) * Shutdown the recovery environment. 
This must occur after * RecoverPreparedTransactions() (see notes in lock_twophase_recover()) * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as - * any session building a snapshot will not rely on KnownAssignedXids as + * any session building a snapshot will not rely on the CSN log as * RecoveryInProgress() would return false at this stage. This is * particularly critical for prepared 2PC transactions, that would still * need to be included in snapshots once recovery has ended. */ - if (standbyState != STANDBY_DISABLED) + if (InHotStandby) ShutdownRecoveryTransactionEnvironment(); /* @@ -7002,7 +6989,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. */ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(true); else checkPoint.oldestActiveXid = InvalidTransactionId; @@ -7396,6 +7383,9 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + if (shutdown) + TruncateCSNLog(GetOldestTransactionIdConsideredRunning()); + /* Real work is done; log and update stats. */ LogCheckpointEnd(false); @@ -7567,6 +7557,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); CheckPointCLOG(); + CheckPointCSNLog(); CheckPointCommitTs(); CheckPointSUBTRANS(); CheckPointMultiXact(); @@ -7863,7 +7854,10 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) + { TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + TruncateCSNLog(GetOldestTransactionIdConsideredRunning()); + } /* Real work is done; log and update stats. 
*/ LogCheckpointEnd(true); @@ -8348,41 +8342,17 @@ xlog_redo(XLogReaderState *record) /* * If we see a shutdown checkpoint, we know that nothing was running - * on the primary at this point. So fake-up an empty running-xacts - * record and use that here and now. Recover additional standby state - * for prepared transactions. + * on the primary at this point, except for prepared transactions. */ - if (standbyState >= STANDBY_INITIALIZED) + if (InHotStandby) { - TransactionId *xids; - int nxids; TransactionId oldestActiveXID; - TransactionId latestCompletedXid; - RunningTransactionsData running; - oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanPreparedTransactions(); + ProcArrayUpdateOldestRunningXid(oldestActiveXID); /* Update pg_subtrans entries for any prepared transactions */ StandbyRecoverPreparedTransactions(); - - /* - * Construct a RunningTransactions snapshot representing a shut - * down server, with only prepared transactions still alive. We're - * never overflowed at this point because all subxids are listed - * with their parent prepared transactions. - */ - running.xcnt = nxids; - running.subxcnt = 0; - running.subxid_status = SUBXIDS_IN_SUBTRANS; - running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); - running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); - TransactionIdRetreat(latestCompletedXid); - Assert(TransactionIdIsNormal(latestCompletedXid)); - running.latestCompletedXid = latestCompletedXid; - running.xids = xids; - - ProcArrayApplyRecoveryInfo(&running); } /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ @@ -8446,6 +8416,16 @@ xlog_redo(XLogReaderState *record) checkPoint.oldestXid)) SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + + /* + * Remember the oldest XID that was running at the time. 
Normally, + * all transaction aborts and commits are WAL-logged, so our + * oldestRunningXid value should be up-to-date, but if not, this + * allows us to resynchronize. + */ + if (InHotStandby) + ProcArrayUpdateOldestRunningXid(checkPoint.oldestActiveXid); + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 0aa3ab59085..b213b8a74dc 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1978,10 +1978,9 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl SpinLockRelease(&XLogRecoveryCtl->info_lck); /* - * If we are attempting to enter Hot Standby mode, process XIDs we see + * In Hot Standby mode, process XIDs we see */ - if (standbyState >= STANDBY_INITIALIZED && - TransactionIdIsValid(record->xl_xid)) + if (InHotStandby && TransactionIdIsValid(record->xl_xid)) RecordKnownAssignedTransactionIds(record->xl_xid); /* @@ -2258,7 +2257,7 @@ CheckRecoveryConsistency(void) * run? If so, we can tell postmaster that the database is consistent now, * enabling connections. */ - if (standbyState == STANDBY_SNAPSHOT_READY && + if (InHotStandby && !LocalHotStandbyActive && reachedConsistency && IsUnderPostmaster) @@ -3715,9 +3714,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, elog(LOG, "waiting for WAL to become available at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - /* Do background tasks that might benefit us later. */ - KnownAssignedTransactionIdsIdleMaintenance(); - (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3983,9 +3979,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, streaming_reply_sent = true; } - /* Do any background tasks that might benefit us later. 
*/ - KnownAssignedTransactionIdsIdleMaintenance(); - /* Update pg_stat_recovery_prefetch before sleeping. */ XLogPrefetcherComputeStats(xlogprefetcher); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d..775e1a926d8 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -50,7 +50,7 @@ bool ignore_invalid_pages = false; bool InRecovery = false; /* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */ -HotStandbyState standbyState = STANDBY_DISABLED; +bool InHotStandby = false; /* * During XLOG replay, we may see XLOG records for incremental updates of diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 891637e3a44..f1307ed714c 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -181,6 +181,9 @@ static const char *const excludeDirContents[] = /* Contents zeroed on startup, see StartupSUBTRANS(). */ "pg_subtrans", + /* Contents zeroed on startup, see StartupCSNLog(). 
*/ + "pg_csn", + /* end of list */ NULL }; diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 27e86cf393f..d04286ab270 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -203,7 +203,7 @@ static void StartupProcExit(int code, Datum arg) { /* Shutdown the recovery environment */ - if (standbyState != STANDBY_DISABLED) + if (InHotStandby) ShutdownRecoveryTransactionEnvironment(); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 6a428e9720e..808b1d85379 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -270,14 +270,6 @@ xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) DecodeAbort(ctx, buf, &parsed, xid, two_phase); break; } - case XLOG_XACT_ASSIGNMENT: - - /* - * We assign subxact to the toplevel xact while processing each - * record if required. So, we don't need to do anything here. See - * LogicalDecodingProcessRecord. - */ - break; case XLOG_XACT_INVALIDATIONS: { TransactionId xid; diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3c94a62cdf6..97d278052df 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -27,7 +27,7 @@ * removed. This is achieved by using the replication slot mechanism. * * As the percentage of transactions modifying the catalog normally is fairly - * small in comparisons to ones only manipulating user data, we keep track of + * small in comparison to ones only manipulating user data, we keep track of * the committed catalog modifying ones inside [xmin, xmax) instead of keeping * track of all running transactions like it's done in a normal snapshot. 
Note * that we're generally only looking at transactions that have acquired an diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f..fc9804b2eab 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -16,6 +16,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csn_log.h" #include "access/multixact.h" #include "access/nbtree.h" #include "access/subtrans.h" @@ -122,6 +123,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, XLOGShmemSize()); size = add_size(size, XLogRecoveryShmemSize()); size = add_size(size, CLOGShmemSize()); + size = add_size(size, CSNLogShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); @@ -287,6 +289,7 @@ CreateOrAttachShmemStructs(void) XLogPrefetchShmemInit(); XLogRecoveryShmemInit(); CLOGShmemInit(); + CSNLogShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 819649741f6..3418ddf5304 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -19,20 +19,10 @@ * myProcLocks lists. They can be distinguished from regular backend PGPROCs * at need by checking for pid == 0. * - * During hot standby, we also keep a list of XIDs representing transactions - * that are known to be running on the primary (or more precisely, were running - * as of the current point in the WAL stream). This list is kept in the - * KnownAssignedXids array, and is updated by watching the sequence of - * arriving XIDs. This is necessary because if we leave those XIDs out of - * snapshots taken for standby queries, then they will appear to be already - * complete, leading to MVCC failures. 
Note that in hot standby, the PGPROC - * array represents standby processes, which by definition are not running - * transactions that have XIDs. - * - * It is perhaps possible for a backend on the primary to terminate without - * writing an abort record for its transaction. While that shouldn't really - * happen, it would tie up KnownAssignedXids indefinitely, so we protect - * ourselves by pruning the array when a valid list of running XIDs arrives. + * During hot standby, we don't have PGPROC entries representing transactions + * running in the primary. In snapshots taken during recovery, the snapshot + * contains a Commit-Sequence Number (CSN) which is used to determine which + * XIDs are still considered as running by the snapshot. * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -47,6 +37,7 @@ #include +#include "access/csn_log.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -74,22 +65,8 @@ typedef struct ProcArrayStruct int numProcs; /* number of valid procs entries */ int maxProcs; /* allocated size of procs array */ - /* - * Known assigned XIDs handling - */ - int maxKnownAssignedXids; /* allocated size of array */ - int numKnownAssignedXids; /* current # of valid entries */ - int tailKnownAssignedXids; /* index of oldest valid element */ - int headKnownAssignedXids; /* index of newest element, + 1 */ - - /* - * Highest subxid that has been removed from KnownAssignedXids array to - * prevent overflow; or InvalidTransactionId if none. We track this for - * similar reasons to tracking overflowing cached subxids in PGPROC - * entries. Must hold exclusive ProcArrayLock to change this, and shared - * lock to read it. 
- */ - TransactionId lastOverflowedXid; + /* In recovery, oldest XID that could be still running in primary */ + TransactionId oldest_running_primary_xid; /* oldest xmin of any replication slot */ TransactionId replication_slot_xmin; @@ -100,6 +77,21 @@ typedef struct ProcArrayStruct int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; } ProcArrayStruct; +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) + +/* + * TOTAL_MAX_CACHED_SUBXIDS is the total number of XIDs that fits in the proc + * array, as top XIDs and in the subxids caches. + * + * Local data structures are also created in various backends during + * GetSnapshotData(), TransactionIdIsInProgress() and + * GetRunningTransactionData(). All of the main structures created in those + * functions must be identically sized, since we may at times copy the whole + * of the data structures around. + */ +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + /* * State for the GlobalVisTest* family of functions. Those functions can * e.g. be used to decide if a deleted row can be removed without violating @@ -255,17 +247,6 @@ typedef enum GlobalVisHorizonKind VISHORIZON_TEMP, } GlobalVisHorizonKind; -/* - * Reason codes for KnownAssignedXidsCompress(). 
- */ -typedef enum KAXCompressReason -{ - KAX_NO_SPACE, /* need to free up space at array end */ - KAX_PRUNE, /* we just pruned old entries */ - KAX_TRANSACTION_END, /* we just committed/removed some XIDs */ - KAX_STARTUP_PROCESS_IDLE, /* startup process is about to sleep */ -} KAXCompressReason; - static ProcArrayStruct *procArray; @@ -277,19 +258,10 @@ static PGPROC *allProcs; static TransactionId cachedXidIsNotInProgress = InvalidTransactionId; /* - * Bookkeeping for tracking emulated transactions in recovery + * Bookkeeping for tracking transactions seen during recovery */ -static TransactionId *KnownAssignedXids; -static bool *KnownAssignedXidsValid; static TransactionId latestObservedXid = InvalidTransactionId; -/* - * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is - * the highest xid that might still be running that we don't have in - * KnownAssignedXids. - */ -static TransactionId standbySnapshotPendingXmin; - /* * State for visibility checks on different types of relations. See struct * GlobalVisState for details. 
As shared, catalog, normal and temporary @@ -316,7 +288,7 @@ static long xc_by_my_xact = 0; static long xc_by_latest_xid = 0; static long xc_by_main_xid = 0; static long xc_by_child_xid = 0; -static long xc_by_known_assigned = 0; +static long xc_during_recovery = 0; static long xc_no_overflow = 0; static long xc_slow_answer = 0; @@ -326,7 +298,7 @@ static long xc_slow_answer = 0; #define xc_by_latest_xid_inc() (xc_by_latest_xid++) #define xc_by_main_xid_inc() (xc_by_main_xid++) #define xc_by_child_xid_inc() (xc_by_child_xid++) -#define xc_by_known_assigned_inc() (xc_by_known_assigned++) +#define xc_during_recovery_inc() (xc_during_recovery++) #define xc_no_overflow_inc() (xc_no_overflow++) #define xc_slow_answer_inc() (xc_slow_answer++) @@ -339,28 +311,12 @@ static void DisplayXidCache(void); #define xc_by_latest_xid_inc() ((void) 0) #define xc_by_main_xid_inc() ((void) 0) #define xc_by_child_xid_inc() ((void) 0) -#define xc_by_known_assigned_inc() ((void) 0) +#define xc_during_recovery_inc() ((void) 0) #define xc_no_overflow_inc() ((void) 0) #define xc_slow_answer_inc() ((void) 0) #endif /* XIDCACHE_DEBUG */ -/* Primitives for KnownAssignedXids array handling for standby */ -static void KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock); -static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, - bool exclusive_lock); -static bool KnownAssignedXidsSearch(TransactionId xid, bool remove); -static bool KnownAssignedXidExists(TransactionId xid); -static void KnownAssignedXidsRemove(TransactionId xid); -static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, - TransactionId *subxids); -static void KnownAssignedXidsRemovePreceding(TransactionId removeXid); -static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax); -static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, - TransactionId *xmin, - TransactionId xmax); -static TransactionId KnownAssignedXidsGetOldestXmin(void); -static void 
KnownAssignedXidsDisplay(int trace_level); -static void KnownAssignedXidsReset(void); + static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); static void MaintainLatestCompletedXid(TransactionId latestXid); @@ -384,31 +340,6 @@ ProcArrayShmemSize(void) size = offsetof(ProcArrayStruct, pgprocnos); size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS)); - /* - * During Hot Standby processing we have a data structure called - * KnownAssignedXids, created in shared memory. Local data structures are - * also created in various backends during GetMVCCSnapshotData(), - * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the - * main structures created in those functions must be identically sized, - * since we may at times copy the whole of the data structures around. We - * refer to this size as TOTAL_MAX_CACHED_SUBXIDS. - * - * Ideally we'd only create this structure if we were actually doing hot - * standby in the current run, but we don't know that yet at the time - * shared memory is being set up. 
- */ -#define TOTAL_MAX_CACHED_SUBXIDS \ - ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) - - if (EnableHotStandby) - { - size = add_size(size, - mul_size(sizeof(TransactionId), - TOTAL_MAX_CACHED_SUBXIDS)); - size = add_size(size, - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); - } - return size; } @@ -435,31 +366,12 @@ ProcArrayShmemInit(void) */ procArray->numProcs = 0; procArray->maxProcs = PROCARRAY_MAXPROCS; - procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS; - procArray->numKnownAssignedXids = 0; - procArray->tailKnownAssignedXids = 0; - procArray->headKnownAssignedXids = 0; - procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; TransamVariables->xactCompletionCount = 1; } allProcs = ProcGlobal->allProcs; - - /* Create or attach to the KnownAssignedXids arrays too, if needed */ - if (EnableHotStandby) - { - KnownAssignedXids = (TransactionId *) - ShmemInitStruct("KnownAssignedXids", - mul_size(sizeof(TransactionId), - TOTAL_MAX_CACHED_SUBXIDS), - &found); - KnownAssignedXidsValid = (bool *) - ShmemInitStruct("KnownAssignedXidsValid", - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), - &found); - } } /* @@ -1023,355 +935,35 @@ MaintainLatestCompletedXidRecovery(TransactionId latestXid) void ProcArrayInitRecovery(TransactionId initializedUptoXID) { - Assert(standbyState == STANDBY_INITIALIZED); + Assert(InHotStandby); Assert(TransactionIdIsNormal(initializedUptoXID)); /* - * we set latestObservedXid to the xid SUBTRANS has been initialized up - * to, so we can extend it from that point onwards in - * RecordKnownAssignedTransactionIds, and when we get consistent in - * ProcArrayApplyRecoveryInfo(). + * we set latestObservedXid to the xid SUBTRANS and CSN log have been + * initialized up to, so we can extend it from that point onwards whenever + * we observe new XIDs. 
*/ latestObservedXid = initializedUptoXID; TransactionIdRetreat(latestObservedXid); } /* - * ProcArrayApplyRecoveryInfo -- apply recovery info about xids - * - * Takes us through 3 states: Initialized, Pending and Ready. - * Normal case is to go all the way to Ready straight away, though there - * are atypical cases where we need to take it in steps. - * - * Use the data about running transactions on the primary to create the initial - * state of KnownAssignedXids. We also use these records to regularly prune - * KnownAssignedXids because we know it is possible that some transactions - * with FATAL errors fail to write abort records, which could cause eventual - * overflow. - * - * See comments for LogStandbySnapshot(). + * Update oldest running XID from a checkpoint record. This allows truncating + * SUBTRANS and the CSN log. */ void -ProcArrayApplyRecoveryInfo(RunningTransactions running) +ProcArrayUpdateOldestRunningXid(TransactionId oldestRunningXID) { - TransactionId *xids; - TransactionId advanceNextXid; - int nxids; - int i; - - Assert(standbyState >= STANDBY_INITIALIZED); - Assert(TransactionIdIsValid(running->nextXid)); - Assert(TransactionIdIsValid(running->oldestRunningXid)); - Assert(TransactionIdIsNormal(running->latestCompletedXid)); - - /* - * Remove stale transactions, if any. - */ - ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); - - /* - * Adjust TransamVariables->nextXid before StandbyReleaseOldLocks(), - * because we will need it up to date for accessing two-phase transactions - * in StandbyReleaseOldLocks(). - */ - advanceNextXid = running->nextXid; - TransactionIdRetreat(advanceNextXid); - AdvanceNextFullTransactionIdPastXid(advanceNextXid); - Assert(FullTransactionIdIsValid(TransamVariables->nextXid)); - /* * Remove stale locks, if any. */ - StandbyReleaseOldLocks(running->oldestRunningXid); - - /* - * If our snapshot is already valid, nothing else to do... 
- */ - if (standbyState == STANDBY_SNAPSHOT_READY) - return; - - /* - * If our initial RunningTransactionsData had an overflowed snapshot then - * we knew we were missing some subxids from our snapshot. If we continue - * to see overflowed snapshots then we might never be able to start up, so - * we make another test to see if our snapshot is now valid. We know that - * the missing subxids are equal to or earlier than nextXid. After we - * initialise we continue to apply changes during recovery, so once the - * oldestRunningXid is later than the nextXid from the initial snapshot we - * know that we no longer have missing information and can mark the - * snapshot as valid. - */ - if (standbyState == STANDBY_SNAPSHOT_PENDING) - { - /* - * If the snapshot isn't overflowed or if its empty we can reset our - * pending state and use this snapshot instead. - */ - if (running->subxid_status != SUBXIDS_MISSING || running->xcnt == 0) - { - /* - * If we have already collected known assigned xids, we need to - * throw them away before we apply the recovery snapshot. - */ - KnownAssignedXidsReset(); - standbyState = STANDBY_INITIALIZED; - } - else - { - if (TransactionIdPrecedes(standbySnapshotPendingXmin, - running->oldestRunningXid)) - { - standbyState = STANDBY_SNAPSHOT_READY; - elog(DEBUG1, - "recovery snapshots are now enabled"); - } - else - elog(DEBUG1, - "recovery snapshot waiting for non-overflowed snapshot or " - "until oldest active xid on standby is at least %u (now %u)", - standbySnapshotPendingXmin, - running->oldestRunningXid); - return; - } - } - - Assert(standbyState == STANDBY_INITIALIZED); - - /* - * NB: this can be reached at least twice, so make sure new code can deal - * with that. - */ + StandbyReleaseOldLocks(oldestRunningXID); - /* - * Nobody else is running yet, but take locks anyhow - */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * KnownAssignedXids is sorted so we cannot just add the xids, we have to - * sort them first. 
- * - * Some of the new xids are top-level xids and some are subtransactions. - * We don't call SubTransSetParent because it doesn't matter yet. If we - * aren't overflowed then all xids will fit in snapshot and so we don't - * need subtrans. If we later overflow, an xid assignment record will add - * xids to subtrans. If RunningTransactionsData is overflowed then we - * don't have enough information to correctly update subtrans anyway. - */ - - /* - * Allocate a temporary array to avoid modifying the array passed as - * argument. - */ - xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); - - /* - * Add to the temp array any xids which have not already completed. - */ - nxids = 0; - for (i = 0; i < running->xcnt + running->subxcnt; i++) - { - TransactionId xid = running->xids[i]; - - /* - * The running-xacts snapshot can contain xids that were still visible - * in the procarray when the snapshot was taken, but were already - * WAL-logged as completed. They're not running anymore, so ignore - * them. - */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) - continue; - - xids[nxids++] = xid; - } - - if (nxids > 0) - { - if (procArray->numKnownAssignedXids != 0) - { - LWLockRelease(ProcArrayLock); - elog(ERROR, "KnownAssignedXids is not empty"); - } - - /* - * Sort the array so that we can add them safely into - * KnownAssignedXids. - * - * We have to sort them logically, because in KnownAssignedXidsAdd we - * call TransactionIdFollowsOrEquals and so on. But we know these XIDs - * come from RUNNING_XACTS, which means there are only normal XIDs - * from the same epoch, so this is safe. - */ - qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator); - - /* - * Add the sorted snapshot into KnownAssignedXids. The running-xacts - * snapshot may include duplicated xids because of prepared - * transactions, so ignore them. 
- */ - for (i = 0; i < nxids; i++) - { - if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i])) - { - elog(DEBUG1, - "found duplicated transaction %u for KnownAssignedXids insertion", - xids[i]); - continue; - } - KnownAssignedXidsAdd(xids[i], xids[i], true); - } - - KnownAssignedXidsDisplay(DEBUG3); - } - - pfree(xids); - - /* - * latestObservedXid is at least set to the point where SUBTRANS was - * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid - * RecordKnownAssignedTransactionIds() was called for. Initialize - * subtrans from thereon, up to nextXid - 1. - * - * We need to duplicate parts of RecordKnownAssignedTransactionId() here, - * because we've just added xids to the known assigned xids machinery that - * haven't gone through RecordKnownAssignedTransactionId(). - */ - Assert(TransactionIdIsNormal(latestObservedXid)); - TransactionIdAdvance(latestObservedXid); - while (TransactionIdPrecedes(latestObservedXid, running->nextXid)) - { - ExtendSUBTRANS(latestObservedXid); - TransactionIdAdvance(latestObservedXid); - } - TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */ - - /* ---------- - * Now we've got the running xids we need to set the global values that - * are used to track snapshots as they evolve further. - * - * - latestCompletedXid which will be the xmax for snapshots - * - lastOverflowedXid which shows whether snapshots overflow - * - nextXid - * - * If the snapshot overflowed, then we still initialise with what we know, - * but the recovery snapshot isn't fully valid yet because we know there - * are some subxids missing. We don't know the specific subxids that are - * missing, so conservatively assume the last one is latestObservedXid. 
- * ---------- - */ - if (running->subxid_status == SUBXIDS_MISSING) - { - standbyState = STANDBY_SNAPSHOT_PENDING; - - standbySnapshotPendingXmin = latestObservedXid; - procArray->lastOverflowedXid = latestObservedXid; - } - else - { - standbyState = STANDBY_SNAPSHOT_READY; - - standbySnapshotPendingXmin = InvalidTransactionId; - - /* - * If the 'xids' array didn't include all subtransactions, we have to - * mark any snapshots taken as overflowed. - */ - if (running->subxid_status == SUBXIDS_IN_SUBTRANS) - procArray->lastOverflowedXid = latestObservedXid; - else - { - Assert(running->subxid_status == SUBXIDS_IN_ARRAY); - procArray->lastOverflowedXid = InvalidTransactionId; - } - } - - /* - * If a transaction wrote a commit record in the gap between taking and - * logging the snapshot then latestCompletedXid may already be higher than - * the value from the snapshot, so check before we use the incoming value. - * It also might not yet be set at all. - */ - MaintainLatestCompletedXidRecovery(running->latestCompletedXid); - - /* - * NB: No need to increment TransamVariables->xactCompletionCount here, - * nobody can see it yet. - */ - + procArray->oldest_running_primary_xid = oldestRunningXID; LWLockRelease(ProcArrayLock); - - KnownAssignedXidsDisplay(DEBUG3); - if (standbyState == STANDBY_SNAPSHOT_READY) - elog(DEBUG1, "recovery snapshots are now enabled"); - else - elog(DEBUG1, - "recovery snapshot waiting for non-overflowed snapshot or " - "until oldest active xid on standby is at least %u (now %u)", - standbySnapshotPendingXmin, - running->oldestRunningXid); } -/* - * ProcArrayApplyXidAssignment - * Process an XLOG_XACT_ASSIGNMENT WAL record - */ -void -ProcArrayApplyXidAssignment(TransactionId topxid, - int nsubxids, TransactionId *subxids) -{ - TransactionId max_xid; - int i; - - Assert(standbyState >= STANDBY_INITIALIZED); - - max_xid = TransactionIdLatest(topxid, nsubxids, subxids); - - /* - * Mark all the subtransactions as observed. 
- * - * NOTE: This will fail if the subxid contains too many previously - * unobserved xids to fit into known-assigned-xids. That shouldn't happen - * as the code stands, because xid-assignment records should never contain - * more than PGPROC_MAX_CACHED_SUBXIDS entries. - */ - RecordKnownAssignedTransactionIds(max_xid); - - /* - * Notice that we update pg_subtrans with the top-level xid, rather than - * the parent xid. This is a difference between normal processing and - * recovery, yet is still correct in all cases. The reason is that - * subtransaction commit is not marked in clog until commit processing, so - * all aborted subtransactions have already been clearly marked in clog. - * As a result we are able to refer directly to the top-level - * transaction's state rather than skipping through all the intermediate - * states in the subtransaction tree. This should be the first time we - * have attempted to SubTransSetParent(). - */ - for (i = 0; i < nsubxids; i++) - SubTransSetParent(subxids[i], topxid); - - /* KnownAssignedXids isn't maintained yet, so we're done for now */ - if (standbyState == STANDBY_INITIALIZED) - return; - - /* - * Uses same locking as transaction commit - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * Remove subxids from known-assigned-xacts. - */ - KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids); - - /* - * Advance lastOverflowedXid to be at least the last of these subxids. - */ - if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid)) - procArray->lastOverflowedXid = max_xid; - - LWLockRelease(ProcArrayLock); -} /* * TransactionIdIsInProgress -- is given transaction running in some backend @@ -1379,23 +971,24 @@ ProcArrayApplyXidAssignment(TransactionId topxid, * Aside from some shortcuts such as checking RecentXmin and our own Xid, * there are four possibilities for finding a running transaction: * - * 1. The given Xid is a main transaction Id. We will find this out cheaply + * 1. 
In Hot Standby mode, there are no transactions with XIDs active in the + * standby. Check pg_xact to see if the transaction is known to have committed + * or aborted, otherwise it's considered as running. + * + * 2. The given Xid is a main transaction Id. We will find this out cheaply * by looking at ProcGlobal->xids. * - * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. + * 3. The given Xid is one of the cached subxact Xids in the PGPROC array. * We can find this out cheaply too. * - * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see - * if the Xid is running on the primary. - * * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see - * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * if that is running according to ProcGlobal->xids[]. * This is the slowest way, but sadly it has to be done always if the others * failed, unless we see that the cached subxact sets are complete (none have * overflowed). * - * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids - * while doing 1 and 3, we can release the ProcArrayLock while we do 4. + * ProcArrayLock has to be held while we do 2 and 3. If we save the top Xids + * while doing 2 and 3, we can release the ProcArrayLock while we do 4. * This buys back some concurrency (and we can't retrieve the main Xids from * ProcGlobal->xids[] again anyway; see GetNewTransactionId). */ @@ -1436,6 +1029,28 @@ TransactionIdIsInProgress(TransactionId xid) return false; } + /* + * In hot standby mode, check pg_xact. + * + * With normal non-CSN snapshots, you must be careful to check + * TransactionIdIsInProgress() before checking pg_xact, because a + * transaction is marked as committed before it's removed from PGPROC. But + * during recovery, we now use CSN snapshots so I think that's OK. See the + * "NOTE" at the top of heapam_visibility.c. 
+ * + * During recovery, the XID cannot be our own transaction, and the CSN + * check handles subtransactions too, so we can skip the rest of the + * function. + */ + if (RecoveryInProgress()) + { + xc_during_recovery_inc(); + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + return false; + else + return true; + } + /* * Also, we can handle our own transaction (and subtransactions) without * any access to shared memory. @@ -1452,12 +1067,7 @@ TransactionIdIsInProgress(TransactionId xid) */ if (xids == NULL) { - /* - * In hot standby mode, reserve enough space to hold all xids in the - * known-assigned list. If we later finish recovery, we no longer need - * the bigger array, but we don't bother to shrink it. - */ - int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs; + int maxxids = arrayP->maxProcs; xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId)); if (xids == NULL) @@ -1552,33 +1162,6 @@ TransactionIdIsInProgress(TransactionId xid) xids[nxids++] = pxid; } - /* - * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs - * in the list must be treated as running. - */ - if (RecoveryInProgress()) - { - /* none of the PGPROC entries should have XIDs in hot standby mode */ - Assert(nxids == 0); - - if (KnownAssignedXidExists(xid)) - { - LWLockRelease(ProcArrayLock); - xc_by_known_assigned_inc(); - return true; - } - - /* - * If the KnownAssignedXids overflowed, we have to check pg_subtrans - * too. Fetch all xids from KnownAssignedXids that are lower than - * xid, since if xid is a subtransaction its parent will always have a - * lower value. Note we will collect both main and subXIDs here, but - * there's no help for it. - */ - if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid)) - nxids = KnownAssignedXidsGet(xids, xid); - } - LWLockRelease(ProcArrayLock); /* @@ -1852,8 +1435,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) * can't be tied to a specific database.) 
* * Also, while in recovery we cannot compute an accurate per-database - * horizon, as all xids are managed via the KnownAssignedXids - * machinery. + * horizon, as all xids are managed via the CSN log machinery. */ if (proc->databaseId == MyDatabaseId || MyDatabaseId == InvalidOid || @@ -1866,11 +1448,14 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) } /* - * If in recovery fetch oldest xid in KnownAssignedXids, will be applied - * after lock is released. + * If in recovery fetch oldest xid from last checkpoint. + * + * XXX: that can be much older than what we had previously with the + * known-assigned-xids machinery. I think that's OK, given what this + * function is used for during recovery? */ if (in_recovery) - kaxmin = KnownAssignedXidsGetOldestXmin(); + kaxmin = procArray->oldest_running_primary_xid; /* * No other information from shared state is needed, release the lock @@ -2181,7 +1766,7 @@ GetMVCCSnapshotData(void) TransactionId myxid; uint64 curXactCompletionCount; MVCCSnapshotShared snapshot; - + XLogRecPtr csn = InvalidXLogRecPtr; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -2355,27 +1940,8 @@ GetMVCCSnapshotData(void) else { /* - * We're in hot standby, so get XIDs from KnownAssignedXids. - * - * We store all xids directly into subxip[]. Here's why: - * - * In recovery we don't know which xids are top-level and which are - * subxacts, a design choice that greatly simplifies xid processing. - * - * It seems like we would want to try to put xids into xip[] only, but - * that is fairly small. We would either need to make that bigger or - * to increase the rate at which we WAL-log xid assignment; neither is - * an appealing choice. - * - * We could try to store xids into xip[] first and then into subxip[] - * if there are too many xids. That only works if the snapshot doesn't - * overflow because we do not search subxip[] in that case. 
A simpler - * way is to just store all xids in the subxip array because this is - * by far the bigger array. We just leave the xip array empty. - * - * Either way we need to change the way XidInMVCCSnapshot() works - * depending upon when the snapshot was taken, or change normal - * snapshot processing so it matches. + * We're in hot standby, so get the current CSN. That's used to + * determine which transactions committed before this snapshot. * * Note: It is possible for recovery to end before we finish taking * the snapshot, and for newly assigned transaction ids to be added to @@ -2383,14 +1949,17 @@ GetMVCCSnapshotData(void) * those newly added transaction ids would be filtered away, so we * need not be concerned about them. */ - subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin, - xmax); + xmin = procArray->oldest_running_primary_xid; - if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid)) - suboverflowed = true; + /* + * Take CSN under ProcArrayLock so the snapshot stays synchronized. + * (XXX: not sure that's strictly required.) This is what determines + * which transactions we consider finished and which are still in + * progress. + */ + csn = TransamVariables->latestCommitLSN; } - /* * Fetch into local variable while ProcArrayLock is held - the * LWLockRelease below is a barrier, ensuring this happens inside the @@ -2507,6 +2076,8 @@ GetMVCCSnapshotData(void) latestSnapshotShared = snapshot; } + snapshot->snapshotCsn = csn; + return snapshot; } @@ -2662,9 +2233,6 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * The returned data structure is statically allocated; caller should not * modify it, and must not assume it is valid past the next call. * - * This is never executed during recovery so there is no need to look at - * KnownAssignedXids. 
- * * Dummy PGPROCs from prepared transaction are included, meaning that this * may return entries with duplicated TransactionId values coming from * transaction finishing to prepare. Nothing is done about duplicated @@ -2695,6 +2263,7 @@ GetRunningTransactionData(void) int subcount; bool suboverflowed; + /* This is never executed during recovery */ Assert(!RecoveryInProgress()); /* @@ -2861,15 +2430,16 @@ GetRunningTransactionData(void) * We look at all databases, though there is no need to include WALSender * since this has no effect on hot standby conflicts. * - * This is never executed during recovery so there is no need to look at - * KnownAssignedXids. + * If allDbs is false, skip processes attached to other databases. + * + * This is never executed during recovery. * * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetMVCCSnapshotData() as the primary code for * that bookkeeping. */ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2890,11 +2460,13 @@ GetOldestActiveTransactionId(void) LWLockRelease(XidGenLock); /* - * Spin over procArray collecting all xids and subxids. + * Spin over procArray checking each xid. 
*/ LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ @@ -2903,6 +2475,9 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; @@ -2981,8 +2556,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * * In recovery we can't lower the safe value besides what we've computed * above, so we'll have to wait a bit longer there. We unfortunately can - * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids - * machinery can miss values and return an older value than is safe. + * *not* use oldest_running_primary_xid since the XID tracking machinery + * can miss values and return an older value than is safe. */ if (!recovery_in_progress) { @@ -3400,6 +2975,9 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, * but that would not be true in the case of FATAL errors lagging in array, * but we already know those are bogus anyway, so we skip that test. * + * XXX: KnownAssignedXids is gone so the above comment needs updating. Is + * the code still correct? I think so but need to double-check. + * * If dbOid is valid we skip backends attached to other databases. * * Be careful to *not* pfree the result from this function. 
We reuse @@ -4071,14 +3649,14 @@ static void DisplayXidCache(void) { fprintf(stderr, - "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n", + "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, during_recovery: %ld, nooflo: %ld, slow: %ld\n", xc_by_recent_xmin, xc_by_known_xact, xc_by_my_xact, xc_by_latest_xid, xc_by_main_xid, xc_by_child_xid, - xc_by_known_assigned, + xc_during_recovery, xc_no_overflow, xc_slow_answer); } @@ -4325,61 +3903,6 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid) } -/* ---------------------------------------------- - * KnownAssignedTransactionIds sub-module - * ---------------------------------------------- - */ - -/* - * In Hot Standby mode, we maintain a list of transactions that are (or were) - * running on the primary at the current point in WAL. These XIDs must be - * treated as running by standby transactions, even though they are not in - * the standby server's PGPROC array. - * - * We record all XIDs that we know have been assigned. That includes all the - * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have - * been assigned. We can deduce the existence of unobserved XIDs because we - * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids - * list expands as new XIDs are observed or inferred, and contracts when - * transaction completion records arrive. - * - * During hot standby we do not fret too much about the distinction between - * top-level XIDs and subtransaction XIDs. We store both together in the - * KnownAssignedXids list. In backends, this is copied into snapshots in - * GetMVCCSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot() - * doesn't care about the distinction either. 
Subtransaction XIDs are - * effectively treated as top-level XIDs and in the typical case pg_subtrans - * links are *not* maintained (which does not affect visibility). - * - * We have room in KnownAssignedXids and in snapshots to hold maxProcs * - * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must - * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at - * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these - * records, we mark the subXIDs as children of the top XID in pg_subtrans, - * and then remove them from KnownAssignedXids. This prevents overflow of - * KnownAssignedXids and snapshots, at the cost that status checks for these - * subXIDs will take a slower path through TransactionIdIsInProgress(). - * This means that KnownAssignedXids is not necessarily complete for subXIDs, - * though it should be complete for top-level XIDs; this is the same situation - * that holds with respect to the PGPROC entries in normal running. - * - * When we throw away subXIDs from KnownAssignedXids, we need to keep track of - * that, similarly to tracking overflow of a PGPROC's subxids array. We do - * that by remembering the lastOverflowedXid, ie the last thrown-away subXID. - * As long as that is within the range of interesting XIDs, we have to assume - * that subXIDs are missing from snapshots. (Note that subXID overflow occurs - * on primary when 65th subXID arrives, whereas on standby it occurs when 64th - * subXID arrives - that is not an error.) - * - * Should a backend on primary somehow disappear before it can write an abort - * record, then we just leave those XIDs in KnownAssignedXids. They actually - * aborted but we think they were running; the distinction is irrelevant - * because either way any changes done by the transaction are not visible to - * backends in the standby. We prune KnownAssignedXids when - * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the - * array due to such dead XIDs. 
- */ - /* * RecordKnownAssignedTransactionIds * Record the given XID in KnownAssignedXids, as well as any preceding @@ -4394,7 +3917,7 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid) void RecordKnownAssignedTransactionIds(TransactionId xid) { - Assert(standbyState >= STANDBY_INITIALIZED); + Assert(InHotStandby); Assert(TransactionIdIsValid(xid)); Assert(TransactionIdIsValid(latestObservedXid)); @@ -4412,38 +3935,19 @@ RecordKnownAssignedTransactionIds(TransactionId xid) /* * Extend subtrans like we do in GetNewTransactionId() during normal - * operation using individual extend steps. Note that we do not need - * to extend clog since its extensions are WAL logged. - * - * This part has to be done regardless of standbyState since we - * immediately start assigning subtransactions to their toplevel - * transactions. + * operation using individual extend steps. And CSN log, too. Note + * that we do not need to extend clog since its extensions are WAL + * logged. */ next_expected_xid = latestObservedXid; while (TransactionIdPrecedes(next_expected_xid, xid)) { TransactionIdAdvance(next_expected_xid); ExtendSUBTRANS(next_expected_xid); + ExtendCSNLog(next_expected_xid); } Assert(next_expected_xid == xid); - /* - * If the KnownAssignedXids machinery isn't up yet, there's nothing - * more to do since we don't track assigned xids yet. - */ - if (standbyState <= STANDBY_INITIALIZED) - { - latestObservedXid = xid; - return; - } - - /* - * Add (latestObservedXid, xid] onto the KnownAssignedXids array. - */ - next_expected_xid = latestObservedXid; - TransactionIdAdvance(next_expected_xid); - KnownAssignedXidsAdd(next_expected_xid, xid, false); - /* * Now we can advance latestObservedXid */ @@ -4455,805 +3959,61 @@ RecordKnownAssignedTransactionIds(TransactionId xid) } /* - * ExpireTreeKnownAssignedTransactionIds - * Remove the given XIDs from KnownAssignedXids. 
+ * ProcArrayRecoveryEndTransaction + * + * Called during recovery in analogy with and in place of + * ProcArrayEndTransaction(). The transaction becomes visible to any new + * snapshots taken after this. 'max_xid' is the highest (sub)XID of the + * committed transaction, and 'lsn' is LSN of the commit record. * - * Called during recovery in analogy with and in place of ProcArrayEndTransaction() + * The transaction and all its subtransactions have been already marked as + * committed in the CLOG and in the CSNLOG. */ void -ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, - TransactionId *subxids, TransactionId max_xid) +ProcArrayRecoveryEndTransaction(TransactionId max_xid, XLogRecPtr lsn) { - Assert(standbyState >= STANDBY_INITIALIZED); + TransactionId oldest_running_primary_xid; + + Assert(InHotStandby); + + /* + * If this was the oldest XID that was still running, advance it. This is + * important for advancing the global xmin, which avoids unnecessary + * recovery conflicts + * + * No locking required because this runs in the startup process. + * + * XXX: the caller actually has a list of XIDs that just committed. We + * could save some clog lookups by taking advantage of that list. + */ + oldest_running_primary_xid = procArray->oldest_running_primary_xid; + while (oldest_running_primary_xid < max_xid) + { + if (!TransactionIdDidCommit(oldest_running_primary_xid) && + !TransactionIdDidAbort(oldest_running_primary_xid)) + { + break; + } + TransactionIdAdvance(oldest_running_primary_xid); + } + if (max_xid == oldest_running_primary_xid) + TransactionIdAdvance(oldest_running_primary_xid); /* * Uses same locking as transaction commit */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); - /* As in ProcArrayEndTransaction, advance latestCompletedXid */ MaintainLatestCompletedXidRecovery(max_xid); /* ... 
and xactCompletionCount */ TransamVariables->xactCompletionCount++; - LWLockRelease(ProcArrayLock); -} - -/* - * ExpireAllKnownAssignedTransactionIds - * Remove all entries in KnownAssignedXids and reset lastOverflowedXid. - */ -void -ExpireAllKnownAssignedTransactionIds(void) -{ - FullTransactionId latestXid; - - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - KnownAssignedXidsRemovePreceding(InvalidTransactionId); - - /* Reset latestCompletedXid to nextXid - 1 */ - Assert(FullTransactionIdIsValid(TransamVariables->nextXid)); - latestXid = TransamVariables->nextXid; - FullTransactionIdRetreat(&latestXid); - TransamVariables->latestCompletedXid = latestXid; - - /* - * Any transactions that were in-progress were effectively aborted, so - * advance xactCompletionCount. - */ - TransamVariables->xactCompletionCount++; - - /* - * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after - * the call of this function. But do this for unification with what - * ExpireOldKnownAssignedTransactionIds() do. - */ - procArray->lastOverflowedXid = InvalidTransactionId; - LWLockRelease(ProcArrayLock); -} - -/* - * ExpireOldKnownAssignedTransactionIds - * Remove KnownAssignedXids entries preceding the given XID and - * potentially reset lastOverflowedXid. - */ -void -ExpireOldKnownAssignedTransactionIds(TransactionId xid) -{ - TransactionId latestXid; - - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* As in ProcArrayEndTransaction, advance latestCompletedXid */ - latestXid = xid; - TransactionIdRetreat(latestXid); - MaintainLatestCompletedXidRecovery(latestXid); - - /* ... and xactCompletionCount */ - TransamVariables->xactCompletionCount++; - - /* - * Reset lastOverflowedXid if we know all transactions that have been - * possibly running are being gone. Not doing so could cause an incorrect - * lastOverflowedXid value, which makes extra snapshots be marked as - * suboverflowed. 
- */ - if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid)) - procArray->lastOverflowedXid = InvalidTransactionId; - KnownAssignedXidsRemovePreceding(xid); - LWLockRelease(ProcArrayLock); -} - -/* - * KnownAssignedTransactionIdsIdleMaintenance - * Opportunistically do maintenance work when the startup process - * is about to go idle. - */ -void -KnownAssignedTransactionIdsIdleMaintenance(void) -{ - KnownAssignedXidsCompress(KAX_STARTUP_PROCESS_IDLE, false); -} - - -/* - * Private module functions to manipulate KnownAssignedXids - * - * There are 5 main uses of the KnownAssignedXids data structure: - * - * * backends taking snapshots - all valid XIDs need to be copied out - * * backends seeking to determine presence of a specific XID - * * startup process adding new known-assigned XIDs - * * startup process removing specific XIDs as transactions end - * * startup process pruning array when special WAL records arrive - * - * This data structure is known to be a hot spot during Hot Standby, so we - * go to some lengths to make these operations as efficient and as concurrent - * as possible. - * - * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes - * order, to be exact --- to allow binary search for specific XIDs. Note: - * in general TransactionIdPrecedes would not provide a total order, but - * we know that the entries present at any instant should not extend across - * a large enough fraction of XID space to wrap around (the primary would - * shut down for fear of XID wrap long before that happens). So it's OK to - * use TransactionIdPrecedes as a binary-search comparator. - * - * It's cheap to maintain the sortedness during insertions, since new known - * XIDs are always reported in XID order; we just append them at the right. - * - * To keep individual deletions cheap, we need to allow gaps in the array. 
- * This is implemented by marking array elements as valid or invalid using - * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done - * by setting KnownAssignedXidsValid[i] to false, *without* clearing the - * XID entry itself. This preserves the property that the XID entries are - * sorted, so we can do binary searches easily. Periodically we compress - * out the unused entries; that's much cheaper than having to compress the - * array immediately on every deletion. - * - * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[] - * are those with indexes tail <= i < head; items outside this subscript range - * have unspecified contents. When head reaches the end of the array, we - * force compression of unused entries rather than wrapping around, since - * allowing wraparound would greatly complicate the search logic. We maintain - * an explicit tail pointer so that pruning of old XIDs can be done without - * immediately moving the array contents. In most cases only a small fraction - * of the array contains valid entries at any instant. - * - * Although only the startup process can ever change the KnownAssignedXids - * data structure, we still need interlocking so that standby backends will - * not observe invalid intermediate states. The convention is that backends - * must hold shared ProcArrayLock to examine the array. To remove XIDs from - * the array, the startup process must hold ProcArrayLock exclusively, for - * the usual transactional reasons (compare commit/abort of a transaction - * during normal running). Compressing unused entries out of the array - * likewise requires exclusive lock. To add XIDs to the array, we just insert - * them into slots to the right of the head pointer and then advance the head - * pointer. 
This doesn't require any lock at all, but on machines with weak - * memory ordering, we need to be careful that other processors see the array - * element changes before they see the head pointer change. We handle this by - * using memory barriers when reading or writing the head/tail pointers (unless - * the caller holds ProcArrayLock exclusively). - * - * Algorithmic analysis: - * - * If we have a maximum of M slots, with N XIDs currently spread across - * S elements then we have N <= S <= M always. - * - * * Adding a new XID is O(1) and needs no lock (unless compression must - * happen) - * * Compressing the array is O(S) and requires exclusive lock - * * Removing an XID is O(logS) and requires exclusive lock - * * Taking a snapshot is O(S) and requires shared lock - * * Checking for an XID is O(logS) and requires shared lock - * - * In comparison, using a hash table for KnownAssignedXids would mean that - * taking snapshots would be O(M). If we can maintain S << M then the - * sorted array technique will deliver significantly faster snapshots. - * If we try to keep S too small then we will spend too much time compressing, - * so there is an optimal point for any workload mix. We use a heuristic to - * decide when to compress the array, though trimming also helps reduce - * frequency of compressing. The heuristic requires us to track the number of - * currently valid XIDs in the array (N). Except in special cases, we'll - * compress when S >= 2N. Bounding S at 2N in turn bounds the time for - * taking a snapshot to be O(N), which it would have to be anyway. - */ - - -/* - * Compress KnownAssignedXids by shifting valid data down to the start of the - * array, removing any gaps. - * - * A compression step is forced if "reason" is KAX_NO_SPACE, otherwise - * we do it only if a heuristic indicates it's a good time to do it. - * - * Compression requires holding ProcArrayLock in exclusive mode. - * Caller must pass haveLock = true if it already holds the lock. 
- */ -static void -KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock) -{ - ProcArrayStruct *pArray = procArray; - int head, - tail, - nelements; - int compress_index; - int i; - - /* Counters for compression heuristics */ - static unsigned int transactionEndsCounter; - static TimestampTz lastCompressTs; - - /* Tuning constants */ -#define KAX_COMPRESS_FREQUENCY 128 /* in transactions */ -#define KAX_COMPRESS_IDLE_INTERVAL 1000 /* in ms */ - - /* - * Since only the startup process modifies the head/tail pointers, we - * don't need a lock to read them here. - */ - head = pArray->headKnownAssignedXids; - tail = pArray->tailKnownAssignedXids; - nelements = head - tail; - - /* - * If we can choose whether to compress, use a heuristic to avoid - * compressing too often or not often enough. "Compress" here simply - * means moving the values to the beginning of the array, so it is not as - * complex or costly as typical data compression algorithms. - */ - if (nelements == pArray->numKnownAssignedXids) - { - /* - * When there are no gaps between head and tail, don't bother to - * compress, except in the KAX_NO_SPACE case where we must compress to - * create some space after the head. - */ - if (reason != KAX_NO_SPACE) - return; - } - else if (reason == KAX_TRANSACTION_END) - { - /* - * Consider compressing only once every so many commits. Frequency - * determined by benchmarks. - */ - if ((transactionEndsCounter++) % KAX_COMPRESS_FREQUENCY != 0) - return; - - /* - * Furthermore, compress only if the used part of the array is less - * than 50% full (see comments above). - */ - if (nelements < 2 * pArray->numKnownAssignedXids) - return; - } - else if (reason == KAX_STARTUP_PROCESS_IDLE) - { - /* - * We're about to go idle for lack of new WAL, so we might as well - * compress. But not too often, to avoid ProcArray lock contention - * with readers. 
- */ - if (lastCompressTs != 0) - { - TimestampTz compress_after; - - compress_after = TimestampTzPlusMilliseconds(lastCompressTs, - KAX_COMPRESS_IDLE_INTERVAL); - if (GetCurrentTimestamp() < compress_after) - return; - } - } - - /* Need to compress, so get the lock if we don't have it. */ - if (!haveLock) - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * We compress the array by reading the valid values from tail to head, - * re-aligning data to 0th element. - */ - compress_index = 0; - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - KnownAssignedXids[compress_index] = KnownAssignedXids[i]; - KnownAssignedXidsValid[compress_index] = true; - compress_index++; - } - } - Assert(compress_index == pArray->numKnownAssignedXids); - - pArray->tailKnownAssignedXids = 0; - pArray->headKnownAssignedXids = compress_index; - - if (!haveLock) - LWLockRelease(ProcArrayLock); - - /* Update timestamp for maintenance. No need to hold lock for this. */ - lastCompressTs = GetCurrentTimestamp(); -} - -/* - * Add xids into KnownAssignedXids at the head of the array. - * - * xids from from_xid to to_xid, inclusive, are added to the array. - * - * If exclusive_lock is true then caller already holds ProcArrayLock in - * exclusive mode, so we need no extra locking here. Else caller holds no - * lock, so we need to be sure we maintain sufficient interlocks against - * concurrent readers. (Only the startup process ever calls this, so no need - * to worry about concurrent writers.) - */ -static void -KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, - bool exclusive_lock) -{ - ProcArrayStruct *pArray = procArray; - TransactionId next_xid; - int head, - tail; - int nxids; - int i; - - Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid)); - - /* - * Calculate how many array slots we'll need. Normally this is cheap; in - * the unusual case where the XIDs cross the wrap point, we do it the hard - * way. 
- */ - if (to_xid >= from_xid) - nxids = to_xid - from_xid + 1; - else - { - nxids = 1; - next_xid = from_xid; - while (TransactionIdPrecedes(next_xid, to_xid)) - { - nxids++; - TransactionIdAdvance(next_xid); - } - } - - /* - * Since only the startup process modifies the head/tail pointers, we - * don't need a lock to read them here. - */ - head = pArray->headKnownAssignedXids; - tail = pArray->tailKnownAssignedXids; - - Assert(head >= 0 && head <= pArray->maxKnownAssignedXids); - Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids); - - /* - * Verify that insertions occur in TransactionId sequence. Note that even - * if the last existing element is marked invalid, it must still have a - * correctly sequenced XID value. - */ - if (head > tail && - TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid)) - { - KnownAssignedXidsDisplay(LOG); - elog(ERROR, "out-of-order XID insertion in KnownAssignedXids"); - } - - /* - * If our xids won't fit in the remaining space, compress out free space - */ - if (head + nxids > pArray->maxKnownAssignedXids) - { - KnownAssignedXidsCompress(KAX_NO_SPACE, exclusive_lock); - - head = pArray->headKnownAssignedXids; - /* note: we no longer care about the tail pointer */ - - /* - * If it still won't fit then we're out of memory - */ - if (head + nxids > pArray->maxKnownAssignedXids) - elog(ERROR, "too many KnownAssignedXids"); - } - - /* Now we can insert the xids into the space starting at head */ - next_xid = from_xid; - for (i = 0; i < nxids; i++) - { - KnownAssignedXids[head] = next_xid; - KnownAssignedXidsValid[head] = true; - TransactionIdAdvance(next_xid); - head++; - } - - /* Adjust count of number of valid entries */ - pArray->numKnownAssignedXids += nxids; - - /* - * Now update the head pointer. We use a write barrier to ensure that - * other processors see the above array updates before they see the head - * pointer change. The barrier isn't required if we're holding - * ProcArrayLock exclusively. 
- */ - if (!exclusive_lock) - pg_write_barrier(); - - pArray->headKnownAssignedXids = head; -} - -/* - * KnownAssignedXidsSearch - * - * Searches KnownAssignedXids for a specific xid and optionally removes it. - * Returns true if it was found, false if not. - * - * Caller must hold ProcArrayLock in shared or exclusive mode. - * Exclusive lock must be held for remove = true. - */ -static bool -KnownAssignedXidsSearch(TransactionId xid, bool remove) -{ - ProcArrayStruct *pArray = procArray; - int first, - last; - int head; - int tail; - int result_index = -1; - - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - - /* - * Only the startup process removes entries, so we don't need the read - * barrier in that case. - */ - if (!remove) - pg_read_barrier(); /* pairs with KnownAssignedXidsAdd */ - - /* - * Standard binary search. Note we can ignore the KnownAssignedXidsValid - * array here, since even invalid entries will contain sorted XIDs. - */ - first = tail; - last = head - 1; - while (first <= last) - { - int mid_index; - TransactionId mid_xid; - - mid_index = (first + last) / 2; - mid_xid = KnownAssignedXids[mid_index]; - - if (xid == mid_xid) - { - result_index = mid_index; - break; - } - else if (TransactionIdPrecedes(xid, mid_xid)) - last = mid_index - 1; - else - first = mid_index + 1; - } - - if (result_index < 0) - return false; /* not in array */ - - if (!KnownAssignedXidsValid[result_index]) - return false; /* in array, but invalid */ - - if (remove) - { - KnownAssignedXidsValid[result_index] = false; - - pArray->numKnownAssignedXids--; - Assert(pArray->numKnownAssignedXids >= 0); - - /* - * If we're removing the tail element then advance tail pointer over - * any invalid elements. This will speed future searches. 
- */ - if (result_index == tail) - { - tail++; - while (tail < head && !KnownAssignedXidsValid[tail]) - tail++; - if (tail >= head) - { - /* Array is empty, so we can reset both pointers */ - pArray->headKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - } - else - { - pArray->tailKnownAssignedXids = tail; - } - } - } - - return true; -} - -/* - * Is the specified XID present in KnownAssignedXids[]? - * - * Caller must hold ProcArrayLock in shared or exclusive mode. - */ -static bool -KnownAssignedXidExists(TransactionId xid) -{ - Assert(TransactionIdIsValid(xid)); - - return KnownAssignedXidsSearch(xid, false); -} - -/* - * Remove the specified XID from KnownAssignedXids[]. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsRemove(TransactionId xid) -{ - Assert(TransactionIdIsValid(xid)); - - elog(DEBUG4, "remove KnownAssignedXid %u", xid); - - /* - * Note: we cannot consider it an error to remove an XID that's not - * present. We intentionally remove subxact IDs while processing - * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be - * removed again when the top-level xact commits or aborts. - * - * It might be possible to track such XIDs to distinguish this case from - * actual errors, but it would be complicated and probably not worth it. - * So, just ignore the search result. - */ - (void) KnownAssignedXidsSearch(xid, true); -} - -/* - * KnownAssignedXidsRemoveTree - * Remove xid (if it's not InvalidTransactionId) and all the subxids. - * - * Caller must hold ProcArrayLock in exclusive mode. 
- */ -static void -KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, - TransactionId *subxids) -{ - int i; - - if (TransactionIdIsValid(xid)) - KnownAssignedXidsRemove(xid); - - for (i = 0; i < nsubxids; i++) - KnownAssignedXidsRemove(subxids[i]); - - /* Opportunistically compress the array */ - KnownAssignedXidsCompress(KAX_TRANSACTION_END, true); -} - -/* - * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid - * then clear the whole table. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsRemovePreceding(TransactionId removeXid) -{ - ProcArrayStruct *pArray = procArray; - int count = 0; - int head, - tail, - i; - - if (!TransactionIdIsValid(removeXid)) - { - elog(DEBUG4, "removing all KnownAssignedXids"); - pArray->numKnownAssignedXids = 0; - pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0; - return; - } - - elog(DEBUG4, "prune KnownAssignedXids to %u", removeXid); - - /* - * Mark entries invalid starting at the tail. Since array is sorted, we - * can stop as soon as we reach an entry >= removeXid. - */ - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; - - if (TransactionIdFollowsOrEquals(knownXid, removeXid)) - break; - - if (!StandbyTransactionIdIsPrepared(knownXid)) - { - KnownAssignedXidsValid[i] = false; - count++; - } - } - } - - pArray->numKnownAssignedXids -= count; - Assert(pArray->numKnownAssignedXids >= 0); - - /* - * Advance the tail pointer if we've marked the tail item invalid. 
- */ - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - break; - } - if (i >= head) - { - /* Array is empty, so we can reset both pointers */ - pArray->headKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - } - else - { - pArray->tailKnownAssignedXids = i; - } - - /* Opportunistically compress the array */ - KnownAssignedXidsCompress(KAX_PRUNE, true); -} - -/* - * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids. - * We filter out anything >= xmax. - * - * Returns the number of XIDs stored into xarray[]. Caller is responsible - * that array is large enough. - * - * Caller must hold ProcArrayLock in (at least) shared mode. - */ -static int -KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax) -{ - TransactionId xtmp = InvalidTransactionId; - - return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax); -} - -/* - * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus - * we reduce *xmin to the lowest xid value seen if not already lower. - * - * Caller must hold ProcArrayLock in (at least) shared mode. - */ -static int -KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, - TransactionId xmax) -{ - int count = 0; - int head, - tail; - int i; - - /* - * Fetch head just once, since it may change while we loop. We can stop - * once we reach the initially seen head, since we are certain that an xid - * cannot enter and then leave the array while we hold ProcArrayLock. We - * might miss newly-added xids, but they should be >= xmax so irrelevant - * anyway. - */ - tail = procArray->tailKnownAssignedXids; - head = procArray->headKnownAssignedXids; - - pg_read_barrier(); /* pairs with KnownAssignedXidsAdd */ - - for (i = tail; i < head; i++) - { - /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; - - /* - * Update xmin if required. Only the first XID need be checked, - * since the array is sorted. 
- */ - if (count == 0 && - TransactionIdPrecedes(knownXid, *xmin)) - *xmin = knownXid; - - /* - * Filter out anything >= xmax, again relying on sorted property - * of array. - */ - if (TransactionIdIsValid(xmax) && - TransactionIdFollowsOrEquals(knownXid, xmax)) - break; - - /* Add knownXid into output array */ - xarray[count++] = knownXid; - } - } - - return count; -} - -/* - * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId - * if nothing there. - */ -static TransactionId -KnownAssignedXidsGetOldestXmin(void) -{ - int head, - tail; - int i; - - /* - * Fetch head just once, since it may change while we loop. - */ - tail = procArray->tailKnownAssignedXids; - head = procArray->headKnownAssignedXids; - - pg_read_barrier(); /* pairs with KnownAssignedXidsAdd */ - - for (i = tail; i < head; i++) - { - /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - return KnownAssignedXids[i]; - } - - return InvalidTransactionId; -} - -/* - * Display KnownAssignedXids to provide debug trail - * - * Currently this is only called within startup process, so we need no - * special locking. - * - * Note this is pretty expensive, and much of the expense will be incurred - * even if the elog message will get discarded. It's not currently called - * in any performance-critical places, however, so no need to be tenser. 
- */ -static void -KnownAssignedXidsDisplay(int trace_level) -{ - ProcArrayStruct *pArray = procArray; - StringInfoData buf; - int head, - tail, - i; - int nxids = 0; - - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - - initStringInfo(&buf); - - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - nxids++; - appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); - } - } - - elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s", - nxids, - pArray->numKnownAssignedXids, - pArray->tailKnownAssignedXids, - pArray->headKnownAssignedXids, - buf.data); - - pfree(buf.data); -} - -/* - * KnownAssignedXidsReset - * Resets KnownAssignedXids to be empty - */ -static void -KnownAssignedXidsReset(void) -{ - ProcArrayStruct *pArray = procArray; - - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(lsn > TransamVariables->latestCommitLSN); + TransamVariables->latestCommitLSN = lsn; - pArray->numKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - pArray->headKnownAssignedXids = 0; + procArray->oldest_running_primary_xid = oldest_running_primary_xid; LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 5acb4508f85..217b1670f5b 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -139,8 +139,6 @@ InitRecoveryTransactionEnvironment(void) vxid.procNumber = MyProcNumber; vxid.localTransactionId = GetNextLocalTransactionId(); VirtualXactLockTableInsert(vxid); - - standbyState = STANDBY_INITIALIZED; } /* @@ -168,9 +166,6 @@ ShutdownRecoveryTransactionEnvironment(void) if (RecoveryLockHash == NULL) return; - /* Mark all tracked in-progress transactions as finished. 
*/ - ExpireAllKnownAssignedTransactionIds(); - /* Release all locks the tracked transactions were holding */ StandbyReleaseAllLocks(); @@ -1167,7 +1162,7 @@ standby_redo(XLogReaderState *record) Assert(!XLogRecHasAnyBlockRefs(record)); /* Do nothing if we're not in hot standby mode */ - if (standbyState == STANDBY_DISABLED) + if (!InHotStandby) return; if (info == XLOG_STANDBY_LOCK) @@ -1182,18 +1177,21 @@ standby_redo(XLogReaderState *record) } else if (info == XLOG_RUNNING_XACTS) { + /* + * XXX: running xacts records were previously used to update + * known-assigned xids, but now we only need it for the logical + * replication snapbuilder stuff. And for the + * pg_stat_report_stat(true) call below. + */ xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); - RunningTransactionsData running; - running.xcnt = xlrec->xcnt; - running.subxcnt = xlrec->subxcnt; - running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY; - running.nextXid = xlrec->nextXid; - running.latestCompletedXid = xlrec->latestCompletedXid; - running.oldestRunningXid = xlrec->oldestRunningXid; - running.xids = xlrec->xids; - - ProcArrayApplyRecoveryInfo(&running); + /* + * Remember the oldest XID that was running at the time. Normally, all + * transaction aborts and commits are WAL-logged, so our + * oldestRunningXid value should be up-to-date, but if not, this + * allows us to resynchronize. + */ + ProcArrayUpdateOldestRunningXid(xlrec->oldestRunningXid); /* * The startup process currently has no convenient way to schedule @@ -1224,50 +1222,46 @@ standby_redo(XLogReaderState *record) * * This is used for Hot Standby as follows: * - * We can move directly to STANDBY_SNAPSHOT_READY at startup if we - * start from a shutdown checkpoint because we know nothing was running - * at that time and our recovery snapshot is known empty. 
In the more - * typical case of an online checkpoint we need to jump through a few - * hoops to get a correct recovery snapshot and this requires a two or - * sometimes a three stage process. + * We can enter hot standby mode and start accepting read-only queries + * immediately at startup if we start from a shutdown checkpoint, because we + * know nothing was running at that time and our recovery snapshot is known + * empty. In the more typical case of an online checkpoint, the checkpoint + * record doesn't contain all the necessary information about running + * transaction state, and we need to jump through a few hoops to get a correct + * recovery snapshot. * - * The initial snapshot must contain all running xids and all current - * AccessExclusiveLocks at a point in time on the standby. Assembling - * that information while the server is running requires many and - * various LWLocks, so we choose to derive that information piece by - * piece and then re-assemble that info on the standby. When that - * information is fully assembled we move to STANDBY_SNAPSHOT_READY. + * The initial snapshot must contain all current AccessExclusiveLocks at a + * point in time on the standby. Assembling that information while the server + * is running requires many and various LWLocks, so we choose to derive that + * information piece by piece and then re-assemble that info on the standby. * - * Since locking on the primary when we derive the information is not - * strict, we note that there is a time window between the derivation and - * writing to WAL of the derived information. That allows race conditions - * that we must resolve, since xids and locks may enter or leave the - * snapshot during that window. This creates the issue that an xid or - * lock may start *after* the snapshot has been derived yet *before* the - * snapshot is logged in the running xacts WAL record. 
We resolve this by - * starting to accumulate changes at a point just prior to when we derive - * the snapshot on the primary, then ignore duplicates when we later apply - * the snapshot from the running xacts record. This is implemented during - * CreateCheckPoint() where we use the logical checkpoint location as - * our starting point and then write the running xacts record immediately - * before writing the main checkpoint WAL record. Since we always start - * up from a checkpoint and are immediately at our starting point, we - * unconditionally move to STANDBY_INITIALIZED. After this point we - * must do 4 things: + * Since locking on the primary when we derive the information is not strict, + * there is a time window between the derivation and writing to WAL of the + * derived information. That allows race conditions that we must resolve, + * since xids and locks may enter or leave the snapshot during that + * window. This creates the issue that an xid or lock may start *after* the + * snapshot has been derived yet *before* the snapshot is logged in the + * running xacts WAL record. We resolve this by starting to accumulate changes + * at a point just prior to when we collect the lock information on the + * primary, then ignore duplicates when we later apply the snapshot from the + * running xacts record. This is implemented during CreateCheckPoint() where + * we use the logical checkpoint location as our starting point and then write + * the running xacts record immediately before writing the main checkpoint WAL + * record. Since we always start up from a checkpoint's redo pointer, we will + * always see a running-xacts record before reaching the checkpoint + * record, and can immediately enter hot standby mode.
After this point we + * must do 3 things: * * move shared nextXid forwards as we see new xids * * extend the clog and subtrans with each new xid - * * keep track of uncommitted known assigned xids * * keep track of uncommitted AccessExclusiveLocks * - * When we see a commit/abort we must remove known assigned xids and locks - * from the completing transaction. Attempted removals that cannot locate - * an entry are expected and must not cause an error when we are in state - * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and - * KnownAssignedXidsRemove(). - * - * Later, when we apply the running xact data we must be careful to ignore - * transactions already committed, since those commits raced ahead when - * making WAL entries. + * When we see a commit/abort we must advance oldest_running_primary_xid and + * remove locks from the completing transaction. Attempted removals that + * cannot locate an entry are expected and must not cause an error until we + * have seen the running-xacts record. (We don't throw an error even after + * that, because whatever the reason was, after the transaction has completed + * the issue has already been resolved anyway.) This is implemented in + * StandbyReleaseLocks(). 
* * For logical decoding only the running xacts information is needed; * there's no need to look at the locking information, but it's logged anyway, diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 3df29658f18..aadec36dc15 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -140,6 +140,7 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_XACT_BUFFER] = "XactBuffer", [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", + [LWTRANCHE_CSN_LOG_BUFFER] = "CsnLogBuffer", [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", @@ -178,6 +179,7 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_XACT_SLRU] = "XactSLRU", [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", + [LWTRANCHE_CSN_LOG_SLRU] = "CsnLogSLRU", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4f44648aca8..95e248b2c88 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -363,6 +363,7 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." XactBuffer "Waiting for I/O on a transaction status SLRU buffer." CommitTsBuffer "Waiting for I/O on a commit timestamp SLRU buffer." SubtransBuffer "Waiting for I/O on a sub-transaction SLRU buffer." +CsnLogBuffer "Waiting for I/O on a CSN log SLRU buffer." MultiXactOffsetBuffer "Waiting for I/O on a multixact offset SLRU buffer." MultiXactMemberBuffer "Waiting for I/O on a multixact member SLRU buffer." NotifyBuffer "Waiting for I/O on a NOTIFY message SLRU buffer."
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index e9e413477ba..d8ff9cfdb36 100644 --- a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -77,6 +77,8 @@ provider postgresql { probe clog__checkpoint__done(bool); probe subtrans__checkpoint__start(bool); probe subtrans__checkpoint__done(bool); + probe csnlog__checkpoint__start(bool); + probe csnlog__checkpoint__done(bool); probe multixact__checkpoint__start(bool); probe multixact__checkpoint__done(bool); probe twophase__checkpoint__start(); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 5f9f2b9d8b2..049c706f2cf 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -97,6 +97,7 @@ #include #include +#include "access/csn_log.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" @@ -1888,36 +1889,11 @@ XidInMVCCSnapshot(TransactionId xid, MVCCSnapshotShared snapshot) } else { - /* - * In recovery we store all xids in the subxip array because it is by - * far the bigger array, and we mostly don't know which xids are - * top-level and which are subxacts. The xip array is empty. - * - * We start by searching subtrans, if we overflowed. - */ - if (snapshot->suboverflowed) - { - /* - * Snapshot overflowed, so convert xid to top-level. This is safe - * because we eliminated too-old XIDs above. - */ - xid = SubTransGetTopmostTransaction(xid); - - /* - * If xid was indeed a subxact, we might now have an xid < xmin, - * so recheck to avoid an array scan. No point in rechecking - * xmax. - */ - if (TransactionIdPrecedes(xid, snapshot->xmin)) - return false; - } + XLogRecPtr csn = CSNLogGetCSNByXid(xid); - /* - * We now have either a top-level xid higher than xmin or an - * indeterminate xid. We don't know whether it's top level or subxact - * but it doesn't matter. If it's present, the xid is visible. 
- */ - if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt)) + if (csn != InvalidXLogRecPtr && csn <= snapshot->snapshotCsn) + return false; + else return true; } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index c17fda2bc81..f52817e218f 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -251,7 +251,8 @@ static const char *const subdirs[] = { "pg_xact", "pg_logical", "pg_logical/snapshots", - "pg_logical/mappings" + "pg_logical/mappings", + "pg_csn" }; diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index a28d1667d4c..64fdd139173 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -146,6 +146,9 @@ static const char *const excludeDirContents[] = /* Contents zeroed on startup, see StartupSUBTRANS(). */ "pg_subtrans", + /* Contents zeroed on startup, see StartupCSNLog(). */ + "pg_csn", + /* end of list */ NULL }; diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h new file mode 100644 index 00000000000..f8cdf573aef --- /dev/null +++ b/src/include/access/csn_log.h @@ -0,0 +1,30 @@ +/* + * csn_log.h + * + * Mapping from XID to commit record's LSN (Commit Sequence Number). 
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/csn_log.h + */ +#ifndef CSNLOG_H +#define CSNLOG_H + +#include "access/xlog.h" +#include "utils/snapshot.h" + +extern void CSNLogSetCSN(TransactionId xid, int nsubxids, + TransactionId *subxids, XLogRecPtr csn); +extern XLogRecPtr CSNLogGetCSNByXid(TransactionId xid); + +extern Size CSNLogShmemSize(void); +extern void CSNLogShmemInit(void); +extern void BootStrapCSNLog(void); +extern void StartupCSNLog(TransactionId oldestActiveXID, XLogRecPtr csn); +extern void ShutdownCSNLog(void); +extern void CheckPointCSNLog(void); +extern void ExtendCSNLog(TransactionId newestXact); +extern void TruncateCSNLog(TransactionId oldestXact); + +#endif /* CSNLOG_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index e71c660118e..76411cca178 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -238,6 +238,9 @@ typedef struct TransamVariablesData FullTransactionId latestCompletedXid; /* newest full XID that has * committed or aborted */ + /* During recovery, LSN of latest replayed commit record */ + XLogRecPtr latestCommitLSN; + /* * Number of top-level transactions with xids (i.e. 
which may have * modified the database) that completed in some form since the start of diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 9fa82355033..9527695886f 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -47,8 +47,7 @@ extern void StartPrepare(GlobalTransaction gxact); extern void EndPrepare(GlobalTransaction gxact); extern bool StandbyTransactionIdIsPrepared(TransactionId xid); -extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p, - int *nxids_p); +extern TransactionId PrescanPreparedTransactions(void); extern void StandbyRecoverPreparedTransactions(void); extern void RecoverPreparedTransactions(void); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index b2bc10ee041..b31944d0e6c 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -171,7 +171,7 @@ typedef struct SavedTransactionCharacteristics #define XLOG_XACT_ABORT 0x20 #define XLOG_XACT_COMMIT_PREPARED 0x30 #define XLOG_XACT_ABORT_PREPARED 0x40 -#define XLOG_XACT_ASSIGNMENT 0x50 +/* 0x50 is unused, was XLOG_XACT_ASSIGNMENT */ #define XLOG_XACT_INVALIDATIONS 0x60 /* free opcode 0x70 */ @@ -215,15 +215,6 @@ typedef struct SavedTransactionCharacteristics #define XactCompletionForceSyncCommit(xinfo) \ ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) -typedef struct xl_xact_assignment -{ - TransactionId xtop; /* assigned XID's top-level XID */ - int nsubxacts; /* number of subtransaction XIDs */ - TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ -} xl_xact_assignment; - -#define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub) - /* * Commit and abort records can contain a lot of information. But a large * portion of the records won't need all possible pieces of information. 
So we @@ -448,7 +439,6 @@ extern FullTransactionId GetTopFullTransactionId(void); extern FullTransactionId GetTopFullTransactionIdIfAny(void); extern FullTransactionId GetCurrentFullTransactionId(void); extern FullTransactionId GetCurrentFullTransactionIdIfAny(void); -extern void MarkCurrentTransactionIdLoggedIfAny(void); extern bool SubTransactionIsActive(SubTransactionId subxid); extern CommandId GetCurrentCommandId(bool used); extern void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index a1870d8e5aa..2ab20fcae2f 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -27,37 +27,10 @@ extern PGDLLIMPORT bool ignore_invalid_pages; extern PGDLLIMPORT bool InRecovery; /* - * Like InRecovery, standbyState is only valid in the startup process. - * In all other processes it will have the value STANDBY_DISABLED (so - * InHotStandby will read as false). - * - * In DISABLED state, we're performing crash recovery or hot standby was - * disabled in postgresql.conf. - * - * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but - * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record - * to initialize our primary-transaction tracking system. - * - * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING - * state. The tracked information might still be incomplete, so we can't allow - * connections yet, but redo functions must update the in-memory state when - * appropriate. - * - * In SNAPSHOT_READY mode, we have full knowledge of transactions that are - * (or were) running on the primary at the current WAL location. Snapshots - * can be taken, and read-only queries can be run. + * Like InRecovery, InHotStandby is only valid in the startup process. + * In all other processes it will be false. 
*/ -typedef enum -{ - STANDBY_DISABLED, - STANDBY_INITIALIZED, - STANDBY_SNAPSHOT_PENDING, - STANDBY_SNAPSHOT_READY, -} HotStandbyState; - -extern PGDLLIMPORT HotStandbyState standbyState; - -#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING) +extern PGDLLIMPORT bool InHotStandby; extern bool XLogHaveInvalidPages(void); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 4df1d25c045..457c5511c5e 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -181,6 +181,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS, LWTRANCHE_COMMITTS_BUFFER, LWTRANCHE_SUBTRANS_BUFFER, + LWTRANCHE_CSN_LOG_BUFFER, LWTRANCHE_MULTIXACTOFFSET_BUFFER, LWTRANCHE_MULTIXACTMEMBER_BUFFER, LWTRANCHE_NOTIFY_BUFFER, @@ -219,6 +220,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_XACT_SLRU, LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_AIO_URING_COMPLETION, + LWTRANCHE_CSN_LOG_SLRU, LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 8eedc2d6b9f..57071d1e0f4 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -28,18 +28,11 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid); extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid); extern void ProcArrayClearTransaction(PGPROC *proc); +extern void ProcArrayUpdateOldestRunningXid(TransactionId oldestRunningXID); extern void ProcArrayInitRecovery(TransactionId initializedUptoXID); -extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); -extern void ProcArrayApplyXidAssignment(TransactionId topxid, - int nsubxids, TransactionId *subxids); extern void RecordKnownAssignedTransactionIds(TransactionId xid); -extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid, - int nsubxids, TransactionId *subxids, - TransactionId max_xid); -extern void ExpireAllKnownAssignedTransactionIds(void); -extern 
void ExpireOldKnownAssignedTransactionIds(TransactionId xid); -extern void KnownAssignedTransactionIdsIdleMaintenance(void); +extern void ProcArrayRecoveryEndTransaction(TransactionId max_xid, XLogRecPtr lsn); extern int GetMaxSnapshotXidCount(void); extern int GetMaxSnapshotSubxidCount(void); @@ -56,7 +49,7 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); extern TransactionId GetOldestTransactionIdConsideredRunning(void); -extern TransactionId GetOldestActiveTransactionId(void); +extern TransactionId GetOldestActiveTransactionId(bool allDbs); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); extern void GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin); diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 193366ce052..14ff80904c8 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -13,6 +13,7 @@ #ifndef SNAPSHOT_H #define SNAPSHOT_H +#include "access/xlogdefs.h" #include "lib/ilist.h" @@ -186,6 +187,13 @@ typedef struct MVCCSnapshotSharedData int32 subxcnt; /* # of xact ids in subxip[] */ bool suboverflowed; /* has the subxip array overflowed? */ + /* + * MVCC snapshots taken during recovery use this CSN instead of the xip + * and subxip arrays. Any transactions that committed at or before this + * LSN are considered as visible. + */ + XLogRecPtr snapshotCsn; + bool takenDuringRecovery; /* recovery-shaped snapshot? */ /* -- 2.39.5