From 0d79d2fb6ab9f1d5b0b3f03e500315135329b09e Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 22 May 2025 18:39:08 +0200 Subject: [PATCH v2 6/7] NUMA: interleave PGPROC entries The goal is to distribute ProcArray (or rather PGPROC entries and associated fast-path arrays) to NUMA nodes. We can't do this by simply interleaving pages, because that wouldn't work for both parts at the same time. We want to place the PGPROC and its fast-path locking structs on the same node, but the structs are of different sizes, etc. Another problem is that PGPROC entries are fairly small, so with huge pages and reasonable values of max_connections everything fits onto a single page. We don't want to make this incompatible with huge pages. Note: If we eventually switch to allocating separate shared segments for different parts (to allow on-line resizing), we could keep using regular pages for procarray, and this would not be such an issue. To make this work, we split the PGPROC array into per-node segments, each with about (MaxBackends / numa_nodes) entries, and one segment for auxiliary processes and prepared transactions. And we do the same thing for fast-path arrays. The PGPROC segments are laid out like this (e.g. for 2 NUMA nodes): - PGPROC array / node #0 - PGPROC array / node #1 - PGPROC array / aux processes + 2PC transactions - fast-path arrays / node #0 - fast-path arrays / node #1 - fast-path arrays / aux processes + 2PC transactions Each segment is aligned to (starts at) a memory page, and is effectively a multiple of the memory page size. Having a single PGPROC array made certain operations easier - e.g. it was possible to iterate the array, and GetNumberFromPGProc() could calculate offset by simply subtracting PGPROC pointers. With multiple segments that's not possible, but the fallout is minimal. Most places accessed PGPROC through PROC_HDR->allProcs, and can continue to do so, except that now they get a pointer to the PGPROC (which most places wanted anyway). 
Note: There's an indirection, though. But the pointer does not change, so hopefully that's not an issue. And each PGPROC entry gets an explicit procnumber field, which is the index in allProcs, GetNumberFromPGProc can simply return that. Each PGPROC also gets numa_node, tracking the NUMA node, so that we don't have to recalculate that. This is used by InitProcess() to pick a PGPROC entry from the local NUMA node. Note: The scheduler may migrate the process to a different CPU/node later. Maybe we should consider pinning the process to the node? --- src/backend/access/transam/clog.c | 4 +- src/backend/postmaster/pgarch.c | 2 +- src/backend/postmaster/walsummarizer.c | 2 +- src/backend/storage/buffer/freelist.c | 2 +- src/backend/storage/ipc/procarray.c | 61 ++-- src/backend/storage/lmgr/lock.c | 6 +- src/backend/storage/lmgr/proc.c | 368 +++++++++++++++++++++++-- src/backend/utils/init/globals.c | 1 + src/backend/utils/misc/guc_tables.c | 10 + src/include/miscadmin.h | 1 + src/include/storage/proc.h | 11 +- 11 files changed, 406 insertions(+), 62 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index e80fbe109cf..928d126d0ee 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -574,7 +574,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, /* Walk the list and update the status of all XIDs. 
*/ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &ProcGlobal->allProcs[nextidx]; + PGPROC *nextproc = ProcGlobal->allProcs[nextidx]; int64 thispageno = nextproc->clogGroupMemberPage; /* @@ -633,7 +633,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx]; + PGPROC *wakeproc = ProcGlobal->allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext); pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 78e39e5f866..e28e0f7d3bd 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -289,7 +289,7 @@ PgArchWakeup(void) * be relaunched shortly and will start archiving. */ if (arch_pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[arch_pgprocno]->procLatch); } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 777c9a8d555..087279a6a8e 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -649,7 +649,7 @@ WakeupWalSummarizer(void) LWLockRelease(WALSummarizerLock); if (pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[pgprocno]->procLatch); } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 1827e052da7..2ce158ca9bd 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -446,7 +446,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * actually fine because procLatch isn't ever freed, so we just can * potentially set the wrong process' (or no process') latch. 
*/ - SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[bgwprocno]->procLatch); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 2418967def6..82158eeb5d6 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -268,7 +268,7 @@ typedef enum KAXCompressReason static ProcArrayStruct *procArray; -static PGPROC *allProcs; +static PGPROC **allProcs; /* * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() @@ -502,7 +502,7 @@ ProcArrayAdd(PGPROC *proc) int this_procno = arrayP->pgprocnos[index]; Assert(this_procno >= 0 && this_procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[this_procno].pgxactoff == index); + Assert(allProcs[this_procno]->pgxactoff == index); /* If we have found our right position in the array, break */ if (this_procno > pgprocno) @@ -538,9 +538,9 @@ ProcArrayAdd(PGPROC *proc) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff == index - 1); + Assert(allProcs[procno]->pgxactoff == index - 1); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } /* @@ -581,7 +581,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) myoff = proc->pgxactoff; Assert(myoff >= 0 && myoff < arrayP->numProcs); - Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]]->pgxactoff == myoff); if (TransactionIdIsValid(latestXid)) { @@ -636,9 +636,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff - 1 == index); + Assert(allProcs[procno]->pgxactoff - 1 == index); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } /* @@ -860,7 +860,7 @@ 
ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) /* Walk the list and clear all XIDs. */ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[nextidx]; + PGPROC *nextproc = allProcs[nextidx]; ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); @@ -880,7 +880,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[wakeidx]; + PGPROC *nextproc = allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PROC_NUMBER); @@ -1526,7 +1526,7 @@ TransactionIdIsInProgress(TransactionId xid) pxids = other_subxidstates[pgxactoff].count; pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ pgprocno = arrayP->pgprocnos[pgxactoff]; - proc = &allProcs[pgprocno]; + proc = allProcs[pgprocno]; for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ @@ -1622,7 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } - /* * Determine XID horizons. 
* @@ -1740,7 +1739,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) for (int index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int8 statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; TransactionId xmin; @@ -2224,7 +2223,7 @@ GetSnapshotData(Snapshot snapshot) TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); uint8 statusFlags; - Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + Assert(allProcs[arrayP->pgprocnos[pgxactoff]]->pgxactoff == pgxactoff); /* * If the transaction has no XID assigned, we can skip it; it @@ -2298,7 +2297,7 @@ GetSnapshotData(Snapshot snapshot) if (nsubxids > 0) { int pgprocno = pgprocnos[pgxactoff]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; pg_read_barrier(); /* pairs with GetNewTransactionId */ @@ -2499,7 +2498,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; @@ -2725,7 +2724,7 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestDatabaseRunningXid)) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId == MyDatabaseId) oldestDatabaseRunningXid = xid; @@ -2756,7 +2755,7 @@ GetRunningTransactionData(void) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int nsubxids; /* @@ -3006,7 +3005,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc 
= allProcs[pgprocno]; if ((proc->delayChkptFlags & type) != 0) { @@ -3047,7 +3046,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId vxid; GET_VXID_FROM_PGPROC(vxid, *proc); @@ -3175,7 +3174,7 @@ BackendPidGetProcWithLock(int pid) for (index = 0; index < arrayP->numProcs; index++) { - PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + PGPROC *proc = allProcs[arrayP->pgprocnos[index]]; if (proc->pid == pid) { @@ -3218,7 +3217,7 @@ BackendXidGetPid(TransactionId xid) if (other_xids[index] == xid) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; result = proc->pid; break; @@ -3287,7 +3286,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc == MyProc) @@ -3389,7 +3388,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* Exclude prepared transactions */ if (proc->pid == 0) @@ -3454,7 +3453,7 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId procvxid; GET_VXID_FROM_PGPROC(procvxid, *proc); @@ -3509,7 +3508,7 @@ MinimumActiveBackends(int min) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - 
PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* * Since we're not holding a lock, need to be prepared to deal with @@ -3555,7 +3554,7 @@ CountDBBackends(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3584,7 +3583,7 @@ CountDBConnections(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3615,7 +3614,7 @@ CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (databaseid == InvalidOid || proc->databaseId == databaseid) { @@ -3656,7 +3655,7 @@ CountUserBackends(Oid roleid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3719,7 +3718,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc->databaseId != databaseId) @@ -3785,7 +3784,7 @@ TerminateOtherDBBackends(Oid databaseId) for (i = 0; i < procArray->numProcs; i++) { int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId != databaseId) continue; diff --git 
a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 62f3471448e..c84a2a5f1bc 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -2844,7 +2844,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); @@ -3103,7 +3103,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; /* A backend never blocks itself */ @@ -3790,7 +3790,7 @@ GetLockStatusData(void) */ for (i = 0; i < ProcGlobal->allProcCount; ++i) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; /* Skip backends with pid=0, as they don't hold fast-path locks */ if (proc->pid == 0) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32..9d3e94a7b3a 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -29,21 +29,29 @@ */ #include "postgres.h" +#include #include #include #include +#ifdef USE_LIBNUMA +#include +#include +#endif + #include "access/transam.h" #include "access/twophase.h" #include "access/xlogutils.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_numa.h" #include "postmaster/autovacuum.h" #include "replication/slotsync.h" #include "replication/syncrep.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -89,6 +97,12 @@ static void ProcKill(int code, Datum arg); static void AuxiliaryProcKill(int code, Datum arg); static void CheckDeadLock(void); +/* NUMA */ +static Size 
get_memory_page_size(void); /* XXX duplicate */ +static void move_to_node(char *startptr, char *endptr, + Size mem_page_size, int node); +static int numa_nodes = -1; + /* * Report shared-memory space needed by PGPROC. @@ -100,11 +114,40 @@ PGProcShmemSize(void) Size TotalProcs = add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC *))); size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags))); + /* + * With NUMA, we allocate the PGPROC array in several chunks. With shared + * buffers we simply manually assign parts of the buffer array to + * different NUMA nodes, and that does the trick. But we can't do that for + * PGPROC, as the number of PGPROC entries is much lower, especially with + * huge pages. We can fit ~2k entries on a 2MB page, and NUMA does stuff + * with page granularity, and the large NUMA systems are likely to use + * huge pages. So with sensible max_connections we would not use more than + * a single page, which means it gets to a single NUMA node. + * + * So we allocate PGPROC not as a single array, but one array per NUMA + * node, and then one array for aux processes (without NUMA node + * assigned). Each array may need up to memory-page-worth of padding, + * worst case. So we just add that - it's a bit wasteful, but good enough + * for PoC. + * + * FIXME Should be conditional, but that was causing problems in bootstrap + * mode. Or maybe it was because the code that allocates stuff later does + * not do that conditionally. Anyway, needs to be fixed. 
+ */ + /* if (numa_procs_interleave) */ + { + int num_nodes = numa_num_configured_nodes(); + Size mem_page_size = get_memory_page_size(); + + size = add_size(size, mul_size((num_nodes + 1), mem_page_size)); + } + return size; } @@ -129,6 +172,26 @@ FastPathLockShmemSize(void) size = add_size(size, mul_size(TotalProcs, (fpLockBitsSize + fpRelIdSize))); + /* + * Same NUMA-padding logic as in PGProcShmemSize, adding a memory page per + * NUMA node - but this way we add two pages per node - one for PGPROC, + * one for fast-path arrays. In theory we could make this work just one + * page per node, by adding fast-path arrays right after PGPROC entries on + * each node. But now we allocate fast-path locks separately - good enough + * for PoC. + * + * FIXME Should be conditional, but that was causing problems in bootstrap + * mode. Or maybe it was because the code that allocates stuff later does + * not do that conditionally. Anyway, needs to be fixed. + */ + /* if (numa_procs_interleave) */ + { + int num_nodes = numa_num_configured_nodes(); + Size mem_page_size = get_memory_page_size(); + + size = add_size(size, mul_size((num_nodes + 1), mem_page_size)); + } + return size; } @@ -191,11 +254,13 @@ ProcGlobalSemas(void) void InitProcGlobal(void) { - PGPROC *procs; + PGPROC **procs; int i, j; bool found; uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts; + int procs_total; + int procs_per_node; /* Used for setup of per-backend fast-path slots. 
*/ char *fpPtr, @@ -205,6 +270,8 @@ InitProcGlobal(void) Size requestSize; char *ptr; + Size mem_page_size = get_memory_page_size(); + /* Create the ProcGlobal shared structure */ ProcGlobal = (PROC_HDR *) ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found); @@ -224,6 +291,9 @@ InitProcGlobal(void) pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PROC_NUMBER); pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PROC_NUMBER); + /* one chunk per NUMA node (without NUMA assume 1 node) */ + numa_nodes = numa_num_configured_nodes(); + /* * Create and initialize all the PGPROC structures we'll need. There are * six separate consumers: (1) normal backends, (2) autovacuum workers and @@ -241,19 +311,108 @@ InitProcGlobal(void) MemSet(ptr, 0, requestSize); - procs = (PGPROC *) ptr; - ptr = (char *) ptr + TotalProcs * sizeof(PGPROC); + /* allprocs (array of pointers to PGPROC entries) */ + procs = (PGPROC **) ptr; + ptr = (char *) ptr + TotalProcs * sizeof(PGPROC *); ProcGlobal->allProcs = procs; /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; + /* + * NUMA partitioning + * + * Now build the actual PGPROC arrays, one "chunk" per NUMA node (and one + * extra for auxiliary processes and 2PC transactions, not associated with + * any particular node). + * + * First determine how many "backend" procs to allocate per NUMA node. The + * count may not be exactly divisible, but we mostly ignore that. The last + * node may get somewhat fewer PGPROC entries, but the imbalance ought to + * be pretty small (if MaxBackends >> numa_nodes). + * + * XXX A fairer distribution is possible, but not worth it now. 
+ */ + procs_per_node = (MaxBackends + (numa_nodes - 1)) / numa_nodes; + procs_total = 0; + + /* build PGPROC entries for NUMA nodes */ + for (i = 0; i < numa_nodes; i++) + { + PGPROC *procs_node; + + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int count_node = Min(procs_per_node, MaxBackends - procs_total); + + /* make sure to align the PGPROC array to memory page */ + ptr = (char *) TYPEALIGN(mem_page_size, ptr); + + /* allocate the PGPROC chunk for this node */ + procs_node = (PGPROC *) ptr; + ptr = (char *) ptr + count_node * sizeof(PGPROC); + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + + /* add pointers to the PGPROC entries to allProcs */ + for (j = 0; j < count_node; j++) + { + procs_node[j].numa_node = i; + procs_node[j].procnumber = procs_total; + + ProcGlobal->allProcs[procs_total++] = &procs_node[j]; + } + + move_to_node((char *) procs_node, ptr, mem_page_size, i); + } + + /* + * also build PGPROC entries for auxiliary procs / prepared xacts (we + * don't assign those to any NUMA node) + * + * XXX Mostly duplicate of preceding block, could be reused. + */ + { + PGPROC *procs_node; + int count_node = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* + * Make sure to align PGPROC array to memory page (it may not be + * aligned). We won't assign this to any NUMA node, but we still don't + * want it to interfere with the preceding chunk (for the last NUMA + * node). 
+ */ + ptr = (char *) TYPEALIGN(mem_page_size, ptr); + + procs_node = (PGPROC *) ptr; + ptr = (char *) ptr + count_node * sizeof(PGPROC); + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + + /* now add the PGPROC pointers to allProcs */ + for (j = 0; j < count_node; j++) + { + procs_node[j].numa_node = -1; + procs_node[j].procnumber = procs_total; + + ProcGlobal->allProcs[procs_total++] = &procs_node[j]; + } + } + + /* we should have allocated the expected number of PGPROC entries */ + Assert(procs_total == TotalProcs); + /* * Allocate arrays mirroring PGPROC fields in a dense manner. See * PROC_HDR. * * XXX: It might make sense to increase padding for these arrays, given * how hotly they are accessed. + * + * XXX Would it make sense to NUMA-partition these chunks too, somehow? + * But those arrays are tiny, fit into a single memory page, so would need + * to be made more complex. Not sure. */ ProcGlobal->xids = (TransactionId *) ptr; ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->xids)); @@ -286,23 +445,100 @@ InitProcGlobal(void) /* For asserts checking we did not overflow. */ fpEndPtr = fpPtr + requestSize; - for (i = 0; i < TotalProcs; i++) + /* reset the count */ + procs_total = 0; + + /* + * Mimic the same logic as above, but for fast-path locking. + */ + for (i = 0; i < numa_nodes; i++) { - PGPROC *proc = &procs[i]; + char *startptr; + char *endptr; - /* Common initialization for all PGPROCs, regardless of type. */ + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int procs_node = Min(procs_per_node, MaxBackends - procs_total); + + /* align to memory page, to make move_pages possible */ + fpPtr = (char *) TYPEALIGN(mem_page_size, fpPtr); + + startptr = fpPtr; + endptr = fpPtr + procs_node * (fpLockBitsSize + fpRelIdSize); + + move_to_node(startptr, endptr, mem_page_size, i); /* - * Set the fast-path lock arrays, and move the pointer. 
We interleave - * the two arrays, to (hopefully) get some locality for each backend. + * Now point the PGPROC entries to the fast-path arrays, and also + * advance the fpPtr. */ - proc->fpLockBits = (uint64 *) fpPtr; - fpPtr += fpLockBitsSize; + for (j = 0; j < procs_node; j++) + { + PGPROC *proc = ProcGlobal->allProcs[procs_total++]; + + /* cross-check we got the expected NUMA node */ + Assert(proc->numa_node == i); + Assert(proc->procnumber == (procs_total - 1)); + + /* + * Set the fast-path lock arrays, and move the pointer. We + * interleave the two arrays, to (hopefully) get some locality for + * each backend. + */ + proc->fpLockBits = (uint64 *) fpPtr; + fpPtr += fpLockBitsSize; - proc->fpRelId = (Oid *) fpPtr; - fpPtr += fpRelIdSize; + proc->fpRelId = (Oid *) fpPtr; + fpPtr += fpRelIdSize; - Assert(fpPtr <= fpEndPtr); + Assert(fpPtr <= fpEndPtr); + } + + Assert(fpPtr == endptr); + } + + /* auxiliary processes / prepared xacts */ + { + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int procs_node = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* align to memory page, to make move_pages possible */ + fpPtr = (char *) TYPEALIGN(mem_page_size, fpPtr); + + /* now point the PGPROC entries to the fast-path arrays */ + for (j = 0; j < procs_node; j++) + { + PGPROC *proc = ProcGlobal->allProcs[procs_total++]; + + /* cross-check we got PGPROC with no NUMA node assigned */ + Assert(proc->numa_node == -1); + Assert(proc->procnumber == (procs_total - 1)); + + /* + * Set the fast-path lock arrays, and move the pointer. We + * interleave the two arrays, to (hopefully) get some locality for + * each backend. + */ + proc->fpLockBits = (uint64 *) fpPtr; + fpPtr += fpLockBitsSize; + + proc->fpRelId = (Oid *) fpPtr; + fpPtr += fpRelIdSize; + + Assert(fpPtr <= fpEndPtr); + } + } + + /* Should have consumed exactly the expected amount of fast-path memory. 
*/ + Assert(fpPtr <= fpEndPtr); + + /* make sure we allocated the expected number of PGPROC entries */ + Assert(procs_total == TotalProcs); + + for (i = 0; i < TotalProcs; i++) + { + PGPROC *proc = procs[i]; + + Assert(proc->procnumber == i); /* * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact @@ -366,15 +602,12 @@ InitProcGlobal(void) pg_atomic_init_u64(&(proc->waitStart), 0); } - /* Should have consumed exactly the expected amount of fast-path memory. */ - Assert(fpPtr == fpEndPtr); - /* * Save pointers to the blocks of PGPROC structures reserved for auxiliary * processes and prepared transactions. */ - AuxiliaryProcs = &procs[MaxBackends]; - PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS]; + AuxiliaryProcs = procs[MaxBackends]; + PreparedXactProcs = procs[MaxBackends + NUM_AUXILIARY_PROCS]; /* Create ProcStructLock spinlock, too */ ProcStructLock = (slock_t *) ShmemInitStruct("ProcStructLock spinlock", @@ -435,7 +668,45 @@ InitProcess(void) if (!dlist_is_empty(procgloballist)) { - MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + /* + * With numa interleaving of PGPROC, try to get a PROC entry from the + * right NUMA node (when the process starts). + * + * XXX The process may move to a different NUMA node later, but + * there's not much we can do about that. 
+ */ + if (numa_procs_interleave) + { + dlist_mutable_iter iter; + unsigned cpu; + unsigned node; + int rc; + + rc = getcpu(&cpu, &node); + if (rc != 0) + elog(ERROR, "getcpu failed: %m"); + + MyProc = NULL; + + dlist_foreach_modify(iter, procgloballist) + { + PGPROC *proc; + + proc = dlist_container(PGPROC, links, iter.cur); + + if (proc->numa_node == node) + { + MyProc = proc; + dlist_delete(iter.cur); + break; + } + } + } + + /* didn't find PGPROC from the correct NUMA node, pick any free one */ + if (MyProc == NULL) + MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + SpinLockRelease(ProcStructLock); } else @@ -1988,7 +2259,7 @@ ProcSendSignal(ProcNumber procNumber) if (procNumber < 0 || procNumber >= ProcGlobal->allProcCount) elog(ERROR, "procNumber out of range"); - SetLatch(&ProcGlobal->allProcs[procNumber].procLatch); + SetLatch(&ProcGlobal->allProcs[procNumber]->procLatch); } /* @@ -2063,3 +2334,60 @@ BecomeLockGroupMember(PGPROC *leader, int pid) return ok; } + +/* copy from buf_init.c */ +static Size +get_memory_page_size(void) +{ + Size os_page_size; + Size huge_page_size; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + os_page_size = sysinfo.dwPageSize; +#else + os_page_size = sysconf(_SC_PAGESIZE); +#endif + + /* + * XXX This is a bit annoying/confusing, because we may get a different + * result depending on when we call it. Before mmap() we don't know if the + * huge pages get used, so we assume they will. And then if we don't get + * huge pages, we'll waste memory etc. + */ + + /* assume huge pages get used, unless HUGE_PAGES_OFF */ + if (huge_pages_status == HUGE_PAGES_OFF) + huge_page_size = 0; + else + GetHugePageSize(&huge_page_size, NULL); + + return Max(os_page_size, huge_page_size); +} + +/* + * move_to_node + * move all pages in the given range to the requested NUMA node + * + * XXX This is expected to only process fairly small number of pages, so no + * need to do batching etc. 
Just move pages one by one. + */ +static void +move_to_node(char *startptr, char *endptr, Size mem_page_size, int node) +{ + while (startptr < endptr) + { + int r, + status; + + r = numa_move_pages(0, 1, (void **) &startptr, &node, &status, 0); + + if (r != 0) + elog(WARNING, "failed to move page to NUMA node %d (r = %d, status = %d)", + node, r, status); + + startptr += mem_page_size; + } +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index a11bc71a386..6ee4684d1b8 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -149,6 +149,7 @@ int MaxBackends = 0; bool numa_buffers_interleave = false; bool numa_localalloc = false; bool numa_partition_freelist = false; +bool numa_procs_interleave = false; /* GUC parameters for vacuum */ int VacuumBufferUsageLimit = 2048; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 0552ed62cc7..7b718760248 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2146,6 +2146,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"numa_procs_interleave", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Enables NUMA interleaving of PGPROC entries."), + gettext_noop("When enabled, the PGPROC entries are interleaved to all NUMA nodes."), + }, + &numa_procs_interleave, + false, + NULL, NULL, NULL + }, + { {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 66baf2bf33e..cdeee8dccba 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -181,6 +181,7 @@ extern PGDLLIMPORT int max_parallel_workers; extern PGDLLIMPORT bool numa_buffers_interleave; extern PGDLLIMPORT bool numa_localalloc; extern PGDLLIMPORT bool numa_partition_freelist; +extern PGDLLIMPORT bool 
numa_procs_interleave; extern PGDLLIMPORT int commit_timestamp_buffers; extern PGDLLIMPORT int multixact_member_buffers; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f9b3fcfbf1..5cb1632718e 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -194,6 +194,8 @@ struct PGPROC * vacuum must not remove tuples deleted by * xid >= xmin ! */ + int procnumber; /* index in ProcGlobal->allProcs */ + int pid; /* Backend's process ID; 0 if prepared xact */ int pgxactoff; /* offset into various ProcGlobal->arrays with @@ -319,6 +321,9 @@ struct PGPROC PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */ dlist_head lockGroupMembers; /* list of members, if I'm a leader */ dlist_node lockGroupLink; /* my member link, if I'm a member */ + + /* NUMA node */ + int numa_node; }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ @@ -383,7 +388,7 @@ extern PGDLLIMPORT PGPROC *MyProc; typedef struct PROC_HDR { /* Array of PGPROC structures (not including dummies for prepared txns) */ - PGPROC *allProcs; + PGPROC **allProcs; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; @@ -435,8 +440,8 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs; /* * Accessors for getting PGPROC given a ProcNumber and vice versa. */ -#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)]) -#define GetNumberFromPGProc(proc) ((proc) - &ProcGlobal->allProcs[0]) +#define GetPGProcByNumber(n) (ProcGlobal->allProcs[(n)]) +#define GetNumberFromPGProc(proc) ((proc)->procnumber) /* * We set aside some extra PGPROC structures for "special worker" processes, -- 2.49.0