From 0d79d2fb6ab9f1d5b0b3f03e500315135329b09e Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 22 May 2025 18:39:08 +0200 Subject: [PATCH v2 6/7] NUMA: interleave PGPROC entries The goal is to distribute ProcArray (or rather PGPROC entries and associated fast-path arrays) to NUMA nodes. We can't do this by simply interleaving pages, because that wouldn't work for both parts at the same time. We want to place the PGPROC and its fast-path locking structs on the same node, but the structs are of different sizes, etc. Another problem is that PGPROC entries are fairly small, so with huge pages and reasonable values of max_connections everything fits onto a single page. We don't want to make this incompatible with huge pages. Note: If we eventually switch to allocating separate shared segments for different parts (to allow on-line resizing), we could keep using regular pages for procarray, and this would not be such an issue. To make this work, we split the PGPROC array into per-node segments, each with about (MaxBackends / numa_nodes) entries, and one segment for auxiliary processes and prepared transactions. And we do the same thing for fast-path arrays. The PGPROC segments are laid out like this (e.g. for 2 NUMA nodes): - PGPROC array / node #0 - PGPROC array / node #1 - PGPROC array / aux processes + 2PC transactions - fast-path arrays / node #0 - fast-path arrays / node #1 - fast-path arrays / aux processes + 2PC transactions Each segment is aligned to (starts at) a memory page, and is effectively a multiple of the memory page size. Having a single PGPROC array made certain operations easier - e.g. it was possible to iterate the array, and GetNumberFromPGProc() could calculate offset by simply subtracting PGPROC pointers. With multiple segments that's not possible, but the fallout is minimal. Most places accessed PGPROC through PROC_HDR->allProcs, and can continue to do so, except that now they get a pointer to the PGPROC (which most places wanted anyway). 
Note: There's an indirection, though. But the pointer does not change, so hopefully that's not an issue. And each PGPROC entry gets an explicit procnumber field, which is the index in allProcs, GetNumberFromPGProc can simply return that. Each PGPROC also gets numa_node, tracking the NUMA node, so that we don't have to recalculate that. This is used by InitProcess() to pick a PGPROC entry from the local NUMA node. Note: The scheduler may migrate the process to a different CPU/node later. Maybe we should consider pinning the process to the node? --- src/backend/access/transam/clog.c | 4 +- src/backend/postmaster/pgarch.c | 2 +- src/backend/postmaster/walsummarizer.c | 2 +- src/backend/storage/buffer/freelist.c | 2 +- src/backend/storage/ipc/procarray.c | 61 ++-- src/backend/storage/lmgr/lock.c | 6 +- src/backend/storage/lmgr/proc.c | 368 +++++++++++++++++++++++-- src/backend/utils/init/globals.c | 1 + src/backend/utils/misc/guc_tables.c | 10 + src/include/miscadmin.h | 1 + src/include/storage/proc.h | 11 +- 11 files changed, 406 insertions(+), 62 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index e80fbe109cf..928d126d0ee 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -574,7 +574,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, /* Walk the list and update the status of all XIDs. 
*/ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &ProcGlobal->allProcs[nextidx]; + PGPROC *nextproc = ProcGlobal->allProcs[nextidx]; int64 thispageno = nextproc->clogGroupMemberPage; /* @@ -633,7 +633,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx]; + PGPROC *wakeproc = ProcGlobal->allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext); pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 78e39e5f866..e28e0f7d3bd 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -289,7 +289,7 @@ PgArchWakeup(void) * be relaunched shortly and will start archiving. */ if (arch_pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[arch_pgprocno]->procLatch); } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 777c9a8d555..087279a6a8e 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -649,7 +649,7 @@ WakeupWalSummarizer(void) LWLockRelease(WALSummarizerLock); if (pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[pgprocno]->procLatch); } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 1827e052da7..2ce158ca9bd 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -446,7 +446,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * actually fine because procLatch isn't ever freed, so we just can * potentially set the wrong process' (or no process') latch. 
*/ - SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[bgwprocno]->procLatch); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 2418967def6..82158eeb5d6 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -268,7 +268,7 @@ typedef enum KAXCompressReason static ProcArrayStruct *procArray; -static PGPROC *allProcs; +static PGPROC **allProcs; /* * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() @@ -502,7 +502,7 @@ ProcArrayAdd(PGPROC *proc) int this_procno = arrayP->pgprocnos[index]; Assert(this_procno >= 0 && this_procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[this_procno].pgxactoff == index); + Assert(allProcs[this_procno]->pgxactoff == index); /* If we have found our right position in the array, break */ if (this_procno > pgprocno) @@ -538,9 +538,9 @@ ProcArrayAdd(PGPROC *proc) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff == index - 1); + Assert(allProcs[procno]->pgxactoff == index - 1); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } /* @@ -581,7 +581,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) myoff = proc->pgxactoff; Assert(myoff >= 0 && myoff < arrayP->numProcs); - Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]]->pgxactoff == myoff); if (TransactionIdIsValid(latestXid)) { @@ -636,9 +636,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff - 1 == index); + Assert(allProcs[procno]->pgxactoff - 1 == index); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } /* @@ -860,7 +860,7 @@ 
ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) /* Walk the list and clear all XIDs. */ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[nextidx]; + PGPROC *nextproc = allProcs[nextidx]; ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); @@ -880,7 +880,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[wakeidx]; + PGPROC *nextproc = allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PROC_NUMBER); @@ -1526,7 +1526,7 @@ TransactionIdIsInProgress(TransactionId xid) pxids = other_subxidstates[pgxactoff].count; pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ pgprocno = arrayP->pgprocnos[pgxactoff]; - proc = &allProcs[pgprocno]; + proc = allProcs[pgprocno]; for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ @@ -1622,7 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } - /* * Determine XID horizons. 
* @@ -1740,7 +1739,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) for (int index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int8 statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; TransactionId xmin; @@ -2224,7 +2223,7 @@ GetSnapshotData(Snapshot snapshot) TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); uint8 statusFlags; - Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + Assert(allProcs[arrayP->pgprocnos[pgxactoff]]->pgxactoff == pgxactoff); /* * If the transaction has no XID assigned, we can skip it; it @@ -2298,7 +2297,7 @@ GetSnapshotData(Snapshot snapshot) if (nsubxids > 0) { int pgprocno = pgprocnos[pgxactoff]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; pg_read_barrier(); /* pairs with GetNewTransactionId */ @@ -2499,7 +2498,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; @@ -2725,7 +2724,7 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestDatabaseRunningXid)) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId == MyDatabaseId) oldestDatabaseRunningXid = xid; @@ -2756,7 +2755,7 @@ GetRunningTransactionData(void) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int nsubxids; /* @@ -3006,7 +3005,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc 
= allProcs[pgprocno]; if ((proc->delayChkptFlags & type) != 0) { @@ -3047,7 +3046,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId vxid; GET_VXID_FROM_PGPROC(vxid, *proc); @@ -3175,7 +3174,7 @@ BackendPidGetProcWithLock(int pid) for (index = 0; index < arrayP->numProcs; index++) { - PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + PGPROC *proc = allProcs[arrayP->pgprocnos[index]]; if (proc->pid == pid) { @@ -3218,7 +3217,7 @@ BackendXidGetPid(TransactionId xid) if (other_xids[index] == xid) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; result = proc->pid; break; @@ -3287,7 +3286,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc == MyProc) @@ -3389,7 +3388,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* Exclude prepared transactions */ if (proc->pid == 0) @@ -3454,7 +3453,7 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId procvxid; GET_VXID_FROM_PGPROC(procvxid, *proc); @@ -3509,7 +3508,7 @@ MinimumActiveBackends(int min) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - 
PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* * Since we're not holding a lock, need to be prepared to deal with @@ -3555,7 +3554,7 @@ CountDBBackends(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3584,7 +3583,7 @@ CountDBConnections(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3615,7 +3614,7 @@ CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (databaseid == InvalidOid || proc->databaseId == databaseid) { @@ -3656,7 +3655,7 @@ CountUserBackends(Oid roleid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3719,7 +3718,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc->databaseId != databaseId) @@ -3785,7 +3784,7 @@ TerminateOtherDBBackends(Oid databaseId) for (i = 0; i < procArray->numProcs; i++) { int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId != databaseId) continue; diff --git 
a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 62f3471448e..c84a2a5f1bc 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -2844,7 +2844,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); @@ -3103,7 +3103,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; /* A backend never blocks itself */ @@ -3790,7 +3790,7 @@ GetLockStatusData(void) */ for (i = 0; i < ProcGlobal->allProcCount; ++i) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; /* Skip backends with pid=0, as they don't hold fast-path locks */ if (proc->pid == 0) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32..9d3e94a7b3a 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -29,21 +29,29 @@ */ #include "postgres.h" +#include #include #include #include +#ifdef USE_LIBNUMA +#include +#include +#endif + #include "access/transam.h" #include "access/twophase.h" #include "access/xlogutils.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_numa.h" #include "postmaster/autovacuum.h" #include "replication/slotsync.h" #include "replication/syncrep.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -89,6 +97,12 @@ static void ProcKill(int code, Datum arg); static void AuxiliaryProcKill(int code, Datum arg); static void CheckDeadLock(void); +/* NUMA */ +static Size 
get_memory_page_size(void); /* XXX duplicate */ +static void move_to_node(char *startptr, char *endptr, + Size mem_page_size, int node); +static int numa_nodes = -1; + /* * Report shared-memory space needed by PGPROC. @@ -100,11 +114,40 @@ PGProcShmemSize(void) Size TotalProcs = add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC *))); size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags))); + /* + * With NUMA, we allocate the PGPROC array in several chunks. With shared + * buffers we simply manually assign parts of the buffer array to + * different NUMA nodes, and that does the trick. But we can't do that for + * PGPROC, as the number of PGPROC entries is much lower, especially with + * huge pages. We can fit ~2k entries on a 2MB page, and NUMA does stuff + * with page granularity, and the large NUMA systems are likely to use + * huge pages. So with sensible max_connections we would not use more than + * a single page, which means it gets to a single NUMA node. + * + * So we allocate PGPROC not as a single array, but one array per NUMA + * node, and then one array for aux processes (without NUMA node + * assigned). Each array may need up to memory-page-worth of padding, + * worst case. So we just add that - it's a bit wasteful, but good enough + * for PoC. + * + * FIXME Should be conditional, but that was causing problems in bootstrap + * mode. Or maybe it was because the code that allocates stuff later does + * not do that conditionally. Anyway, needs to be fixed. 
+ */ + /* if (numa_procs_interleave) */ + { + int num_nodes = numa_num_configured_nodes(); + Size mem_page_size = get_memory_page_size(); + + size = add_size(size, mul_size((num_nodes + 1), mem_page_size)); + } + return size; } @@ -129,6 +172,26 @@ FastPathLockShmemSize(void) size = add_size(size, mul_size(TotalProcs, (fpLockBitsSize + fpRelIdSize))); + /* + * Same NUMA-padding logic as in PGProcShmemSize, adding a memory page per + * NUMA node - but this way we add two pages per node - one for PGPROC, + * one for fast-path arrays. In theory we could make this work just one + * page per node, by adding fast-path arrays right after PGPROC entries on + * each node. But now we allocate fast-path locks separately - good enough + * for PoC. + * + * FIXME Should be conditional, but that was causing problems in bootstrap + * mode. Or maybe it was because the code that allocates stuff later does + * not do that conditionally. Anyway, needs to be fixed. + */ + /* if (numa_procs_interleave) */ + { + int num_nodes = numa_num_configured_nodes(); + Size mem_page_size = get_memory_page_size(); + + size = add_size(size, mul_size((num_nodes + 1), mem_page_size)); + } + return size; } @@ -191,11 +254,13 @@ ProcGlobalSemas(void) void InitProcGlobal(void) { - PGPROC *procs; + PGPROC **procs; int i, j; bool found; uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts; + int procs_total; + int procs_per_node; /* Used for setup of per-backend fast-path slots. 
*/ char *fpPtr, @@ -205,6 +270,8 @@ InitProcGlobal(void) Size requestSize; char *ptr; + Size mem_page_size = get_memory_page_size(); + /* Create the ProcGlobal shared structure */ ProcGlobal = (PROC_HDR *) ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found); @@ -224,6 +291,9 @@ InitProcGlobal(void) pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PROC_NUMBER); pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PROC_NUMBER); + /* one chunk per NUMA node (without NUMA assume 1 node) */ + numa_nodes = numa_num_configured_nodes(); + /* * Create and initialize all the PGPROC structures we'll need. There are * six separate consumers: (1) normal backends, (2) autovacuum workers and @@ -241,19 +311,108 @@ InitProcGlobal(void) MemSet(ptr, 0, requestSize); - procs = (PGPROC *) ptr; - ptr = (char *) ptr + TotalProcs * sizeof(PGPROC); + /* allprocs (array of pointers to PGPROC entries) */ + procs = (PGPROC **) ptr; + ptr = (char *) ptr + TotalProcs * sizeof(PGPROC *); ProcGlobal->allProcs = procs; /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; + /* + * NUMA partitioning + * + * Now build the actual PGPROC arrays, one "chunk" per NUMA node (and one + * extra for auxiliary processes and 2PC transactions, not associated with + * any particular node). + * + * First determine how many "backend" procs to allocate per NUMA node. The + * count may not be exactly divisible, but we mostly ignore that. The last + * node may get somewhat fewer PGPROC entries, but the imbalance ought to + * be pretty small (if MaxBackends >> numa_nodes). + * + * XXX A fairer distribution is possible, but not worth it now. 
+ */ + procs_per_node = (MaxBackends + (numa_nodes - 1)) / numa_nodes; + procs_total = 0; + + /* build PGPROC entries for NUMA nodes */ + for (i = 0; i < numa_nodes; i++) + { + PGPROC *procs_node; + + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int count_node = Min(procs_per_node, MaxBackends - procs_total); + + /* make sure to align the PGPROC array to memory page */ + ptr = (char *) TYPEALIGN(mem_page_size, ptr); + + /* allocate the PGPROC chunk for this node */ + procs_node = (PGPROC *) ptr; + ptr = (char *) ptr + count_node * sizeof(PGPROC); + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + + /* add pointers to the PGPROC entries to allProcs */ + for (j = 0; j < count_node; j++) + { + procs_node[j].numa_node = i; + procs_node[j].procnumber = procs_total; + + ProcGlobal->allProcs[procs_total++] = &procs_node[j]; + } + + move_to_node((char *) procs_node, ptr, mem_page_size, i); + } + + /* + * also build PGPROC entries for auxiliary procs / prepared xacts (we + * don't assign those to any NUMA node) + * + * XXX Mostly duplicate of preceding block, could be reused. + */ + { + PGPROC *procs_node; + int count_node = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* + * Make sure to align PGPROC array to memory page (it may not be + * aligned). We won't assign this to any NUMA node, but we still don't + * want it to interfere with the preceding chunk (for the last NUMA + * node). 
+ */ + ptr = (char *) TYPEALIGN(mem_page_size, ptr); + + procs_node = (PGPROC *) ptr; + ptr = (char *) ptr + count_node * sizeof(PGPROC); + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + + /* now add the PGPROC pointers to allProcs */ + for (j = 0; j < count_node; j++) + { + procs_node[j].numa_node = -1; + procs_node[j].procnumber = procs_total; + + ProcGlobal->allProcs[procs_total++] = &procs_node[j]; + } + } + + /* we should have allocated the expected number of PGPROC entries */ + Assert(procs_total == TotalProcs); + /* * Allocate arrays mirroring PGPROC fields in a dense manner. See * PROC_HDR. * * XXX: It might make sense to increase padding for these arrays, given * how hotly they are accessed. + * + * XXX Would it make sense to NUMA-partition these chunks too, somehow? + * But those arrays are tiny, fit into a single memory page, so would need + * to be made more complex. Not sure. */ ProcGlobal->xids = (TransactionId *) ptr; ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->xids)); @@ -286,23 +445,100 @@ InitProcGlobal(void) /* For asserts checking we did not overflow. */ fpEndPtr = fpPtr + requestSize; - for (i = 0; i < TotalProcs; i++) + /* reset the count */ + procs_total = 0; + + /* + * Mimic the same logic as above, but for fast-path locking. + */ + for (i = 0; i < numa_nodes; i++) { - PGPROC *proc = &procs[i]; + char *startptr; + char *endptr; - /* Common initialization for all PGPROCs, regardless of type. */ + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int procs_node = Min(procs_per_node, MaxBackends - procs_total); + + /* align to memory page, to make move_pages possible */ + fpPtr = (char *) TYPEALIGN(mem_page_size, fpPtr); + + startptr = fpPtr; + endptr = fpPtr + procs_node * (fpLockBitsSize + fpRelIdSize); + + move_to_node(startptr, endptr, mem_page_size, i); /* - * Set the fast-path lock arrays, and move the pointer. 
We interleave - * the two arrays, to (hopefully) get some locality for each backend. + * Now point the PGPROC entries to the fast-path arrays, and also + * advance the fpPtr. */ - proc->fpLockBits = (uint64 *) fpPtr; - fpPtr += fpLockBitsSize; + for (j = 0; j < procs_node; j++) + { + PGPROC *proc = ProcGlobal->allProcs[procs_total++]; + + /* cross-check we got the expected NUMA node */ + Assert(proc->numa_node == i); + Assert(proc->procnumber == (procs_total - 1)); + + /* + * Set the fast-path lock arrays, and move the pointer. We + * interleave the two arrays, to (hopefully) get some locality for + * each backend. + */ + proc->fpLockBits = (uint64 *) fpPtr; + fpPtr += fpLockBitsSize; - proc->fpRelId = (Oid *) fpPtr; - fpPtr += fpRelIdSize; + proc->fpRelId = (Oid *) fpPtr; + fpPtr += fpRelIdSize; - Assert(fpPtr <= fpEndPtr); + Assert(fpPtr <= fpEndPtr); + } + + Assert(fpPtr == endptr); + } + + /* auxiliary processes / prepared xacts */ + { + /* the last NUMA node may get fewer PGPROC entries, but meh */ + int procs_node = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* align to memory page, to make move_pages possible */ + fpPtr = (char *) TYPEALIGN(mem_page_size, fpPtr); + + /* now point the PGPROC entries to the fast-path arrays */ + for (j = 0; j < procs_node; j++) + { + PGPROC *proc = ProcGlobal->allProcs[procs_total++]; + + /* cross-check we got PGPROC with no NUMA node assigned */ + Assert(proc->numa_node == -1); + Assert(proc->procnumber == (procs_total - 1)); + + /* + * Set the fast-path lock arrays, and move the pointer. We + * interleave the two arrays, to (hopefully) get some locality for + * each backend. + */ + proc->fpLockBits = (uint64 *) fpPtr; + fpPtr += fpLockBitsSize; + + proc->fpRelId = (Oid *) fpPtr; + fpPtr += fpRelIdSize; + + Assert(fpPtr <= fpEndPtr); + } + } + + /* Should have consumed exactly the expected amount of fast-path memory. 
*/ + Assert(fpPtr <= fpEndPtr); + + /* make sure we allocated the expected number of PGPROC entries */ + Assert(procs_total == TotalProcs); + + for (i = 0; i < TotalProcs; i++) + { + PGPROC *proc = procs[i]; + + Assert(proc->procnumber == i); /* * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact @@ -366,15 +602,12 @@ InitProcGlobal(void) pg_atomic_init_u64(&(proc->waitStart), 0); } - /* Should have consumed exactly the expected amount of fast-path memory. */ - Assert(fpPtr == fpEndPtr); - /* * Save pointers to the blocks of PGPROC structures reserved for auxiliary * processes and prepared transactions. */ - AuxiliaryProcs = &procs[MaxBackends]; - PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS]; + AuxiliaryProcs = procs[MaxBackends]; + PreparedXactProcs = procs[MaxBackends + NUM_AUXILIARY_PROCS]; /* Create ProcStructLock spinlock, too */ ProcStructLock = (slock_t *) ShmemInitStruct("ProcStructLock spinlock", @@ -435,7 +668,45 @@ InitProcess(void) if (!dlist_is_empty(procgloballist)) { - MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + /* + * With numa interleaving of PGPROC, try to get a PROC entry from the + * right NUMA node (when the process starts). + * + * XXX The process may move to a different NUMA node later, but + * there's not much we can do about that. 
+ */ + if (numa_procs_interleave) + { + dlist_mutable_iter iter; + unsigned cpu; + unsigned node; + int rc; + + rc = getcpu(&cpu, &node); + if (rc != 0) + elog(ERROR, "getcpu failed: %m"); + + MyProc = NULL; + + dlist_foreach_modify(iter, procgloballist) + { + PGPROC *proc; + + proc = dlist_container(PGPROC, links, iter.cur); + + if (proc->numa_node == node) + { + MyProc = proc; + dlist_delete(iter.cur); + break; + } + } + } + + /* didn't find PGPROC from the correct NUMA node, pick any free one */ + if (MyProc == NULL) + MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + SpinLockRelease(ProcStructLock); } else @@ -1988,7 +2259,7 @@ ProcSendSignal(ProcNumber procNumber) if (procNumber < 0 || procNumber >= ProcGlobal->allProcCount) elog(ERROR, "procNumber out of range"); - SetLatch(&ProcGlobal->allProcs[procNumber].procLatch); + SetLatch(&ProcGlobal->allProcs[procNumber]->procLatch); } /* @@ -2063,3 +2334,60 @@ BecomeLockGroupMember(PGPROC *leader, int pid) return ok; } + +/* copy from buf_init.c */ +static Size +get_memory_page_size(void) +{ + Size os_page_size; + Size huge_page_size; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + os_page_size = sysinfo.dwPageSize; +#else + os_page_size = sysconf(_SC_PAGESIZE); +#endif + + /* + * XXX This is a bit annoying/confusing, because we may get a different + * result depending on when we call it. Before mmap() we don't know if the + * huge pages get used, so we assume they will. And then if we don't get + * huge pages, we'll waste memory etc. + */ + + /* assume huge pages get used, unless HUGE_PAGES_OFF */ + if (huge_pages_status == HUGE_PAGES_OFF) + huge_page_size = 0; + else + GetHugePageSize(&huge_page_size, NULL); + + return Max(os_page_size, huge_page_size); +} + +/* + * move_to_node + * move all pages in the given range to the requested NUMA node + * + * XXX This is expected to only process fairly small number of pages, so no + * need to do batching etc. 
Just move pages one by one. + */ +static void +move_to_node(char *startptr, char *endptr, Size mem_page_size, int node) +{ + while (startptr < endptr) + { + int r, + status; + + r = numa_move_pages(0, 1, (void **) &startptr, &node, &status, 0); + + if (r != 0) + elog(WARNING, "failed to move page to NUMA node %d (r = %d, status = %d)", + node, r, status); + + startptr += mem_page_size; + } +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index a11bc71a386..6ee4684d1b8 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -149,6 +149,7 @@ int MaxBackends = 0; bool numa_buffers_interleave = false; bool numa_localalloc = false; bool numa_partition_freelist = false; +bool numa_procs_interleave = false; /* GUC parameters for vacuum */ int VacuumBufferUsageLimit = 2048; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 0552ed62cc7..7b718760248 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2146,6 +2146,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"numa_procs_interleave", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Enables NUMA interleaving of PGPROC entries."), + gettext_noop("When enabled, the PGPROC entries are interleaved to all NUMA nodes."), + }, + &numa_procs_interleave, + false, + NULL, NULL, NULL + }, + { {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 66baf2bf33e..cdeee8dccba 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -181,6 +181,7 @@ extern PGDLLIMPORT int max_parallel_workers; extern PGDLLIMPORT bool numa_buffers_interleave; extern PGDLLIMPORT bool numa_localalloc; extern PGDLLIMPORT bool numa_partition_freelist; +extern PGDLLIMPORT bool 
numa_procs_interleave; extern PGDLLIMPORT int commit_timestamp_buffers; extern PGDLLIMPORT int multixact_member_buffers; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f9b3fcfbf1..5cb1632718e 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -194,6 +194,8 @@ struct PGPROC * vacuum must not remove tuples deleted by * xid >= xmin ! */ + int procnumber; /* index in ProcGlobal->allProcs */ + int pid; /* Backend's process ID; 0 if prepared xact */ int pgxactoff; /* offset into various ProcGlobal->arrays with @@ -319,6 +321,9 @@ struct PGPROC PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */ dlist_head lockGroupMembers; /* list of members, if I'm a leader */ dlist_node lockGroupLink; /* my member link, if I'm a member */ + + /* NUMA node */ + int numa_node; }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ @@ -383,7 +388,7 @@ extern PGDLLIMPORT PGPROC *MyProc; typedef struct PROC_HDR { /* Array of PGPROC structures (not including dummies for prepared txns) */ - PGPROC *allProcs; + PGPROC **allProcs; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; @@ -435,8 +440,8 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs; /* * Accessors for getting PGPROC given a ProcNumber and vice versa. */ -#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)]) -#define GetNumberFromPGProc(proc) ((proc) - &ProcGlobal->allProcs[0]) +#define GetPGProcByNumber(n) (ProcGlobal->allProcs[(n)]) +#define GetNumberFromPGProc(proc) ((proc)->procnumber) /* * We set aside some extra PGPROC structures for "special worker" processes, -- 2.49.0