From 06a43d54498ab5049c12a458cfbd4fe3b3b168c2 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 22 May 2025 18:38:41 +0200 Subject: [PATCH v3 4/7] NUMA: partition buffer freelist Instead of a single buffer freelist, partition into multiple smaller lists, to reduce lock contention, and to spread the buffers over all NUMA nodes more evenly. This uses the buffer partitioning scheme introduced by the earlier patch, i.e. the partitions will "align" with NUMA nodes, etc. It also extends the "pg_buffercache_partitions" view, to include information about each freelist (number of consumed buffers, ...). When allocating a buffer, it's taken from the correct freelist (same NUMA node). Note: This is (probably) more important than partitioning ProcArray. --- .../pg_buffercache--1.6--1.7.sql | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 24 +- src/backend/storage/buffer/freelist.c | 360 ++++++++++++++++-- src/backend/utils/init/globals.c | 1 + src/backend/utils/misc/guc_tables.c | 10 + src/include/miscadmin.h | 1 + src/include/storage/bufmgr.h | 4 +- 7 files changed, 372 insertions(+), 30 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql index bd97246f6ab..3871c261528 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -12,7 +12,7 @@ LANGUAGE C PARALLEL SAFE; -- Create a view for convenient access. CREATE VIEW pg_buffercache_partitions AS SELECT P.* FROM pg_buffercache_partitions() AS P - (partition integer, numa_node integer, num_buffers integer, first_buffer integer, last_buffer integer); + (partition integer, numa_node integer, num_buffers integer, first_buffer integer, last_buffer integer, buffers_consumed bigint, buffers_remain bigint, buffers_free bigint); -- Don't want these to be available to public. 
REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 8baa7c7b543..668ada8c47b 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -27,7 +27,7 @@ #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 #define NUM_BUFFERCACHE_NUMA_ELEM 3 -#define NUM_BUFFERCACHE_PARTITIONS_ELEM 5 +#define NUM_BUFFERCACHE_PARTITIONS_ELEM 8 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -812,6 +812,12 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 5, "last_buffer", INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "buffers_consumed", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "buffers_remain", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 8, "buffers_free", + INT8OID, -1, 0); funcctx->user_fctx = BlessTupleDesc(tupledesc); @@ -833,12 +839,19 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) first_buffer, last_buffer; + uint64 buffers_consumed, + buffers_remain, + buffers_free; + Datum values[NUM_BUFFERCACHE_PARTITIONS_ELEM]; bool nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM]; BufferPartitionGet(i, &numa_node, &num_buffers, &first_buffer, &last_buffer); + FreelistPartitionGetInfo(i, &buffers_consumed, &buffers_remain, + &buffers_free); + values[0] = Int32GetDatum(i); nulls[0] = false; @@ -854,6 +867,15 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) values[4] = Int32GetDatum(last_buffer); nulls[4] = false; + values[5] = Int64GetDatum(buffers_consumed); + nulls[5] = false; + + values[6] = Int64GetDatum(buffers_remain); + nulls[6] = false; + + values[7] = Int64GetDatum(buffers_free); + nulls[7] = false; + /* Build and return the tuple. 
*/ tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); result = HeapTupleGetDatum(tuple); diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index e046526c149..c3fbd651dd5 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -15,14 +15,52 @@ */ #include "postgres.h" +#include +#include + +#ifdef USE_LIBNUMA +#include +#include +#endif + #include "pgstat.h" #include "port/atomics.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/ipc.h" #include "storage/proc.h" #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) +/* + * Represents one freelist partition. + */ +typedef struct BufferStrategyFreelist +{ + /* Spinlock: protects the values below */ + slock_t freelist_lock; + + /* + * XXX Not sure why this needs to be aligned like this. Need to ask + * Andres. + */ + int firstFreeBuffer __attribute__((aligned(64))); /* Head of list of + * unused buffers */ + + /* Number of buffers consumed from this list. */ + uint64 consumed; +} BufferStrategyFreelist; + +/* + * The minimum number of partitions we want to have. We want at least this + * number of partitions, even on non-NUMA system, as it helps with contention + * for buffers. But with multiple NUMA nodes, we want a separate partition per + * node. But we may get multiple partitions per node, for low node count. + * + * With multiple partitions per NUMA node, we pick the partition based on CPU + * (or some other parameter). + */ +#define MIN_FREELIST_PARTITIONS 4 /* * The shared freelist control information. @@ -39,8 +77,6 @@ typedef struct */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. @@ -51,8 +87,19 @@ typedef struct /* * Bgworker process to be notified upon activity or -1 if none. 
See * StrategyNotifyBgWriter. + * + * XXX Not sure why this needs to be aligned like this. Need to ask + * Andres. Also, shouldn't the alignment be specified after, like for + * "consumed"? */ - int bgwprocno; + int __attribute__((aligned(64))) bgwprocno; + + /* info about freelist partitioning */ + int num_nodes; /* effectively number of NUMA nodes */ + int num_partitions; + int num_partitions_per_node; + + BufferStrategyFreelist freelists[FLEXIBLE_ARRAY_MEMBER]; } BufferStrategyControl; /* Pointers to shared state */ @@ -157,6 +204,88 @@ ClockSweepTick(void) return victim; } +static int +calculate_partition_index() +{ + int rc; + unsigned cpu; + unsigned node; + int index; + + Assert(StrategyControl->num_partitions == + (StrategyControl->num_nodes * StrategyControl->num_partitions_per_node)); + + /* + * freelist is partitioned, so determine the CPU/NUMA node, and pick a + * list based on that. + */ + rc = getcpu(&cpu, &node); + if (rc != 0) + elog(ERROR, "getcpu failed: %m"); + + /* + * XXX We shouldn't get nodes that we haven't considered while building the + * partitions. Maybe if we allow this (e.g. due to support adjusting the + * NUMA stuff at runtime), we should just do our best to minimize the + * conflicts somehow. But it'll make the mapping harder, so for now we + * ignore it. + */ + if (node > StrategyControl->num_nodes) + elog(ERROR, "node out of range: %u > %d", node, StrategyControl->num_nodes); + + /* + * Find the partition. If we have a single partition per node, we can + * calculate the index directly from node. Otherwise we need to do two + * steps, using node and then cpu. 
+ */ + if (StrategyControl->num_partitions_per_node == 1) + { + index = (node % StrategyControl->num_partitions); + } + else + { + int index_group, + index_part; + + /* two steps - calculate group from node, partition from cpu */ + index_group = (node % StrategyControl->num_nodes); + index_part = (cpu % StrategyControl->num_partitions_per_node); + + index = (index_group * StrategyControl->num_partitions_per_node) + + index_part; + } + + return index; +} + +/* + * ChooseFreeList + * Pick the buffer freelist to use, depending on the CPU and NUMA node. + * + * Without partitioned freelists (numa_partition_freelist=false), there's only + * a single freelist, so use that. + * + * With partitioned freelists, we have multiple ways to pick the freelist + * for the backend: + * + * - one freelist per CPU, use the freelist for CPU the task executes on + * + * - one freelist per NUMA node, use the freelist for node task executes on + * + * - use fixed number of freelists, map processes to lists based on PID + * + * There may be some other strategies, not sure. The important thing is this + * needs to be reflected during initialization, i.e. we need to create the + * right number of lists. + */ +static BufferStrategyFreelist * +ChooseFreeList(void) +{ + int index = calculate_partition_index(); + + return &StrategyControl->freelists[index]; +} + /* * have_free_buffer -- a lockless check to see if there is a free buffer in * buffer pool. 
@@ -168,10 +297,13 @@ ClockSweepTick(void) bool have_free_buffer(void) { - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; + for (int i = 0; i < StrategyControl->num_partitions; i++) + { + if (StrategyControl->freelists[i].firstFreeBuffer >= 0) + return true; + } + + return false; } /* @@ -193,6 +325,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r int bgwprocno; int trycounter; uint32 local_buf_state; /* to avoid repeated (de-)referencing */ + BufferStrategyFreelist *freelist; *from_ring = false; @@ -259,31 +392,35 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to * manipulate them without holding the spinlock. */ - if (StrategyControl->firstFreeBuffer >= 0) + freelist = ChooseFreeList(); + if (freelist->firstFreeBuffer >= 0) { while (true) { /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + SpinLockAcquire(&freelist->freelist_lock); - if (StrategyControl->firstFreeBuffer < 0) + if (freelist->firstFreeBuffer < 0) { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + SpinLockRelease(&freelist->freelist_lock); break; } - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); + buf = GetBufferDescriptor(freelist->firstFreeBuffer); Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; + freelist->firstFreeBuffer = buf->freeNext; buf->freeNext = FREENEXT_NOT_IN_LIST; + /* increment number of buffers we consumed from this list */ + freelist->consumed++; + /* * Release the lock so someone else can access the freelist while * we check out this buffer. 
*/ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + SpinLockRelease(&freelist->freelist_lock); /* * If the buffer is pinned or has a nonzero usage_count, we cannot @@ -305,7 +442,17 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* + * Nothing on the freelist, so run the "clock sweep" algorithm + * + * XXX Should we also make this NUMA-aware, to only access buffers from + * the same NUMA node? That'd probably mean we need to make the clock + * sweep NUMA-aware, perhaps by having multiple clock sweeps, each for a + * subset of buffers. But that also means each process could "sweep" only + * a fraction of buffers, even if the other buffers are better candidates + * for eviction. Would that also mean we'd have multiple bgwriters, one + * for each node, or would one bgwriter handle all of that? + */ trycounter = NBuffers; for (;;) { @@ -356,7 +503,22 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r void StrategyFreeBuffer(BufferDesc *buf) { - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + BufferStrategyFreelist *freelist; + + /* + * We don't want to call ChooseFreeList() again, because we might get a + * completely different freelist - either a different partition in the + * same group, or even a different group if the NUMA node changed. But we + * can calculate the proper freelist from the buffer id. 
+ */ + int index = (BufferGetNode(buf->buf_id) * StrategyControl->num_partitions_per_node) + + (buf->buf_id % StrategyControl->num_partitions_per_node); + + Assert((index >= 0) && (index < StrategyControl->num_partitions)); + + freelist = &StrategyControl->freelists[index]; + + SpinLockAcquire(&freelist->freelist_lock); /* * It is possible that we are told to put something in the freelist that @@ -364,11 +526,11 @@ StrategyFreeBuffer(BufferDesc *buf) */ if (buf->freeNext == FREENEXT_NOT_IN_LIST) { - buf->freeNext = StrategyControl->firstFreeBuffer; - StrategyControl->firstFreeBuffer = buf->buf_id; + buf->freeNext = freelist->firstFreeBuffer; + freelist->firstFreeBuffer = buf->buf_id; } - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + SpinLockRelease(&freelist->freelist_lock); } /* @@ -432,6 +594,42 @@ StrategyNotifyBgWriter(int bgwprocno) SpinLockRelease(&StrategyControl->buffer_strategy_lock); } +/* prints some debug info / stats about freelists at shutdown */ +static void +freelist_before_shmem_exit(int code, Datum arg) +{ + for (int p = 0; p < StrategyControl->num_partitions; p++) + { + BufferStrategyFreelist *freelist = &StrategyControl->freelists[p]; + uint64 remain = 0; + uint64 actually_free = 0; + int cur = freelist->firstFreeBuffer; + + while (cur >= 0) + { + uint32 local_buf_state; + BufferDesc *buf; + + buf = GetBufferDescriptor(cur); + + remain++; + + local_buf_state = LockBufHdr(buf); + + if (!(local_buf_state & BM_TAG_VALID)) + actually_free++; + + UnlockBufHdr(buf, local_buf_state); + + cur = buf->freeNext; + } + elog(LOG, "NUMA: freelist partition %d, firstF: %d: consumed: %lu, remain: %lu, actually free: %lu", + p, + freelist->firstFreeBuffer, + freelist->consumed, + remain, actually_free); + } +} /* * StrategyShmemSize @@ -445,12 +643,20 @@ Size StrategyShmemSize(void) { Size size = 0; + int num_partitions; + int num_nodes; + + BufferPartitionParams(&num_partitions, &num_nodes); /* size of lookup hash table ... 
see comment in StrategyInitialize */ size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); /* size of the shared replacement strategy control block */ - size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); + size = add_size(size, MAXALIGN(offsetof(BufferStrategyControl, freelists))); + + /* size of freelist partitions (at least one per NUMA node) */ + size = add_size(size, MAXALIGN(mul_size(sizeof(BufferStrategyFreelist), + num_partitions))); return size; } @@ -467,6 +673,18 @@ StrategyInitialize(bool init) { bool found; + int num_nodes; + int num_partitions; + int num_partitions_per_node; + + /* */ + BufferPartitionParams(&num_partitions, &num_nodes); + + /* always a multiple of NUMA nodes */ + Assert(num_partitions % num_nodes == 0); + + num_partitions_per_node = (num_partitions / num_nodes); + /* * Initialize the shared buffer lookup hashtable. * @@ -484,7 +702,8 @@ StrategyInitialize(bool init) */ StrategyControl = (BufferStrategyControl *) ShmemInitStruct("Buffer Strategy Status", - sizeof(BufferStrategyControl), + MAXALIGN(offsetof(BufferStrategyControl, freelists)) + + MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions), &found); if (!found) @@ -494,13 +713,10 @@ StrategyInitialize(bool init) */ Assert(init); - SpinLockInit(&StrategyControl->buffer_strategy_lock); + /* register callback to dump some stats on exit */ + before_shmem_exit(freelist_before_shmem_exit, 0); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). 
- */ - StrategyControl->firstFreeBuffer = 0; + SpinLockInit(&StrategyControl->buffer_strategy_lock); /* Initialize the clock sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); @@ -511,6 +727,52 @@ StrategyInitialize(bool init) /* No pending notification */ StrategyControl->bgwprocno = -1; + + /* initialize the partitioned clocksweep */ + StrategyControl->num_partitions = num_partitions; + StrategyControl->num_nodes = num_nodes; + StrategyControl->num_partitions_per_node = num_partitions_per_node; + + /* + * Rebuild the freelist - right now all buffers are in one huge list, + * we want to rework that into multiple lists. Start by initializing + * the strategy to have empty lists. + */ + for (int nfreelist = 0; nfreelist < num_partitions; nfreelist++) + { + int node, + num_buffers, + first_buffer, + last_buffer; + + BufferStrategyFreelist *freelist; + + freelist = &StrategyControl->freelists[nfreelist]; + + freelist->firstFreeBuffer = FREENEXT_END_OF_LIST; + + SpinLockInit(&freelist->freelist_lock); + + /* get info about the buffer partition */ + BufferPartitionGet(nfreelist, &node, + &num_buffers, &first_buffer, &last_buffer); + + /* + * Walk through buffers for each partition, add them to the list. + * Walk from the end, because we're adding the buffers to the + * beginning. 
+ */ + + for (int i = last_buffer; i >= first_buffer; i--) + { + BufferDesc *buf = GetBufferDescriptor(i); + + /* add to the freelist */ + buf->freeNext = freelist->firstFreeBuffer; + freelist->firstFreeBuffer = i; + } + + } } else Assert(!init); @@ -847,3 +1109,47 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r return true; } + +void +FreelistPartitionGetInfo(int idx, uint64 *consumed, uint64 *remain, uint64 *actually_free) +{ + BufferStrategyFreelist *freelist; + int cur; + + /* stats */ + uint64 cnt_remain = 0; + uint64 cnt_free = 0; + + Assert((idx >= 0) && (idx < StrategyControl->num_partitions)); + + freelist = &StrategyControl->freelists[idx]; + + /* stat */ + SpinLockAcquire(&freelist->freelist_lock); + + *consumed = freelist->consumed; + + cur = freelist->firstFreeBuffer; + while (cur >= 0) + { + uint32 local_buf_state; + BufferDesc *buf; + + buf = GetBufferDescriptor(cur); + + cnt_remain++; + + local_buf_state = LockBufHdr(buf); + + if (!(local_buf_state & BM_TAG_VALID)) + cnt_free++; + + UnlockBufHdr(buf, local_buf_state); + + cur = buf->freeNext; + } + SpinLockRelease(&freelist->freelist_lock); + + *remain = cnt_remain; + *actually_free = cnt_free; +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index f5359db3656..a11bc71a386 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -148,6 +148,7 @@ int MaxBackends = 0; /* NUMA stuff */ bool numa_buffers_interleave = false; bool numa_localalloc = false; +bool numa_partition_freelist = false; /* GUC parameters for vacuum */ int VacuumBufferUsageLimit = 2048; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index a21f20800fb..0552ed62cc7 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2136,6 +2136,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"numa_partition_freelist", PGC_POSTMASTER, 
DEVELOPER_OPTIONS, + gettext_noop("Enables buffer freelists to be partitioned per NUMA node."), + gettext_noop("When enabled, we create a separate freelist per NUMA node."), + }, + &numa_partition_freelist, + false, + NULL, NULL, NULL + }, + { {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 692871a401f..66baf2bf33e 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -180,6 +180,7 @@ extern PGDLLIMPORT int max_parallel_workers; extern PGDLLIMPORT bool numa_buffers_interleave; extern PGDLLIMPORT bool numa_localalloc; +extern PGDLLIMPORT bool numa_partition_freelist; extern PGDLLIMPORT int commit_timestamp_buffers; extern PGDLLIMPORT int multixact_member_buffers; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index deaf4f19fa4..df127274190 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -347,7 +347,9 @@ extern int GetAccessStrategyBufferCount(BufferAccessStrategy strategy); extern int GetAccessStrategyPinLimit(BufferAccessStrategy strategy); extern void FreeAccessStrategy(BufferAccessStrategy strategy); - +extern void FreelistPartitionGetInfo(int idx, + uint64 *consumed, uint64 *remain, + uint64 *actually_free); /* inline functions */ -- 2.50.1