From 963e48dadc3ecc6941ee8e70d0f6d84d13ceb276 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 15 Oct 2025 17:42:23 +0200 Subject: [PATCH v20251015 08/12] fix: handle disabled NUMA-partitioning of buffers We still partition the buffers, because we want to partition the clock-sweep (and that relies on partitions). But we don't map the partitions to NUMA nodes. The NUMA partitioning may be disabled for a number of reasons. The build may not have libnuma enabled, debug_numa may not include "buffers", and/or the shared buffers may be too small (especially with huge pages). --- src/backend/storage/buffer/buf_init.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 65f45346bd1..d0efa102d82 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -301,6 +301,10 @@ BufferGetNode(Buffer buffer) if (numa_buffers_per_node == -1) return 0; + /* no NUMA-aware partitioning */ + if ((numa_flags & NUMA_BUFFERS) == 0) + return 0; + return (buffer / numa_buffers_per_node); } @@ -460,10 +464,15 @@ buffer_partitions_prepare(void) numa_can_partition = true; /* assume we can allocate to nodes */ if (numa_nodes > max_nodes) { - elog(LOG, "shared buffers too small for %d nodes (max nodes %d)", + elog(NOTICE, "shared buffers too small for %d nodes (max nodes %d)", numa_nodes, max_nodes); numa_can_partition = false; } + else if ((numa_flags & NUMA_BUFFERS) == 0) + { + elog(NOTICE, "NUMA-partitioning of buffers disabled"); + numa_can_partition = false; + } /* * We know we can partition to the desired number of nodes, now it's time @@ -483,8 +492,16 @@ buffer_partitions_prepare(void) /* * Finally, calculate how many buffers we'll assign to a single NUMA node. - * If we have only a single node, or can't map to that many nodes, just - * take a "fair share" of buffers. 
+ * If we have only a single node, or when we can't partition for some + * reason, just take a "fair share" of buffers. This can happen for a + * number of reasons - missing NUMA support, partitioning of buffers not + * enabled, or not enough buffers for this many nodes. + * + * We still build partitions, because we want to allow partitioning of + * the clock-sweep later. + * + * The number of buffers for each partition is calculated later, once we + * have allocated the shared memory (because that's where we store it). * * XXX In both cases the last node can get fewer buffers. */ @@ -599,7 +616,7 @@ buffer_partitions_init(void) Assert((num_buffers > 0) && (num_buffers <= part_buffers)); /* XXX we should get the actual node ID from the mask */ - if ((numa_flags & NUMA_BUFFERS) != 0) + if (numa_can_partition) part->numa_node = n; else part->numa_node = -1; -- 2.51.0