From d6bc867374a1195d2426de629b0ffd1f22b040a9 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 15 Oct 2025 15:03:48 +0200 Subject: [PATCH v20251015 07/12] fix: map all buffer partitions for NUMA node at once Don't map partitions individually; instead map all partitions of a NUMA node at once. This means we don't need to worry about memory pages that span two partitions (min_node_buffers applies to the whole node). Pages split between two partitions might be causing the problems with huge pages reported by Alexey Makhmutov. Also lower the "shared buffers too small" message from WARNING to LOG. Discussion: bf95094a-77c2-46cf-913a-443f7419bc79@postgrespro.ru --- src/backend/storage/buffer/buf_init.c | 42 ++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 2fd7f937ffb..65f45346bd1 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -460,7 +460,7 @@ buffer_partitions_prepare(void) numa_can_partition = true; /* assume we can allocate to nodes */ if (numa_nodes > max_nodes) { - elog(WARNING, "shared buffers too small for %d nodes (max nodes %d)", + elog(LOG, "shared buffers too small for %d nodes (max nodes %d)", numa_nodes, max_nodes); numa_can_partition = false; } @@ -630,8 +630,8 @@ buffer_partitions_init(void) * memory pages (smaller than BLCKSZ) we'd split all buffers to multiple * NUMA nodes. And we don't want that. * - * But even with huge pages it seems like a good idea to not have mapping - * for each page. + * But even with huge pages it seems like a good idea to not map pages + * one by one. * * So we always assign a larger contiguous chunk of buffers to the same * NUMA node, as calculated by choose_chunk_buffers(). We try to keep the @@ -649,35 +649,51 @@ buffer_partitions_init(void) * We need to account for partitions being of different length, when the * NBuffers is not nicely divisible. To do that we keep track of the start * of the next partition. 
+ * + * We always map all partitions for the same node at once, so that we + * don't need to worry about alignment of memory pages that get split + * between partitions (we only worry about min_node_buffers for whole + * NUMA nodes, not for individual partitions). */ buffers_ptr = BufferBlocks; descriptors_ptr = (char *) BufferDescriptors; - for (int i = 0; i < numa_partitions; i++) + for (int n = 0; n < numa_nodes; n++) { - BufferPartition *part = &BufferPartitionsArray->partitions[i]; char *startptr, *endptr; + int num_buffers = 0; + + /* sum buffers in all partitions for this node */ + for (int p = 0; p < parts_per_node; p++) + { + int pidx = (n * parts_per_node + p); + BufferPartition *part = &BufferPartitionsArray->partitions[pidx]; + + Assert(part->numa_node == n); + + num_buffers += part->num_buffers; + } /* first map buffers */ startptr = buffers_ptr; - endptr = startptr + ((Size) part->num_buffers * BLCKSZ); + endptr = startptr + ((Size) num_buffers * BLCKSZ); buffers_ptr = endptr; /* start of the next partition */ - elog(DEBUG1, "NUMA: buffer_partitions_init: %d => %d buffers %d start %p end %p (size %zd)", - i, part->numa_node, part->num_buffers, startptr, endptr, (endptr - startptr)); + elog(DEBUG1, "NUMA: buffer_partitions_init: %d => buffers %d start %p end %p (size %zd)", + n, num_buffers, startptr, endptr, (endptr - startptr)); - pg_numa_move_to_node(startptr, endptr, part->numa_node); + pg_numa_move_to_node(startptr, endptr, n); /* now do the same for buffer descriptors */ startptr = descriptors_ptr; - endptr = startptr + ((Size) part->num_buffers * sizeof(BufferDescPadded)); + endptr = startptr + ((Size) num_buffers * sizeof(BufferDescPadded)); descriptors_ptr = endptr; - elog(DEBUG1, "NUMA: buffer_partitions_init: %d => %d descriptors %d start %p end %p (size %zd)", - i, part->numa_node, part->num_buffers, startptr, endptr, (endptr - startptr)); + elog(DEBUG1, "NUMA: buffer_partitions_init: %d => descriptors %d start %p end %p (size %zd)", + n, 
num_buffers, startptr, endptr, (endptr - startptr)); - pg_numa_move_to_node(startptr, endptr, part->numa_node); + pg_numa_move_to_node(startptr, endptr, n); } /* we should have consumed the arrays exactly */ -- 2.51.0