From 4cbe8788366287b0347f006ee90bf1470af74877 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 15 Oct 2025 13:59:29 +0200 Subject: [PATCH v20251121 4/7] clock-sweep: scan all partitions When looking for a free buffer, scan all clock-sweep partitions, not just the "home" one. All buffers in the home partition may be pinned, in which case we should not fail. Instead, advance to the next partition, in a round-robin way, and only fail after scanning through all of them. --- src/backend/storage/buffer/freelist.c | 91 ++++++++++++++++------- src/test/recovery/t/027_stream_regress.pl | 5 -- 2 files changed, 63 insertions(+), 33 deletions(-) diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 169071032b4..3af82e267c6 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -167,6 +167,9 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); static ClockSweep *ChooseClockSweep(bool balance); +static BufferDesc *StrategyGetBufferPartition(ClockSweep *sweep, + BufferAccessStrategy strategy, + uint32 *buf_state); /* * clocksweep allocation balancing @@ -201,10 +204,9 @@ static int clocksweep_count = 0; * id of the buffer now under the hand. */ static inline uint32 -ClockSweepTick(void) +ClockSweepTick(ClockSweep *sweep) { uint32 victim; - ClockSweep *sweep = ChooseClockSweep(true); /* * Atomically move hand ahead one buffer - if there's several processes @@ -370,7 +372,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r { BufferDesc *buf; int bgwprocno; - int trycounter; + ClockSweep *sweep, + *sweep_start; /* starting clock-sweep partition */ *from_ring = false; @@ -424,37 +427,69 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r /* * Use the "clock sweep" algorithm to find a free buffer * - * XXX Note that ClockSweepTick() is NUMA-aware, i.e. it only looks at - * buffers from a single partition, aligned with the NUMA node. That means - * it only accesses buffers from the same NUMA node. - * - * XXX That also means each process "sweeps" only a fraction of buffers, - * even if the other buffers are better candidates for eviction. Maybe - * there should be some logic to "steal" buffers from other freelists or - * other nodes? + * Start with the "preferred" partition, and then proceed in a round-robin + * manner. If we cycle back to the starting partition, it means none of the + * partitions has unpinned buffers. * - * XXX Would that also mean we'd have multiple bgwriters, one for each - * node, or would one bgwriter handle all of that? + * XXX Does this need to do similar balancing "balancing" as for bgwriter + * in StrategySyncBalance? Maybe it's be enough to simply pick the initial + * partition that way? We'd only getting a single buffer, so not much chance + * to balance over many allocations. * - * XXX This only searches a single partition, which can result in "no - * unpinned buffers available" even if there are buffers in other - * partitions. Should be fixed by falling back to other partitions if - * needed. - * - * XXX Also, the trycounter should not be set to NBuffers, but to buffer - * count for that one partition. In fact, this should not call ClockSweepTick - * for every iteration. The call is likely quite expensive (does a lot - * of stuff), and also may return a different partition on each call. - * We should just do it once, and then do the for(;;) loop. And then - * maybe advance to the next partition, until we scan through all of them. + * XXX But actually, we're calling ChooseClockSweep() with balance=true, so + * maybe it already does balancing? */ - trycounter = NBuffers; + sweep = ChooseClockSweep(true); + sweep_start = sweep; + for (;;) + { + buf = StrategyGetBufferPartition(sweep, strategy, buf_state); + + /* found a buffer in the "sweep" partition, we're done */ + if (buf != NULL) + return buf; + + /* + * Try advancing to the next partition, round-robin (if last partition, + * wrap around to the beginning). + * + * XXX This is a bit ugly, there must be a better way to advance to the + * next partition. + */ + if (sweep == &StrategyControl->sweeps[StrategyControl->num_partitions - 1]) + sweep = StrategyControl->sweeps; + else + sweep++; + + /* we've scanned all partitions */ + if (sweep == sweep_start) + break; + } + + /* we shouldn't get here if there are unpinned buffers */ + elog(ERROR, "no unpinned buffers available"); +} + +/* + * StrategyGetBufferPartition + * get a free buffer from a single clock-sweep partition + * + * Returns NULL if there are no free (unpinned) buffers in the partition. +*/ +static BufferDesc * +StrategyGetBufferPartition(ClockSweep *sweep, BufferAccessStrategy strategy, + uint32 *buf_state) +{ + BufferDesc *buf; + int trycounter; + + trycounter = sweep->numBuffers; for (;;) { uint32 old_buf_state; uint32 local_buf_state; - buf = GetBufferDescriptor(ClockSweepTick()); + buf = GetBufferDescriptor(ClockSweepTick(sweep)); /* * Check whether the buffer can be used and pin it if so. Do this @@ -482,7 +517,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * one eventually, but it's probably better to fail than * to risk getting stuck in an infinite loop. */ - elog(ERROR, "no unpinned buffers available"); + return NULL; } break; } @@ -501,7 +536,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, local_buf_state)) { - trycounter = NBuffers; + trycounter = sweep->numBuffers; break; } } diff --git a/src/test/recovery/t/027_stream_regress.pl b/src/test/recovery/t/027_stream_regress.pl index 98b146ed4b7..589c79d97d3 100644 --- a/src/test/recovery/t/027_stream_regress.pl +++ b/src/test/recovery/t/027_stream_regress.pl @@ -18,11 +18,6 @@ $node_primary->adjust_conf('postgresql.conf', 'max_connections', '25'); $node_primary->append_conf('postgresql.conf', 'max_prepared_transactions = 10'); -# The default is 1MB, which is not enough with clock-sweep partitioning. -# Increase to 32MB, so that we don't get "no unpinned buffers". -$node_primary->append_conf('postgresql.conf', - 'shared_buffers = 32MB'); - # Enable pg_stat_statements to force tests to do query jumbling. # pg_stat_statements.max should be large enough to hold all the entries # of the regression database. -- 2.51.1