From c4d51ab87b92f9900e37d42cf74980e87b648a56 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Sun, 8 Jun 2025 18:53:12 +0200
Subject: [PATCH v2 5/7] NUMA: clocksweep partitioning

---
 src/backend/storage/buffer/bufmgr.c   | 473 ++++++++++++++------------
 src/backend/storage/buffer/freelist.c | 202 ++++++++---
 src/include/storage/buf_internals.h   |   4 +-
 3 files changed, 424 insertions(+), 255 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5922689fe5d..3d6c834d77c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3587,6 +3587,23 @@ BufferSync(int flags)
 	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
 }
 
+/*
+ * Information saved between calls so we can determine the strategy
+ * point's advance rate and avoid scanning already-cleaned buffers.
+ *
+ * XXX One value per partition. We don't know how many partitions there
+ * are, so allocate 32 - that should be enough for this PoC patch.
+ *
+ * XXX It might be better to have a per-partition struct with all the info.
+ */
+#define MAX_CLOCKSWEEP_PARTITIONS 32
+static bool saved_info_valid = false;
+static int	prev_strategy_buf_id[MAX_CLOCKSWEEP_PARTITIONS];
+static uint32 prev_strategy_passes[MAX_CLOCKSWEEP_PARTITIONS];
+static int	next_to_clean[MAX_CLOCKSWEEP_PARTITIONS];
+static uint32 next_passes[MAX_CLOCKSWEEP_PARTITIONS];
+
+
 /*
  * BgBufferSync -- Write out some dirty buffers in the pool.
 *
@@ -3602,55 +3619,24 @@ bool
 BgBufferSync(WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
-	int			strategy_buf_id;
-	uint32		strategy_passes;
 	uint32		recent_alloc;
+	uint32		recent_alloc_partition;
+	int			num_partitions;
 
-	/*
-	 * Information saved between calls so we can determine the strategy
-	 * point's advance rate and avoid scanning already-cleaned buffers.
-	 */
-	static bool saved_info_valid = false;
-	static int	prev_strategy_buf_id;
-	static uint32 prev_strategy_passes;
-	static int	next_to_clean;
-	static uint32 next_passes;
-
-	/* Moving averages of allocation rate and clean-buffer density */
-	static float smoothed_alloc = 0;
-	static float smoothed_density = 10.0;
-
-	/* Potentially these could be tunables, but for now, not */
-	float		smoothing_samples = 16;
-	float		scan_whole_pool_milliseconds = 120000.0;
-
-	/* Used to compute how far we scan ahead */
-	long		strategy_delta;
-	int			bufs_to_lap;
-	int			bufs_ahead;
-	float		scans_per_alloc;
-	int			reusable_buffers_est;
-	int			upcoming_alloc_est;
-	int			min_scan_buffers;
-
-	/* Variables for the scanning loop proper */
-	int			num_to_scan;
-	int			num_written;
-	int			reusable_buffers;
+	/* assume we can hibernate; any partition can set this to false */
+	bool		hibernate = true;
 
-	/* Variables for final smoothed_density update */
-	long		new_strategy_delta;
-	uint32		new_recent_alloc;
+	/* get the number of clocksweep partitions, and the total alloc count */
+	StrategySyncPrepare(&num_partitions, &recent_alloc);
 
-	/*
-	 * Find out where the freelist clock sweep currently is, and how many
-	 * buffer allocations have happened since our last call.
-	 */
-	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+	Assert(num_partitions <= MAX_CLOCKSWEEP_PARTITIONS);
 
 	/* Report buffer alloc counts to pgstat */
 	PendingBgWriterStats.buf_alloc += recent_alloc;
 
+	/* average number of buffers allocated per partition */
+	recent_alloc_partition = (recent_alloc / num_partitions);
+
 	/*
 	 * If we're not running the LRU scan, just stop after doing the stats
 	 * stuff.  We mark the saved state invalid so that we can recover sanely
@@ -3663,223 +3649,282 @@ BgBufferSync(WritebackContext *wb_context)
 	}
 
 	/*
-	 * Compute strategy_delta = how many buffers have been scanned by the
-	 * clock sweep since last time. If first time through, assume none. Then
-	 * see if we are still ahead of the clock sweep, and if so, how many
-	 * buffers we could scan before we'd catch up with it and "lap" it. Note:
-	 * weird-looking coding of xxx_passes comparisons are to avoid bogus
-	 * behavior when the passes counts wrap around.
-	 */
-	if (saved_info_valid)
-	{
-		int32		passes_delta = strategy_passes - prev_strategy_passes;
-
-		strategy_delta = strategy_buf_id - prev_strategy_buf_id;
-		strategy_delta += (long) passes_delta * NBuffers;
+	 * Now process the clocksweep partitions one by one, applying the same
+	 * cleanup logic we previously applied to the buffer pool as a whole.
+	 *
+	 * XXX Maybe we should randomize the order of the partitions a bit, so
+	 * that we don't always start from partition 0? Perhaps not a complete
+	 * shuffle, but at least pick a random starting point?
+	 */
+	for (int partition = 0; partition < num_partitions; partition++)
+	{
+		/* info obtained from freelist.c */
+		int			strategy_buf_id;
+		uint32		strategy_passes;
+
+		/*
+		 * Moving averages of allocation rate and clean-buffer density.
+		 *
+		 * XXX These statics are shared by all partitions; per the XXX above
+		 * MAX_CLOCKSWEEP_PARTITIONS, a per-partition struct would let us
+		 * track them separately.
+		 */
+		static float smoothed_alloc = 0;
+		static float smoothed_density = 10.0;
+
+		/* Potentially these could be tunables, but for now, not */
+		float		smoothing_samples = 16;
+		float		scan_whole_pool_milliseconds = 120000.0;
+
+		/* Used to compute how far we scan ahead */
+		long		strategy_delta;
+		int			bufs_to_lap;
+		int			bufs_ahead;
+		float		scans_per_alloc;
+		int			reusable_buffers_est;
+		int			upcoming_alloc_est;
+		int			min_scan_buffers;
+
+		/* Variables for the scanning loop proper */
+		int			num_to_scan;
+		int			num_written;
+		int			reusable_buffers;
+
+		/* Variables for final smoothed_density update */
+		long		new_strategy_delta;
+		uint32		new_recent_alloc;
+
+		/* buffer range for the clocksweep partition */
+		int			first_buffer;
+		int			num_buffers;
 
-		Assert(strategy_delta >= 0);
+		/*
+		 * Find out where the clock sweep for this partition currently is,
+		 * and which range of buffers the partition covers.
+		 */
+		strategy_buf_id = StrategySyncStart(partition, &strategy_passes,
+											&first_buffer, &num_buffers);
 
-		if ((int32) (next_passes - strategy_passes) > 0)
+		/*
+		 * Compute strategy_delta = how many buffers have been scanned by
+		 * the clock sweep since last time. If first time through, assume
+		 * none. Then see if we are still ahead of the clock sweep, and if
+		 * so, how many buffers we could scan before we'd catch up with it
+		 * and "lap" it. Note: the weird-looking coding of the xxx_passes
+		 * comparisons is to avoid bogus behavior when the passes counts
+		 * wrap around.
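+		 *
+		 * XXX Worked example (illustrative, with made-up numbers): with
+		 * num_buffers = 1024, a previous position of pass 3 / buffer 900
+		 * and a current position of pass 4 / buffer 100, we get
+		 *
+		 *     passes_delta   = 4 - 3 = 1
+		 *     strategy_delta = (100 - 900) + 1 * 1024 = 224
+		 *
+		 * i.e. 224 buffers scanned since the last call, even though the
+		 * raw buffer id moved "backwards". Computing passes_delta in int32
+		 * keeps this sane even when the uint32 pass counter wraps around.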
+		 */
+		if (saved_info_valid)
 		{
-			/* we're one pass ahead of the strategy point */
-			bufs_to_lap = strategy_buf_id - next_to_clean;
+			int32		passes_delta = strategy_passes - prev_strategy_passes[partition];
+
+			strategy_delta = strategy_buf_id - prev_strategy_buf_id[partition];
+			strategy_delta += (long) passes_delta * num_buffers;
+
+			Assert(strategy_delta >= 0);
+
+			if ((int32) (next_passes[partition] - strategy_passes) > 0)
+			{
+				/* we're one pass ahead of the strategy point */
+				bufs_to_lap = strategy_buf_id - next_to_clean[partition];
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta, bufs_to_lap);
+				elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta, bufs_to_lap);
 #endif
-		}
-		else if (next_passes == strategy_passes &&
-				 next_to_clean >= strategy_buf_id)
-		{
-			/* on same pass, but ahead or at least not behind */
-			bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+			}
+			else if (next_passes[partition] == strategy_passes &&
+					 next_to_clean[partition] >= strategy_buf_id)
+			{
+				/* on same pass, but ahead or at least not behind */
+				bufs_to_lap = num_buffers - (next_to_clean[partition] - strategy_buf_id);
+#ifdef BGW_DEBUG
+				elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta, bufs_to_lap);
+#endif
+			}
+			else
+			{
+				/*
+				 * We're behind, so skip forward to the strategy point and
+				 * start cleaning from there.
+				 */
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta, bufs_to_lap);
+				elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta);
 #endif
+				next_to_clean[partition] = strategy_buf_id;
+				next_passes[partition] = strategy_passes;
+				bufs_to_lap = num_buffers;
+			}
 		}
 		else
 		{
 			/*
-			 * We're behind, so skip forward to the strategy point and start
-			 * cleaning from there.
+			 * Initializing at startup or after LRU scanning had been off.
+			 * Always start at the strategy point.
 			 */
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta);
+			elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+				 strategy_passes, strategy_buf_id);
 #endif
-			next_to_clean = strategy_buf_id;
-			next_passes = strategy_passes;
-			bufs_to_lap = NBuffers;
+			strategy_delta = 0;
+			next_to_clean[partition] = strategy_buf_id;
+			next_passes[partition] = strategy_passes;
+			bufs_to_lap = num_buffers;
 		}
-	}
-	else
-	{
-		/*
-		 * Initializing at startup or after LRU scanning had been off. Always
-		 * start at the strategy point.
-		 */
-#ifdef BGW_DEBUG
-		elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
-			 strategy_passes, strategy_buf_id);
-#endif
-		strategy_delta = 0;
-		next_to_clean = strategy_buf_id;
-		next_passes = strategy_passes;
-		bufs_to_lap = NBuffers;
-	}
 
-	/* Update saved info for next time */
-	prev_strategy_buf_id = strategy_buf_id;
-	prev_strategy_passes = strategy_passes;
-	saved_info_valid = true;
+		/* Update saved info for next time */
+		prev_strategy_buf_id[partition] = strategy_buf_id;
+		prev_strategy_passes[partition] = strategy_passes;
+
+		/*
+		 * NB: saved_info_valid is not updated here - it only becomes true
+		 * once all partitions have been processed, at the end of this loop.
+		 */
 
-	/*
-	 * Compute how many buffers had to be scanned for each new allocation, ie,
-	 * 1/density of reusable buffers, and track a moving average of that.
-	 *
-	 * If the strategy point didn't move, we don't update the density estimate
-	 */
-	if (strategy_delta > 0 && recent_alloc > 0)
-	{
-		scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
-		smoothed_density += (scans_per_alloc - smoothed_density) /
-			smoothing_samples;
-	}
+		/*
+		 * Compute how many buffers had to be scanned for each new
+		 * allocation, ie, 1/density of reusable buffers, and track a moving
+		 * average of that.
+		 *
+		 * If the strategy point didn't move, we don't update the density
+		 * estimate.
+		 */
+		if (strategy_delta > 0 && recent_alloc_partition > 0)
+		{
+			scans_per_alloc = (float) strategy_delta / (float) recent_alloc_partition;
+			smoothed_density += (scans_per_alloc - smoothed_density) /
+				smoothing_samples;
+		}
 
-	/*
-	 * Estimate how many reusable buffers there are between the current
-	 * strategy point and where we've scanned ahead to, based on the smoothed
-	 * density estimate.
-	 */
-	bufs_ahead = NBuffers - bufs_to_lap;
-	reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+		/*
+		 * Estimate how many reusable buffers there are between the current
+		 * strategy point and where we've scanned ahead to, based on the
+		 * smoothed density estimate.
+		 */
+		bufs_ahead = num_buffers - bufs_to_lap;
+		reusable_buffers_est = (float) bufs_ahead / smoothed_density;
 
-	/*
-	 * Track a moving average of recent buffer allocations. Here, rather than
-	 * a true average we want a fast-attack, slow-decline behavior: we
-	 * immediately follow any increase.
-	 */
-	if (smoothed_alloc <= (float) recent_alloc)
-		smoothed_alloc = recent_alloc;
-	else
-		smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
-			smoothing_samples;
+		/*
+		 * Track a moving average of recent buffer allocations. Here, rather
+		 * than a true average we want a fast-attack, slow-decline behavior:
+		 * we immediately follow any increase.
+		 */
+		if (smoothed_alloc <= (float) recent_alloc_partition)
+			smoothed_alloc = recent_alloc_partition;
+		else
+			smoothed_alloc += ((float) recent_alloc_partition - smoothed_alloc) /
+				smoothing_samples;
 
-	/* Scale the estimate by a GUC to allow more aggressive tuning. */
-	upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
+		/* Scale the estimate by a GUC to allow more aggressive tuning. */
+		upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
 
-	/*
-	 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
-	 * eventually underflow to zero, and the underflows produce annoying
-	 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
-	 * zero, there's no point in tracking smaller and smaller values of
-	 * smoothed_alloc, so just reset it to exactly zero to avoid this
-	 * syndrome. It will pop back up as soon as recent_alloc increases.
-	 */
-	if (upcoming_alloc_est == 0)
-		smoothed_alloc = 0;
+		/*
+		 * If recent_alloc remains at zero for many cycles, smoothed_alloc
+		 * will eventually underflow to zero, and the underflows produce
+		 * annoying kernel warnings on some platforms. Once
+		 * upcoming_alloc_est has gone to zero, there's no point in tracking
+		 * smaller and smaller values of smoothed_alloc, so just reset it to
+		 * exactly zero to avoid this syndrome. It will pop back up as soon
+		 * as recent_alloc increases.
+		 */
+		if (upcoming_alloc_est == 0)
+			smoothed_alloc = 0;
 
-	/*
-	 * Even in cases where there's been little or no buffer allocation
-	 * activity, we want to make a small amount of progress through the buffer
-	 * cache so that as many reusable buffers as possible are clean after an
-	 * idle period.
-	 *
-	 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
-	 * the BGW will be called during the scan_whole_pool time; slice the
-	 * buffer pool into that many sections.
-	 */
-	min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+		/*
+		 * Even in cases where there's been little or no buffer allocation
+		 * activity, we want to make a small amount of progress through the
+		 * buffer cache so that as many reusable buffers as possible are
+		 * clean after an idle period.
+		 *
+		 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many
+		 * times the BGW will be called during the scan_whole_pool time;
+		 * slice the buffer pool into that many sections.
+		 */
+		min_scan_buffers = (int) (num_buffers / (scan_whole_pool_milliseconds / BgWriterDelay));
 
-	if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
-	{
+		if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
+		{
 #ifdef BGW_DEBUG
-		elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
-			 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+			elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+				 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
 #endif
-		upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
-	}
-
-	/*
-	 * Now write out dirty reusable buffers, working forward from the
-	 * next_to_clean point, until we have lapped the strategy scan, or cleaned
-	 * enough buffers to match our estimate of the next cycle's allocation
-	 * requirements, or hit the bgwriter_lru_maxpages limit.
-	 */
+			upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+		}
 
-	num_to_scan = bufs_to_lap;
-	num_written = 0;
-	reusable_buffers = reusable_buffers_est;
+		/*
+		 * Now write out dirty reusable buffers, working forward from the
+		 * next_to_clean point, until we have lapped the strategy scan, or
+		 * cleaned enough buffers to match our estimate of the next cycle's
+		 * allocation requirements, or hit the bgwriter_lru_maxpages limit.
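+		 *
+		 * XXX Illustrative note (made-up numbers, not from the original
+		 * patch): both limits are per-partition here. The scan wraps within
+		 * [first_buffer, first_buffer + num_buffers), and the write limit
+		 * is bgwriter_lru_maxpages / num_partitions - e.g. with
+		 * bgwriter_lru_maxpages = 100 and 4 partitions, each partition may
+		 * write at most 25 pages per cycle. Note the integer division
+		 * yields 0 when there are more partitions than maxpages, which
+		 * probably wants a Max(1, ...) clamp eventually.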
+ */ - /* Execute the LRU scan */ - while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) - { - int sync_state = SyncOneBuffer(next_to_clean, true, - wb_context); + num_to_scan = bufs_to_lap; + num_written = 0; + reusable_buffers = reusable_buffers_est; - if (++next_to_clean >= NBuffers) + /* Execute the LRU scan */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { - next_to_clean = 0; - next_passes++; - } - num_to_scan--; + int sync_state = SyncOneBuffer(next_to_clean[partition], true, + wb_context); - if (sync_state & BUF_WRITTEN) - { - reusable_buffers++; - if (++num_written >= bgwriter_lru_maxpages) + if (++next_to_clean[partition] >= (first_buffer + num_buffers)) { - PendingBgWriterStats.maxwritten_clean++; - break; + next_to_clean[partition] = first_buffer; + next_passes[partition]++; + } + num_to_scan--; + + if (sync_state & BUF_WRITTEN) + { + reusable_buffers++; + if (++num_written >= (bgwriter_lru_maxpages / num_partitions)) + { + PendingBgWriterStats.maxwritten_clean++; + break; + } } + else if (sync_state & BUF_REUSABLE) + reusable_buffers++; } - else if (sync_state & BUF_REUSABLE) - reusable_buffers++; - } - PendingBgWriterStats.buf_written_clean += num_written; + PendingBgWriterStats.buf_written_clean += num_written; #ifdef BGW_DEBUG - elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", - recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead, - smoothed_density, reusable_buffers_est, upcoming_alloc_est, - bufs_to_lap - num_to_scan, - num_written, - reusable_buffers - reusable_buffers_est); + elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", + recent_alloc_partition, smoothed_alloc, strategy_delta, bufs_ahead, + smoothed_density, reusable_buffers_est, upcoming_alloc_est, + bufs_to_lap - num_to_scan, + num_written, + reusable_buffers - reusable_buffers_est); #endif - /* - * Consider the above scan as being like a new allocation scan. - * Characterize its density and update the smoothed one based on it. This - * effectively halves the moving average period in cases where both the - * strategy and the background writer are doing some useful scanning, - * which is helpful because a long memory isn't as desirable on the - * density estimates. - */ - new_strategy_delta = bufs_to_lap - num_to_scan; - new_recent_alloc = reusable_buffers - reusable_buffers_est; - if (new_strategy_delta > 0 && new_recent_alloc > 0) - { - scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; - smoothed_density += (scans_per_alloc - smoothed_density) / - smoothing_samples; + /* + * Consider the above scan as being like a new allocation scan. + * Characterize its density and update the smoothed one based on it. This + * effectively halves the moving average period in cases where both the + * strategy and the background writer are doing some useful scanning, + * which is helpful because a long memory isn't as desirable on the + * density estimates. 
+ */ + new_strategy_delta = bufs_to_lap - num_to_scan; + new_recent_alloc = reusable_buffers - reusable_buffers_est; + if (new_strategy_delta > 0 && new_recent_alloc > 0) + { + scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; + smoothed_density += (scans_per_alloc - smoothed_density) / + smoothing_samples; #ifdef BGW_DEBUG - elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", - new_recent_alloc, new_strategy_delta, - scans_per_alloc, smoothed_density); + elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", + new_recent_alloc, new_strategy_delta, + scans_per_alloc, smoothed_density); #endif + } + + /* hibernate if all partitions can hibernate */ + hibernate &= (bufs_to_lap == 0 && recent_alloc_partition == 0); } + /* now that we've scanned all partitions, mark the cached info as valid */ + saved_info_valid = true; + /* Return true if OK to hibernate */ - return (bufs_to_lap == 0 && recent_alloc == 0); + return hibernate; } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index e38e5c7ec3d..1827e052da7 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -63,17 +63,27 @@ typedef struct BufferStrategyFreelist #define MIN_FREELIST_PARTITIONS 4 /* - * The shared freelist control information. + * Information about one partition of the ClockSweep (on a subset of buffers). + * + * XXX Should be careful to align this to cachelines, etc. */ typedef struct { /* Spinlock: protects the values below */ - slock_t buffer_strategy_lock; + slock_t clock_sweep_lock; + + /* range for this clock weep partition */ + int32 firstBuffer; + int32 numBuffers; /* * Clock sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. + * + * XXX This is relative to firstBuffer, so needs to be offset properly. + * + * XXX firstBuffer + (nextVictimBuffer % numBuffers) */ pg_atomic_uint32 nextVictimBuffer; @@ -83,6 +93,15 @@ typedef struct */ uint32 completePasses; /* Complete cycles of the clock sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ +} ClockSweep; + +/* + * The shared freelist control information. + */ +typedef struct +{ + /* Spinlock: protects the values below */ + slock_t buffer_strategy_lock; /* * Bgworker process to be notified upon activity or -1 if none. See @@ -99,6 +118,9 @@ typedef struct int num_partitions_groups; /* effectively num of NUMA nodes */ int num_partitions_per_group; + /* clocksweep partitions */ + ClockSweep *sweeps; + BufferStrategyFreelist freelists[FLEXIBLE_ARRAY_MEMBER]; } BufferStrategyControl; @@ -152,6 +174,7 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state); static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); +static ClockSweep *ChooseClockSweep(void); /* * ClockSweepTick - Helper routine for StrategyGetBuffer() @@ -163,6 +186,7 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; + ClockSweep *sweep = ChooseClockSweep(); /* * Atomically move hand ahead one buffer - if there's several processes @@ -170,14 +194,14 @@ ClockSweepTick(void) * apparent order. 
 	 */
 	victim =
-		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+		pg_atomic_fetch_add_u32(&sweep->nextVictimBuffer, 1);
 
-	if (victim >= NBuffers)
+	if (victim >= sweep->numBuffers)
 	{
 		uint32		originalVictim = victim;
 
 		/* always wrap what we look up in BufferDescriptors */
-		victim = victim % NBuffers;
+		victim = victim % sweep->numBuffers;
 
 		/*
 		 * If we're the one that just caused a wraparound, force
@@ -203,19 +227,23 @@ ClockSweepTick(void)
 			 * could lead to an overflow of nextVictimBuffers, but that's
 			 * highly unlikely and wouldn't be particularly harmful.
 			 */
-			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			SpinLockAcquire(&sweep->clock_sweep_lock);
 
-			wrapped = expected % NBuffers;
+			wrapped = expected % sweep->numBuffers;
 
-			success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+			success = pg_atomic_compare_exchange_u32(&sweep->nextVictimBuffer,
 													 &expected, wrapped);
 			if (success)
-				StrategyControl->completePasses++;
-			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+				sweep->completePasses++;
+			SpinLockRelease(&sweep->clock_sweep_lock);
 		}
 	}
-	return victim;
+
+	/* XXX buffer IDs are 1-based, but we're calculating 0-based indexes */
+	Assert(BufferIsValid(1 + sweep->firstBuffer + (victim % sweep->numBuffers)));
+
+	return sweep->firstBuffer + victim;
 }
 
 /*
@@ -289,6 +317,28 @@ calculate_partition_index()
 	return index;
 }
 
+/*
+ * ChooseClockSweep
+ *		pick a clocksweep partition based on the NUMA node and CPU
+ *
+ * The number of clocksweep partitions may not match the number of NUMA
+ * nodes, but it should not be lower. Each partition should be mapped to
+ * a single NUMA node, but a node may have multiple partitions. If there
+ * are multiple partitions per node (all nodes have the same number of
+ * partitions), we pick the partition using the CPU.
+ *
+ * XXX Maybe we should make both the total and "per group" counts powers
+ * of two? That'd allow using shifts instead of divisions, which is
+ * cheaper. But how would that deal with an odd number of nodes?
+ */
+static ClockSweep *
+ChooseClockSweep(void)
+{
+	int			index = calculate_partition_index();
+
+	return &StrategyControl->sweeps[index];
+}
+
 /*
  * ChooseFreeList
 *		Pick the buffer freelist to use, depending on the CPU and NUMA node.
@@ -404,7 +454,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	 * the rate of buffer consumption. Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
 	 */
-	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+	pg_atomic_fetch_add_u32(&ChooseClockSweep()->numBufferAllocs, 1);
 
 	/*
 	 * First check, without acquiring the lock, whether there's buffers in the
@@ -475,13 +525,17 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	/*
 	 * Nothing on the freelist, so run the "clock sweep" algorithm
 	 *
-	 * XXX Should we also make this NUMA-aware, to only access buffers from
-	 * the same NUMA node? That'd probably mean we need to make the clock
-	 * sweep NUMA-aware, perhaps by having multiple clock sweeps, each for a
-	 * subset of buffers. But that also means each process could "sweep" only
-	 * a fraction of buffers, even if the other buffers are better candidates
-	 * for eviction. Would that also mean we'd have multiple bgwriters, one
-	 * for each node, or would one bgwriter handle all of that?
+	 * XXX Note that ClockSweepTick() is NUMA-aware: it only looks at
+	 * buffers from a single partition, mapped to the current NUMA node,
+	 * so eviction only touches buffers local to that node.
+	 *
+	 * XXX That also means each process "sweeps" only a fraction of the
+	 * buffers, even if other buffers are better candidates for eviction.
+	 * Maybe there should be some logic to "steal" buffers from other
+	 * freelists or other nodes?
+	 *
+	 * XXX Would that also mean we'd have multiple bgwriters, one for each
+	 * node, or would one bgwriter handle all of that?
 	 */
 	trycounter = NBuffers;
 	for (;;)
@@ -563,6 +617,41 @@ StrategyFreeBuffer(BufferDesc *buf)
 	SpinLockRelease(&freelist->freelist_lock);
 }
 
+/*
+ * StrategySyncPrepare -- prepare for sync of all partitions
+ *
+ * Determine the number of clocksweep partitions, and the count of recent
+ * buffer allocations (summed across all the partitions). This allows
+ * BgBufferSync to calculate the average number of allocations per partition
+ * for the next sync cycle. The per-partition alloc counts are reset as they
+ * are read while walking the partitions.
+ */
+void
+StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc)
+{
+	if (num_buf_alloc)
+		*num_buf_alloc = 0;
+	*num_parts = StrategyControl->num_partitions;
+
+	/*
+	 * We lock the partitions one by one, so the counts are not read exactly
+	 * in sync, but that should be fine. This is only used for heuristics
+	 * anyway.
+	 */
+	for (int i = 0; i < StrategyControl->num_partitions; i++)
+	{
+		ClockSweep *sweep = &StrategyControl->sweeps[i];
+
+		SpinLockAcquire(&sweep->clock_sweep_lock);
+		if (num_buf_alloc)
+		{
+			*num_buf_alloc += pg_atomic_exchange_u32(&sweep->numBufferAllocs, 0);
+		}
+		SpinLockRelease(&sweep->clock_sweep_lock);
+	}
+}
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
 *
@@ -570,37 +659,44 @@ StrategyFreeBuffer(BufferDesc *buf)
 * BgBufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
- * the higher-order bits of nextVictimBuffer) and the count of recent buffer
- * allocs if non-NULL pointers are passed. The alloc count is reset after
- * being read.
+ * the higher-order bits of nextVictimBuffer).
+ *
+ * This only considers a single clocksweep partition, as BgBufferSync looks
+ * at them one by one.
 */
 int
-StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+StrategySyncStart(int partition, uint32 *complete_passes,
+				  int *first_buffer, int *num_buffers)
 {
 	uint32		nextVictimBuffer;
 	int			result;
+	ClockSweep *sweep = &StrategyControl->sweeps[partition];
 
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
-	result = nextVictimBuffer % NBuffers;
+	Assert((partition >= 0) && (partition < StrategyControl->num_partitions));
+
+	SpinLockAcquire(&sweep->clock_sweep_lock);
+	nextVictimBuffer = pg_atomic_read_u32(&sweep->nextVictimBuffer);
+	result = nextVictimBuffer % sweep->numBuffers;
+
+	*first_buffer = sweep->firstBuffer;
+	*num_buffers = sweep->numBuffers;
 
 	if (complete_passes)
 	{
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = sweep->completePasses;
 
 		/*
 		 * Additionally add the number of wraparounds that happened before
 		 * completePasses could be incremented. C.f. ClockSweepTick().
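+		 *
+		 * XXX Illustrative example (made-up values): if sweep->numBuffers
+		 * is 1024 and nextVictimBuffer reads as 2100 because two backends
+		 * wrapped the counter but have not yet folded the wraparounds into
+		 * completePasses, this accounts for the 2100 / 1024 = 2 missing
+		 * passes.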
 		 */
-		*complete_passes += nextVictimBuffer / NBuffers;
+		*complete_passes += nextVictimBuffer / sweep->numBuffers;
 	}
+	SpinLockRelease(&sweep->clock_sweep_lock);
 
-	if (num_buf_alloc)
-	{
-		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
-	}
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-	return result;
+
+	/* XXX buffer IDs start at 1, but we're calculating 0-based indexes */
+	Assert(BufferIsValid(1 + sweep->firstBuffer + result));
+
+	return sweep->firstBuffer + result;
 }
 
 /*
@@ -696,6 +792,10 @@ StrategyShmemSize(void)
 	size = add_size(size, MAXALIGN(mul_size(sizeof(BufferStrategyFreelist),
 											num_partitions)));
 
+	/* size of clocksweep partitions (at least one per NUMA node) */
+	size = add_size(size, MAXALIGN(mul_size(sizeof(ClockSweep),
+											num_partitions)));
+
 	return size;
 }
 
@@ -714,6 +814,7 @@ StrategyInitialize(bool init)
 	int			num_partitions;
 	int			num_partitions_per_group;
+	char	   *ptr;
 
 	/* */
 	num_partitions = calculate_partition_count(strategy_nnodes);
 
@@ -736,7 +837,8 @@ StrategyInitialize(bool init)
 	StrategyControl = (BufferStrategyControl *)
 		ShmemInitStruct("Buffer Strategy Status",
 						MAXALIGN(offsetof(BufferStrategyControl, freelists)) +
-						MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions),
+						MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions) +
+						MAXALIGN(sizeof(ClockSweep) * num_partitions),
						&found);
 
 	if (!found)
@@ -758,12 +860,32 @@ StrategyInitialize(bool init)
 
 		SpinLockInit(&StrategyControl->buffer_strategy_lock);
 
-		/* Initialize the clock sweep pointer */
-		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
+		/* point the sweeps array right past the end of the freelists */
+		ptr = (char *) StrategyControl +
+			MAXALIGN(offsetof(BufferStrategyControl, freelists)) +
+			MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions);
+		StrategyControl->sweeps = (ClockSweep *) ptr;
 
-		/* Clear statistics */
-		StrategyControl->completePasses = 0;
-		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+		/* Initialize the clock sweep pointers (for all partitions) */
+		for (int i = 0; i < num_partitions; i++)
+		{
+			SpinLockInit(&StrategyControl->sweeps[i].clock_sweep_lock);
+
+			pg_atomic_init_u32(&StrategyControl->sweeps[i].nextVictimBuffer, 0);
+
+			/*
+			 * FIXME This may not be quite right: if NBuffers is not a
+			 * perfect multiple of numBuffers, the last partition will have
+			 * numBuffers set too high. buf_init handles this by tracking
+			 * the remaining number of buffers, and not overflowing.
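+			 *
+			 * XXX Example of the issue (made-up numbers, and assuming
+			 * numBuffers is computed by rounding NBuffers / num_partitions
+			 * up): NBuffers = 1000 split into 3 partitions gives numBuffers
+			 * = 334, so the last partition would claim indexes 668..1001
+			 * even though valid 0-based indexes end at 999.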
+ */ + StrategyControl->sweeps[i].numBuffers = numBuffers; + StrategyControl->sweeps[i].firstBuffer = (numBuffers * i); + + /* Clear statistics */ + StrategyControl->sweeps[i].completePasses = 0; + pg_atomic_init_u32(&StrategyControl->sweeps[i].numBufferAllocs, 0); + } /* No pending notification */ StrategyControl->bgwprocno = -1; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f7..b50f9458156 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -448,7 +448,9 @@ extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); -extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc); +extern void StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc); +extern int StrategySyncStart(int partition, uint32 *complete_passes, + int *first_buffer, int *num_buffers); extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); -- 2.49.0