From 34c5477568b4a31863aeba57c4c0cb700e274f4e Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat
Date: Thu, 19 Jun 2025 17:38:51 +0200
Subject: [PATCH 15/19] Reinitialize StrategyControl after resizing buffers

... and BgBufferSync and ClockSweepTick adjustments

Reinitializing strategy control area
====================================

The commit introduces a separate function StrategyReInitialize() instead of
reusing StrategyInitialize(), since some of the work done by
StrategyInitialize() is not required when re-initializing. Here's a list of
what StrategyReInitialize() does and how it differs from
StrategyInitialize():

1. The StrategyControl pointer needn't be fetched again, since it should not
   change; an Assert is added to make sure the pointer is valid.

2. StrategyControl->buffer_strategy_lock need not be initialized again.

3. nextVictimBuffer, completePasses and numBufferAllocs are interpreted in
   the context of NBuffers. Now that NBuffers itself has changed, their old
   values no longer make sense. Reset them as if the server had just
   restarted.

Ability to delay resizing operation
===================================

This commit introduces a flag, delay_shmem_resize, which PostgreSQL backends
and workers can use to signal the coordinator to delay a resizing operation
(an illustrative usage sketch follows the patch). The background writer sets
this flag while it is scanning buffers.

Background writer operation
===========================

The background writer is blocked while the actual resizing is in progress.
It stops a scan in progress when it sees that the resizing has begun or is
about to begin. Once the buffer resizing is finished, and before resuming
regular operation, the background writer resets the information saved so
far. This information is interpreted in the context of NBuffers and hence
does not make sense after a resize, which changes NBuffers.

Buffer lookup table
===================

Right now there is no way to free shared memory. Even if we shrank the
buffer lookup table when shrinking the buffer pool, the unused hash table
entries could not be freed. When we expand the buffer pool, more entries can
be allocated, but we cannot resize the hash table directory without
rehashing all the entries, and just allocating more entries will lead to
more contention. Hence we set up the buffer lookup table only once, at the
beginning, sized for the maximum possible size of the buffer pool, which is
MaxAvailableMemory. Since the shared buffer lookup table and StrategyControl
are not resized even when the buffer pool is resized, they are allocated in
the main shared memory segment.

TODO:
=====

1. The way BgBufferSync is written today, it packs four functionalities:
   setting up the buffer sync state, performing the buffer sync, resetting
   the buffer sync state when bgwriter_lru_maxpages <= 0, and setting it up
   again when bgwriter_lru_maxpages > 0. That makes the code hard to read.
   It would be good to divide this function into three or four functions,
   each performing one of those tasks, and to pack all the state (the local
   variables from that function converted to static globals) into a
   structure which is passed to these functions (a rough sketch of this
   shape follows the patch). Once that happens, BgBufferSyncReset() will
   call one of those functions to reset the state when the buffer pool is
   resized.

2. The condition (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) ==
   NBuffers) checked in BgBufferSync() to determine whether buffer resizing
   is "about to begin" is wrong. NBuffers is not changed until
   AnonymousShmemResize() is called, and that won't be called before
   BgBufferSync() finishes if the latter has already begun.
   Need a better condition to check whether buffer resizing is about to
   begin.

Author: Ashutosh Bapat
Reviewed-by: Tomas Vondra
---
 src/backend/port/sysv_shmem.c          | 23 ++++--
 src/backend/storage/buffer/buf_init.c  | 19 +++++--
 src/backend/storage/buffer/buf_table.c |  9 ++-
 src/backend/storage/buffer/bufmgr.c    | 72 ++++++++++++++++------
 src/backend/storage/buffer/freelist.c  | 77 ++++++++++++++++++++++++--
 src/include/storage/buf_internals.h    |  1 +
 src/include/storage/bufmgr.h           |  1 +
 src/include/storage/ipc.h              |  1 +
 src/include/storage/pg_shmem.h         |  5 +-
 9 files changed, 170 insertions(+), 38 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 9e1b2c3201f..3be28e228ae 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -104,6 +104,7 @@ AnonymousMapping Mappings[ANON_MAPPINGS];
 
 /* Flag telling postmaster that resize is needed */
 volatile bool pending_pm_shmem_resize = false;
+volatile bool delay_shmem_resize = false;
 
 /* Keeps track of the previous NBuffers value */
 static int	NBuffersOld = -1;
@@ -144,12 +145,11 @@ static int NBuffersPending = -1;
  * makes sense to evaluate them more precise.
  */
 static double SHMEM_RESIZE_RATIO[6] = {
-	0.1,						/* MAIN_SHMEM_SEGMENT */
+	0.15,						/* MAIN_SHMEM_SEGMENT */
 	0.6,						/* BUFFERS_SHMEM_SEGMENT */
 	0.1,						/* BUFFER_DESCRIPTORS_SHMEM_SEGMENT */
 	0.1,						/* BUFFER_IOCV_SHMEM_SEGMENT */
 	0.05,						/* CHECKPOINT_BUFFERS_SHMEM_SEGMENT */
-	0.05,						/* STRATEGY_SHMEM_SEGMENT */
 };
 
 /*
@@ -225,8 +225,6 @@ MappingName(int shmem_segment)
 			return "iocv";
 		case CHECKPOINT_BUFFERS_SHMEM_SEGMENT:
 			return "checkpoint";
-		case STRATEGY_SHMEM_SEGMENT:
-			return "strategy";
 		default:
 			return "unknown";
 	}
@@ -1125,13 +1123,17 @@ ProcessBarrierShmemResize(Barrier *barrier)
 {
 	Assert(IsUnderPostmaster);
 
-	elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d",
-		 NBuffersOld, NBuffersPending, pending_pm_shmem_resize);
+	elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d, %d",
+		 NBuffersOld, NBuffersPending, pending_pm_shmem_resize, delay_shmem_resize);
 
 	/* Wait until we have seen the new NBuffers value */
 	if (!pending_pm_shmem_resize)
 		return false;
 
+	/* Wait until this process is ready for buffers to be resized. */
+	if (delay_shmem_resize)
+		return false;
+
 	/*
 	 * First thing to do after attaching to the barrier is to wait for others.
 	 * We can't simply use BarrierArriveAndWait, because backends might arrive
@@ -1182,6 +1184,15 @@ ProcessBarrierShmemResize(Barrier *barrier)
 	/* The second phase means the resize has finished, SHMEM_RESIZE_DONE */
 	BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE);
 
+	if (MyBackendType == B_BG_WRITER)
+	{
+		/*
+		 * Before resuming regular background writer activity, reset the
+		 * statistics collected so far.
+		 */
+		BgBufferSyncReset(NBuffersOld, NBuffers);
+	}
+
 	BarrierDetach(barrier);
 	return true;
 }
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 0e72e373193..be64fa5a136 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -152,8 +152,15 @@ BufferManagerShmemInit(int FirstBufferToInit)
 	}
 #endif
 
-	/* Init other shared buffer-management stuff */
-	StrategyInitialize(!foundDescs);
+	/*
+	 * Init other shared buffer-management stuff. When configuring the buffer
+	 * pool for the first time, initialize it from scratch. When merely
+	 * resizing the buffer pool, adjust only the required structures.
+	 */
+	if (FirstBufferToInit == 0)
+		StrategyInitialize(!foundDescs);
+	else
+		StrategyReInitialize(FirstBufferToInit);
 
 	/* Initialize per-backend file flush context */
 	WritebackContextInit(&BackendWritebackContext,
@@ -184,9 +191,6 @@ BufferManagerShmemSize(void)
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
 	Mappings[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size;
 
-	/* size of stuff controlled by freelist.c */
-	Mappings[STRATEGY_SHMEM_SEGMENT].shmem_req_size = StrategyShmemSize();
-
 	/* size of I/O condition variables, plus alignment padding */
 	size = add_size(0,
 					mul_size(NBuffers, sizeof(ConditionVariableMinimallyPadded)));
@@ -196,5 +200,10 @@ BufferManagerShmemSize(void)
 	/* size of checkpoint sort array in bufmgr.c */
 	Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size =
 		mul_size(NBuffers, sizeof(CkptSortItem));
 
+	/* Allocations in the main memory segment, at the end. */
+
+	/* size of stuff controlled by freelist.c */
+	size = add_size(0, StrategyShmemSize());
+
 	return size;
 }
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 18a78967138..e5a97e557d9 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -65,11 +65,18 @@ InitBufTable(int size)
 	info.entrysize = sizeof(BufferLookupEnt);
 	info.num_partitions = NUM_BUFFER_PARTITIONS;
 
+	/*
+	 * The shared buffer lookup table is set up only once, with the maximum
+	 * possible number of entries given the maximum possible size of the
+	 * buffer pool. It is not resized after that, even if the buffer pool is
+	 * resized. Hence it is allocated in the main shared memory segment and
+	 * not in a resizable shared memory segment.
+	 */
 	SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table",
 										   size, size,
 										   &info,
 										   HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE,
-										   STRATEGY_SHMEM_SEGMENT);
+										   MAIN_SHMEM_SEGMENT);
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 467d9880f7b..fdcb5556235 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3614,6 +3614,32 @@ BufferSync(int flags)
 	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
 }
 
+/*
+ * Information saved between BgBufferSync() calls so we can determine the
+ * strategy point's advance rate and avoid scanning already-cleaned buffers.
+ * The variables live at file scope instead of being static locals so that
+ * BgBufferSyncReset() can reset them when shared buffers are resized.
+ */
+static bool saved_info_valid = false;
+static int	prev_strategy_buf_id;
+static uint32 prev_strategy_passes;
+static int	next_to_clean;
+static uint32 next_passes;
+
+/* Moving averages of allocation rate and clean-buffer density */
+static float smoothed_alloc = 0;
+static float smoothed_density = 10.0;
+
+/*
+ * Invalidate the state saved by BgBufferSync(); to be called after the
+ * buffer pool has been resized from NBuffersOld to NBuffersNew buffers.
+ */
+void
+BgBufferSyncReset(int NBuffersOld, int NBuffersNew)
+{
+	saved_info_valid = false;
+#ifdef BGW_DEBUG
+	elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d",
+		 NBuffersOld, NBuffersNew);
+#endif
+}
+
 /*
  * BgBufferSync -- Write out some dirty buffers in the pool.
  *
@@ -3633,20 +3659,6 @@ BgBufferSync(WritebackContext *wb_context)
 	uint32		strategy_passes;
 	uint32		recent_alloc;
 
-	/*
-	 * Information saved between calls so we can determine the strategy
-	 * point's advance rate and avoid scanning already-cleaned buffers.
-	 */
-	static bool saved_info_valid = false;
-	static int	prev_strategy_buf_id;
-	static uint32 prev_strategy_passes;
-	static int	next_to_clean;
-	static uint32 next_passes;
-
-	/* Moving averages of allocation rate and clean-buffer density */
-	static float smoothed_alloc = 0;
-	static float smoothed_density = 10.0;
-
 	/* Potentially these could be tunables, but for now, not */
 	float		smoothing_samples = 16;
 	float		scan_whole_pool_milliseconds = 120000.0;
@@ -3669,6 +3681,22 @@ BgBufferSync(WritebackContext *wb_context)
 	long		new_strategy_delta;
 	uint32		new_recent_alloc;
 
+	/*
+	 * If the buffer pool is being shrunk, the buffer being written out may
+	 * not remain valid. If it is being expanded, more buffers will become
+	 * available even without this function writing any out. Hence wait until
+	 * the buffer resizing finishes, i.e. go into hibernation mode.
+	 */
+	if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
+		return true;
+
+	/*
+	 * Resizing shared buffers while this function is performing an LRU scan
+	 * on them may lead to wrong results. Indicate that the resizing should
+	 * wait for the LRU scan to complete.
+	 */
+	delay_shmem_resize = true;
+
 	/*
 	 * Find out where the clock-sweep currently is, and how many buffer
 	 * allocations have happened since our last call.
@@ -3845,8 +3873,17 @@ BgBufferSync(WritebackContext *wb_context)
 	num_written = 0;
 	reusable_buffers = reusable_buffers_est;
 
-	/* Execute the LRU scan */
-	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+	/*
+	 * Execute the LRU scan.
+	 *
+	 * If the buffer pool is being shrunk, the buffer being written may not
+	 * remain valid. If it is being expanded, more buffers will become
+	 * available even without this function writing any. Hence stop the scan
+	 * as soon as a resize starts. This also unblocks other processes that
+	 * are waiting for the buffer resizing to finish.
+	 */
+	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est &&
+		   pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) == NBuffers)
 	{
 		int			sync_state = SyncOneBuffer(next_to_clean, true, wb_context);
 
@@ -3905,6 +3942,9 @@ BgBufferSync(WritebackContext *wb_context)
 #endif
 	}
 
+	/* Let the resizing commence. */
+	delay_shmem_resize = false;
+
 	/* Return true if OK to hibernate */
 	return (bufs_to_lap == 0 && recent_alloc == 0);
 }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 0da8fbb580e..55be5eebe0a 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -408,12 +408,21 @@ StrategyInitialize(bool init)
 	 *
 	 * Since we can't tolerate running out of lookup table entries, we must be
	 * sure to specify an adequate table size here. The maximum steady-state
-	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
-	 * a new entry before deleting the old. In principle this could be
-	 * happening in each partition concurrently, so we could need as many as
-	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
+	 * usage is of course one entry per buffer in the buffer pool. Right now
+	 * there is no way to free shared memory. Even if we shrank the buffer
+	 * lookup table when shrinking the buffer pool, the unused hash table
+	 * entries could not be freed. When we expand the buffer pool, more
+	 * entries can be allocated, but we cannot resize the hash table
+	 * directory without rehashing all the entries, and just allocating more
+	 * entries will lead to more contention.
+	 * Hence we set up the buffer
+	 * lookup table considering the maximum possible size of the buffer
+	 * pool, which is MaxAvailableMemory.
+	 *
+	 * Additionally, BufferAlloc() tries to insert a new entry before
+	 * deleting the old one. In principle this could be happening in each
+	 * partition concurrently, so we need NUM_BUFFER_PARTITIONS extra
+	 * entries.
 	 */
-	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+	InitBufTable(MaxAvailableMemory + NUM_BUFFER_PARTITIONS);
 
 	/*
 	 * Get or create the shared strategy control block
@@ -421,7 +430,7 @@ StrategyInitialize(bool init)
 	StrategyControl = (BufferStrategyControl *)
 		ShmemInitStructInSegment("Buffer Strategy Status",
 								 sizeof(BufferStrategyControl),
-								 &found, STRATEGY_SHMEM_SEGMENT);
+								 &found, MAIN_SHMEM_SEGMENT);
 
 	if (!found)
 	{
@@ -446,6 +455,62 @@ StrategyInitialize(bool init)
 		Assert(!init);
 }
 
+/*
+ * StrategyReInitialize -- re-initialize the buffer cache replacement
+ * strategy.
+ *
+ * To be called when resizing the buffer pool, and only from the coordinator.
+ * TODO: Assess the differences between this function and
+ * StrategyInitialize().
+ */
+void
+StrategyReInitialize(int FirstBufferIdToInit)
+{
+	bool		found;
+
+	/*
+	 * Resizing memory for buffer pools should not affect the address of
+	 * StrategyControl.
+	 */
+	if (StrategyControl != (BufferStrategyControl *)
+		ShmemInitStructInSegment("Buffer Strategy Status",
+								 sizeof(BufferStrategyControl),
+								 &found, MAIN_SHMEM_SEGMENT))
+		elog(FATAL, "something went wrong while re-initializing the buffer strategy");
+
+	Assert(found);
+
+	/*
+	 * TODO: Buffer lookup table adjustment. There are two options:
+	 *
+	 * 1. Resize the buffer lookup table to match the new number of buffers.
+	 * But this requires rehashing all the entries in the buffer lookup
+	 * table with the new table size.
+	 *
+	 * 2. Allocate the maximum size of the buffer lookup table at the
+	 * beginning and never resize it. This leaves a sparse buffer lookup
+	 * table, which is inefficient from both a memory and a time
+	 * perspective. According to David Rowley, the sparse entries in the
+	 * buffer lookup table cause frequent cacheline reloads, which affects
+	 * performance. If the impact of that inefficiency is significant in a
+	 * benchmark, we will need to consider the first option.
+	 */
+
+	/*
+	 * The clock sweep tick pointer might have become invalid. Reset it as
+	 * if the server had just started.
+	 */
+	pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+	/*
+	 * The old statistics are interpreted in the context of the number of
+	 * shared buffers. They no longer make sense now that the number of
+	 * shared buffers itself has changed.
+	 */
+	StrategyControl->completePasses = 0;
+	pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+
+	/* No pending notification */
+	StrategyControl->bgwprocno = -1;
+}
+
 
 /* ----------------------------------------------------------------
  *				Backend-private buffer ring management
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c1206a46aba..20bea8132fd 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -447,6 +447,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
+extern void StrategyReInitialize(int FirstBufferIdToInit);
 
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index e7c973adca8..74e226269af 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -300,6 +300,7 @@ extern bool IsBufferCleanupOK(Buffer buffer);
 extern bool HoldingBufferPinThatDelaysRecovery(void);
 
 extern bool BgBufferSync(WritebackContext *wb_context);
+extern void BgBufferSyncReset(int NBuffersOld, int NBuffersNew);
 
 extern uint32 GetPinLimit(void);
 extern uint32 GetLocalPinLimit(void);
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
index 847f56a36dc..6e7b0abb625 100644
--- a/src/include/storage/ipc.h
+++ b/src/include/storage/ipc.h
@@ -65,6 +65,7 @@ typedef void (*shmem_startup_hook_type) (void);
 extern PGDLLIMPORT bool proc_exit_inprogress;
 extern PGDLLIMPORT bool shmem_exit_inprogress;
 extern PGDLLIMPORT volatile bool pending_pm_shmem_resize;
+extern PGDLLIMPORT volatile bool delay_shmem_resize;
 
 pg_noreturn extern void proc_exit(int code);
 extern void shmem_exit(int code);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0a59746b472..704b065f9e9 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -65,7 +65,7 @@ typedef struct ShmemSegment
 } ShmemSegment;
 
 /* Number of available segments for anonymous memory mappings */
-#define ANON_MAPPINGS 6
+#define ANON_MAPPINGS 5
 
 extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS];
 extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -172,7 +172,4 @@ extern void ShmemControlInit(void);
 /* Checkpoint BufferIds */
 #define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4
 
-/* Buffer strategy status */
-#define STRATEGY_SHMEM_SEGMENT 5
-
 #endif							/* PG_SHMEM_H */
-- 
2.34.1
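
Illustrative sketch (not part of the patch): the delay_shmem_resize protocol
described in the commit message is not specific to the background writer, so
any process that scans buffers could bracket its scan the same way
BgBufferSync() does above. A minimal, hypothetical user is sketched below;
scan_buffers_example() and its loop body are invented for illustration,
while delay_shmem_resize, ShmemCtrl->NSharedBuffers and NBuffers are the
symbols used by this patch, and PostgreSQL's usual headers are assumed.

	/*
	 * Hypothetical sketch: a buffer scan bracketed by delay_shmem_resize,
	 * mirroring the pattern BgBufferSync() follows in this patch.
	 */
	static void
	scan_buffers_example(void)
	{
		/* If a resize is already pending or in progress, back off entirely. */
		if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
			return;

		/* Ask the resize coordinator to hold off while we scan. */
		delay_shmem_resize = true;

		for (int buf_id = 0; buf_id < NBuffers; buf_id++)
		{
			/*
			 * Re-check on every iteration and stop the scan as soon as a
			 * resize starts, so the coordinator is not blocked for a whole
			 * pass over the pool.
			 */
			if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
				break;

			/* ... inspect or write out buffer buf_id here ... */
		}

		/* Let a pending resize proceed. */
		delay_shmem_resize = false;
	}

Note that this sketch inherits the caveat from TODO item 2: comparing
NSharedBuffers against NBuffers detects a resize that has already been
requested, not one that is merely about to begin.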
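
Similarly hypothetical: TODO item 1 proposes packing the inter-call state of
BgBufferSync() into a structure with a dedicated reset routine. A rough
sketch of that shape, with invented names (BgSyncState, bgsync_state,
BgSyncStateReset) and the field set taken from the statics introduced above,
could look like this:

	/*
	 * Hypothetical sketch for TODO item 1: the state saved between
	 * BgBufferSync() calls packed into one structure instead of separate
	 * file-scope statics.
	 */
	typedef struct BgSyncState
	{
		bool		saved_info_valid;
		int			prev_strategy_buf_id;
		uint32		prev_strategy_passes;
		int			next_to_clean;
		uint32		next_passes;
		float		smoothed_alloc;		/* moving average of allocation rate */
		float		smoothed_density;	/* moving average of clean-buffer density */
	} BgSyncState;

	static BgSyncState bgsync_state = {.smoothed_density = 10.0};

	/*
	 * One of the three or four functions the TODO asks for: forget the
	 * saved state, e.g. after the buffer pool is resized or while
	 * bgwriter_lru_maxpages <= 0.
	 */
	static void
	BgSyncStateReset(BgSyncState *state)
	{
		state->saved_info_valid = false;
		state->smoothed_alloc = 0;
		state->smoothed_density = 10.0;
	}

With that shape, BgBufferSyncReset() would reduce to a call to
BgSyncStateReset(&bgsync_state), and the setup and scan phases could become
separate functions operating on the same structure.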