From 482f56744a72c168492eefac2af9dcf5ee153524 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Sep 2025 17:16:01 -0400 Subject: [PATCH v3 2/6] bufmgr: Don't lock buffer header in StrategyGetBuffer() This is a small improvement on its own, due to not needing to hold the buffer spinlock as often. But the main reason for this is to reduce the use of PinBuffer_Locked() and LockBufHdr(), which a future commit will make a bit more expensive (to make more common paths faster). Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- src/include/storage/buf_internals.h | 4 + src/backend/storage/buffer/bufmgr.c | 44 +++++------ src/backend/storage/buffer/freelist.c | 110 +++++++++++++++++++------- 3 files changed, 105 insertions(+), 53 deletions(-) diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index dfd614f7ca4..c1206a46aba 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -371,6 +371,8 @@ UnlockBufHdr(BufferDesc *desc, uint32 buf_state) pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED)); } +extern uint32 WaitBufHdrUnlocked(BufferDesc *buf); + /* in bufmgr.c */ /* @@ -425,6 +427,8 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag); +extern void TrackNewBufferPin(Buffer buf); + /* solely to make it easier to write tests */ extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index b5eebfb6990..0b841eb7838 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -518,7 +518,6 @@ static void PinBuffer_Locked(BufferDesc *buf); static void UnpinBuffer(BufferDesc *buf); static void 
UnpinBufferNoOwner(BufferDesc *buf); static void BufferSync(int flags); -static uint32 WaitBufHdrUnlocked(BufferDesc *buf); static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); @@ -2325,7 +2324,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * Ensure, while the spinlock's not yet held, that there's a free refcount - * entry, and a resource owner slot for the pin. + * entry, and a resource owner slot for the pin that StrategyGetBuffer() acquires. */ ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); @@ -2334,17 +2333,11 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) again: /* - * Select a victim buffer. The buffer is returned with its header - * spinlock still held! + * Select a victim buffer. The buffer is returned pinned by this backend. */ buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); buf = BufferDescriptorGetBuffer(buf_hdr); - Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); - - /* Pin the buffer and then release the buffer spinlock */ - PinBuffer_Locked(buf_hdr); - /* * We shouldn't have any other pins for this buffer. */ @@ -3043,8 +3036,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, uint32 buf_state; uint32 old_buf_state; - ref = NewPrivateRefCountEntry(b); - old_buf_state = pg_atomic_read_u32(&buf->state); for (;;) { @@ -3091,6 +3082,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, break; } } + + TrackNewBufferPin(b); } else { @@ -3110,11 +3103,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, * cannot meddle with that. 
*/ result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0; + + Assert(ref->refcount > 0); + ref->refcount++; + ResourceOwnerRememberBuffer(CurrentResourceOwner, b); } - ref->refcount++; - Assert(ref->refcount > 0); - ResourceOwnerRememberBuffer(CurrentResourceOwner, b); return result; } @@ -3143,8 +3137,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, static void PinBuffer_Locked(BufferDesc *buf) { - Buffer b; - PrivateRefCountEntry *ref; uint32 buf_state; /* @@ -3169,12 +3161,7 @@ PinBuffer_Locked(BufferDesc *buf) buf_state += BUF_REFCOUNT_ONE; UnlockBufHdr(buf, buf_state); - b = BufferDescriptorGetBuffer(buf); - - ref = NewPrivateRefCountEntry(b); - ref->refcount++; - - ResourceOwnerRememberBuffer(CurrentResourceOwner, b); + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); } /* @@ -3289,6 +3276,17 @@ UnpinBufferNoOwner(BufferDesc *buf) } } +inline void +TrackNewBufferPin(Buffer buf) +{ + PrivateRefCountEntry *ref; + + ref = NewPrivateRefCountEntry(buf); + ref->refcount++; + + ResourceOwnerRememberBuffer(CurrentResourceOwner, buf); +} + #define ST_SORT sort_checkpoint_bufferids #define ST_ELEMENT_TYPE CkptSortItem #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) @@ -6242,7 +6240,7 @@ LockBufHdr(BufferDesc *desc) * Obviously the buffer could be locked by the time the value is returned, so * this is primarily useful in CAS style loops. */ -static uint32 +pg_noinline uint32 WaitBufHdrUnlocked(BufferDesc *buf) { SpinDelayStatus delayStatus; diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7d59a92bd1a..9d9fb0471a0 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -164,8 +164,8 @@ ClockSweepTick(void) * * strategy is a BufferAccessStrategy object, or NULL for default strategy. * - * To ensure that no one else can pin the buffer before we do, we must - * return the buffer with the buffer header spinlock still held. 
+ * The buffer is pinned before returning, but resource ownership is the + * responsibility of the caller. */ BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring) @@ -173,7 +173,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r BufferDesc *buf; int bgwprocno; int trycounter; - uint32 local_buf_state; /* to avoid repeated (de-)referencing */ *from_ring = false; @@ -228,44 +227,74 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r trycounter = NBuffers; for (;;) { + uint32 old_buf_state; + uint32 local_buf_state; + buf = GetBufferDescriptor(ClockSweepTick()); /* * If the buffer is pinned or has a nonzero usage_count, we cannot use * it; decrement the usage_count (unless pinned) and keep scanning. */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) + old_buf_state = pg_atomic_read_u32(&buf->state); + + for (;;) { + local_buf_state = old_buf_state; + + if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0) + { + if (--trycounter == 0) + { + /* + * We've scanned all the buffers without making any state + * changes, so all the buffers are pinned (or were when we + * looked at them). We could hope that someone will free + * one eventually, but it's probably better to fail than + * to risk getting stuck in an infinite loop. 
+ */ + elog(ERROR, "no unpinned buffers available"); + } + break; + } + + if (unlikely(local_buf_state & BM_LOCKED)) + { + old_buf_state = WaitBufHdrUnlocked(buf); + continue; + } + if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) { local_buf_state -= BUF_USAGECOUNT_ONE; - trycounter = NBuffers; + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + trycounter = NBuffers; + break; + } } else { - /* Found a usable buffer */ - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; + local_buf_state += BUF_REFCOUNT_ONE; + + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + /* Found a usable buffer */ + if (strategy != NULL) + AddBufferToRing(strategy, buf); + *buf_state = local_buf_state; + + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); + + return buf; + } } + } - else if (--trycounter == 0) - { - /* - * We've scanned all the buffers without making any state changes, - * so all the buffers are pinned (or were when we looked at them). - * We could hope that someone will free one eventually, but it's - * probably better to fail than to risk getting stuck in an - * infinite loop. - */ - UnlockBufHdr(buf, local_buf_state); - elog(ERROR, "no unpinned buffers available"); - } - UnlockBufHdr(buf, local_buf_state); } } @@ -621,6 +650,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) { BufferDesc *buf; Buffer bufnum; + uint32 old_buf_state; uint32 local_buf_state; /* to avoid repeated (de-)referencing */ @@ -647,14 +677,34 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * shouldn't re-use it. 
*/ buf = GetBufferDescriptor(bufnum - 1); - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1) + + old_buf_state = pg_atomic_read_u32(&buf->state); + + for (;;) { - *buf_state = local_buf_state; - return buf; + local_buf_state = old_buf_state; + + if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0 + || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1) + break; + + if (unlikely(local_buf_state & BM_LOCKED)) + { + old_buf_state = WaitBufHdrUnlocked(buf); + continue; + } + + local_buf_state += BUF_REFCOUNT_ONE; + + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + local_buf_state)) + { + *buf_state = local_buf_state; + + TrackNewBufferPin(BufferDescriptorGetBuffer(buf)); + return buf; + } } - UnlockBufHdr(buf, local_buf_state); /* * Tell caller to allocate a new buffer with the normal allocation -- 2.48.1.76.g4e746b1a31.dirty