From d5fed9a73fac8579c87952d4551adf5bdfeba8c4 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Tue, 12 Aug 2025 10:58:52 -0400 Subject: [PATCH v13 4/4] Optimize modulo and division used in clock-sweep algorithm Improve the performance of the buffer manager by replacing the modulo and division operations with a technique described in the paper "Division by Invariant Integers using Multiplication" [1]. Our implementation is inspired by the MIT Licensed "fastdiv" [2]. This algorithm provides accurate division and modulo in constant time that is pipeline and ALU friendly and estimated to take about 12-18 cycles (vs 26-90 for hardware division). Because our divisor (NBuffers) is fixed at startup, we need to calculate the constants derived from it only once. [1] https://gmplib.org/~tege/divcnst-pldi94.pdf [2] https://github.com/jmtilli/fastdiv --- src/backend/storage/buffer/freelist.c | 106 ++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 8 deletions(-) diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7d68f2227b3..96ae21fb152 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -31,12 +31,28 @@ typedef struct { /* * The clock-sweep counter is atomically updated by 1 at every tick. Use - * the macro CLOCKSWEEP_HAND() to find the location of the hand on the - * clock. Use CLOCKSWEEP_PASSES() to calculate the number of times the + * the function ClockSweepHand() to find the location of the hand on the + * clock. Use ClockSweepPasses() to calculate the number of times the * clock-sweep hand has made a complete pass around the clock. */ pg_atomic_uint64 clockSweepCounter; + /* + * Division and modulo can be expensive to calculate repeatedly. 
Given
+	 * that the buffer manager is a very hot code path we implement a more
+	 * efficient method based on using "Division by Invariant Integers using
+	 * Multiplication" (https://gmplib.org/~tege/divcnst-pldi94.pdf) by
+	 * Granlund-Montgomery.  Our implementation below was inspired by the MIT
+	 * Licensed "fastdiv" (https://github.com/jmtilli/fastdiv).
+	 */
+	struct
+	{
+		uint32		mul;		/* low 32 bits of the magic multiplier */
+		uint32		mod;		/* the divisor itself (NBuffers) */
+		uint8		shift1:1;	/* 0 if the divisor is a power of 2, else 1 */
+		uint8		shift2:7;	/* floor(log2(divisor)) */
+	}			md;
+
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
@@ -86,17 +102,75 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 static void AddBufferToRing(BufferAccessStrategy strategy,
 							BufferDesc *buf);
 
+/*
+ * Compute n / NBuffers for a 32-bit dividend without a divide instruction
+ * (Granlund-Montgomery).  md.mul holds only the low 32 bits of the 33-bit
+ * multiplier, so the subtract/shift/add sequence below folds the missing
+ * top bit back in without overflowing 32 bits.  The constants are NOT valid
+ * for dividends wider than 32 bits.
+ */
+static inline uint32
+InvariantDivision32(uint32 n)
+{
+	uint32		hi = (uint32) (((uint64) n * StrategyControl->md.mul) >> 32);
+	uint32		q = n - hi;
+
+	q >>= StrategyControl->md.shift1;
+	q += hi;
+	q >>= StrategyControl->md.shift2;
+
+	return q;
+}
+
+/* Return (uint32) (n / NBuffers) for any 64-bit n. */
+static inline uint32
+InvariantDivision(uint64 n)
+{
+	/* Counters past 2^32 are outside the fast path's range; divide plainly. */
+	if ((n >> 32) != 0)
+		return (uint32) (n / StrategyControl->md.mod);
+
+	return InvariantDivision32((uint32) n);
+}
+
+/* Return (uint32) n % NBuffers; only the low 32 bits of n are considered. */
+static inline uint32
+InvariantModulo(uint64 n)
+{
+	uint32		dividend = (uint32) n;
+
+	return dividend - StrategyControl->md.mod * InvariantDivision32(dividend);
+}
+
 /*
  * The clock-sweep counter is a uint64 but the clock hand can never be larger
- * than a uint32. Enforce that contract uniformly using this macro.
+ * than a uint32, so only the low 32 bits of the counter are used.
  */
-#define CLOCKSWEEP_HAND(counter) \ ((uint32) (counter)) % NBuffers
+static inline uint32
+ClockSweepHand(uint64 counter)
+{
+	uint32		result = InvariantModulo(counter);
+
+	Assert(result < NBuffers);
+	Assert(result == ((uint32) counter) % NBuffers);
+
+	return result;
+}
 
 /*
  * The number of times the clock hand has made a complete pass around the clock
  * visiting all the available buffers is the counter divided by NBuffers.
  */
-#define CLOCKSWEEP_PASSES(counter) \ (uint32) ((counter) / NBuffers)
+static inline uint32
+ClockSweepPasses(uint64 counter)
+{
+	uint32		result = InvariantDivision(counter);
+
+	/* Verify our result matches standard division */
+	Assert(result == (uint32) (counter / NBuffers));
+
+	return result;
+}
 
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
@@ -117,7 +191,7 @@ ClockSweepTick(void)
 	 */
 	counter = pg_atomic_fetch_add_u64(&StrategyControl->clockSweepCounter, 1);
 
-	hand = CLOCKSWEEP_HAND(counter);
+	hand = ClockSweepHand(counter);
 	Assert(hand < NBuffers);
 
 	return hand;
@@ -251,10 +325,10 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	uint32		result;
 
 	counter = pg_atomic_read_u64(&StrategyControl->clockSweepCounter);
-	result = CLOCKSWEEP_HAND(counter);
+	result = ClockSweepHand(counter);
 
 	if (complete_passes)
-		*complete_passes = CLOCKSWEEP_PASSES(counter);
+		*complete_passes = ClockSweepPasses(counter);
 
 	if (num_buf_alloc)
 		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
@@ -333,11 +407,27 @@ StrategyInitialize(bool init)
 
 	if (!found)
 	{
+		uint32		divisor = NBuffers;
+		uint8		shift1 = (divisor & (divisor - 1)) != 0;	/* 0 iff power of 2 */
+		uint8		shift2 = 0;
+
 		/*
 		 * Only done once, usually in postmaster
 		 */
 		Assert(init);
 
+		/* Precompute fast-division constants; md.mul keeps the low 32 bits */
+		Assert(divisor > 0 && divisor < (1U << 31));
+
+		for (uint32 n = divisor; n >>= 1;)
+			shift2++;			/* shift2 = floor(log2(divisor)) */
+
+		StrategyControl->md.shift1 = shift1;
+		StrategyControl->md.shift2 = shift2;
+		StrategyControl->md.mod = divisor;
+		StrategyControl->md.mul =
+			(uint32) ((1ULL << (32 + shift1 + shift2)) / divisor + 1);
+
 		/* Initialize combined clock-sweep pointer/complete passes counter */
 		pg_atomic_init_u64(&StrategyControl->clockSweepCounter, 0);
 
-- 
2.49.0