From c4d51ab87b92f9900e37d42cf74980e87b648a56 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Sun, 8 Jun 2025 18:53:12 +0200
Subject: [PATCH v2 5/7] NUMA: clocksweep partitioning

---
 src/backend/storage/buffer/bufmgr.c   | 473 ++++++++++++++------------
 src/backend/storage/buffer/freelist.c | 202 ++++++++---
 src/include/storage/buf_internals.h   |   4 +-
 3 files changed, 424 insertions(+), 255 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5922689fe5d..3d6c834d77c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3587,6 +3587,23 @@ BufferSync(int flags)
 	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
 }
 
+/*
+ * Information saved between calls so we can determine the strategy
+ * point's advance rate and avoid scanning already-cleaned buffers.
+ *
+ * XXX One value per partition. We don't know how many partitions there
+ * are, so allocate 32 - that should be enough for this PoC patch.
+ *
+ * XXX It might be better to have a per-partition struct with all the info.
+ */
+#define MAX_CLOCKSWEEP_PARTITIONS 32
+static bool saved_info_valid = false;
+static int	prev_strategy_buf_id[MAX_CLOCKSWEEP_PARTITIONS];
+static uint32 prev_strategy_passes[MAX_CLOCKSWEEP_PARTITIONS];
+static int	next_to_clean[MAX_CLOCKSWEEP_PARTITIONS];
+static uint32 next_passes[MAX_CLOCKSWEEP_PARTITIONS];
+
+
 /*
  * BgBufferSync -- Write out some dirty buffers in the pool.
 *
@@ -3602,55 +3619,24 @@ bool
 BgBufferSync(WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
-	int			strategy_buf_id;
-	uint32		strategy_passes;
 	uint32		recent_alloc;
+	uint32		recent_alloc_partition;
+	int			num_partitions;
 
-	/*
-	 * Information saved between calls so we can determine the strategy
-	 * point's advance rate and avoid scanning already-cleaned buffers.
-	 */
-	static bool saved_info_valid = false;
-	static int	prev_strategy_buf_id;
-	static uint32 prev_strategy_passes;
-	static int	next_to_clean;
-	static uint32 next_passes;
-
-	/* Moving averages of allocation rate and clean-buffer density */
-	static float smoothed_alloc = 0;
-	static float smoothed_density = 10.0;
-
-	/* Potentially these could be tunables, but for now, not */
-	float		smoothing_samples = 16;
-	float		scan_whole_pool_milliseconds = 120000.0;
-
-	/* Used to compute how far we scan ahead */
-	long		strategy_delta;
-	int			bufs_to_lap;
-	int			bufs_ahead;
-	float		scans_per_alloc;
-	int			reusable_buffers_est;
-	int			upcoming_alloc_est;
-	int			min_scan_buffers;
-
-	/* Variables for the scanning loop proper */
-	int			num_to_scan;
-	int			num_written;
-	int			reusable_buffers;
+	/* assume we can hibernate; any partition can set this to false */
+	bool		hibernate = true;
 
-	/* Variables for final smoothed_density update */
-	long		new_strategy_delta;
-	uint32		new_recent_alloc;
+	/* get the number of clocksweep partitions, and the total alloc count */
+	StrategySyncPrepare(&num_partitions, &recent_alloc);
 
-	/*
-	 * Find out where the freelist clock sweep currently is, and how many
-	 * buffer allocations have happened since our last call.
-	 */
-	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+	Assert(num_partitions <= MAX_CLOCKSWEEP_PARTITIONS);
 
 	/* Report buffer alloc counts to pgstat */
 	PendingBgWriterStats.buf_alloc += recent_alloc;
 
+	/* average number of buffers allocated per partition */
+	recent_alloc_partition = (recent_alloc / num_partitions);
+
 	/*
 	 * If we're not running the LRU scan, just stop after doing the stats
 	 * stuff.  We mark the saved state invalid so that we can recover sanely
@@ -3663,223 +3649,282 @@ BgBufferSync(WritebackContext *wb_context)
 	}
 
 	/*
-	 * Compute strategy_delta = how many buffers have been scanned by the
-	 * clock sweep since last time. If first time through, assume none. Then
-	 * see if we are still ahead of the clock sweep, and if so, how many
-	 * buffers we could scan before we'd catch up with it and "lap" it. Note:
-	 * weird-looking coding of xxx_passes comparisons are to avoid bogus
-	 * behavior when the passes counts wrap around.
-	 */
-	if (saved_info_valid)
-	{
-		int32		passes_delta = strategy_passes - prev_strategy_passes;
-
-		strategy_delta = strategy_buf_id - prev_strategy_buf_id;
-		strategy_delta += (long) passes_delta * NBuffers;
+	 * Now process the clocksweep partitions one by one, applying the same
+	 * cleanup logic we previously applied to the buffer pool as a whole.
+	 *
+	 * XXX Maybe we should randomize the order of the partitions a bit, so
+	 * that we don't always start from partition 0? Perhaps not a complete
+	 * shuffle, but at least pick a random starting point?
+	 */
+	for (int partition = 0; partition < num_partitions; partition++)
+	{
+		/* info obtained from freelist.c */
+		int			strategy_buf_id;
+		uint32		strategy_passes;
+
+		/*
+		 * Moving averages of allocation rate and clean-buffer density.
+		 *
+		 * XXX These statics are shared by all partitions; per the XXX above
+		 * MAX_CLOCKSWEEP_PARTITIONS, a per-partition struct would let us
+		 * track them separately.
+		 */
+		static float smoothed_alloc = 0;
+		static float smoothed_density = 10.0;
+
+		/* Potentially these could be tunables, but for now, not */
+		float		smoothing_samples = 16;
+		float		scan_whole_pool_milliseconds = 120000.0;
+
+		/* Used to compute how far we scan ahead */
+		long		strategy_delta;
+		int			bufs_to_lap;
+		int			bufs_ahead;
+		float		scans_per_alloc;
+		int			reusable_buffers_est;
+		int			upcoming_alloc_est;
+		int			min_scan_buffers;
+
+		/* Variables for the scanning loop proper */
+		int			num_to_scan;
+		int			num_written;
+		int			reusable_buffers;
+
+		/* Variables for final smoothed_density update */
+		long		new_strategy_delta;
+		uint32		new_recent_alloc;
+
+		/* buffer range for the clocksweep partition */
+		int			first_buffer;
+		int			num_buffers;
 
-		Assert(strategy_delta >= 0);
+		/*
+		 * Find out where the clock sweep for this partition currently is,
+		 * and which range of buffers the partition covers.
+		 */
+		strategy_buf_id = StrategySyncStart(partition, &strategy_passes,
+											&first_buffer, &num_buffers);
 
-		if ((int32) (next_passes - strategy_passes) > 0)
+		/*
+		 * Compute strategy_delta = how many buffers have been scanned by
+		 * the clock sweep since last time. If first time through, assume
+		 * none. Then see if we are still ahead of the clock sweep, and if
+		 * so, how many buffers we could scan before we'd catch up with it
+		 * and "lap" it. Note: the weird-looking coding of the xxx_passes
+		 * comparisons is to avoid bogus behavior when the passes counts
+		 * wrap around.
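+		 *
+		 * XXX Worked example (illustrative, with made-up numbers): with
+		 * num_buffers = 1024, a previous position of pass 3 / buffer 900
+		 * and a current position of pass 4 / buffer 100, we get
+		 *
+		 *     passes_delta   = 4 - 3 = 1
+		 *     strategy_delta = (100 - 900) + 1 * 1024 = 224
+		 *
+		 * i.e. 224 buffers scanned since the last call, even though the
+		 * raw buffer id moved "backwards". Computing passes_delta in int32
+		 * keeps this sane even when the uint32 pass counter wraps around.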
+		 */
+		if (saved_info_valid)
 		{
-			/* we're one pass ahead of the strategy point */
-			bufs_to_lap = strategy_buf_id - next_to_clean;
+			int32		passes_delta = strategy_passes - prev_strategy_passes[partition];
+
+			strategy_delta = strategy_buf_id - prev_strategy_buf_id[partition];
+			strategy_delta += (long) passes_delta * num_buffers;
+
+			Assert(strategy_delta >= 0);
+
+			if ((int32) (next_passes[partition] - strategy_passes) > 0)
+			{
+				/* we're one pass ahead of the strategy point */
+				bufs_to_lap = strategy_buf_id - next_to_clean[partition];
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta, bufs_to_lap);
+				elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta, bufs_to_lap);
 #endif
-		}
-		else if (next_passes == strategy_passes &&
-				 next_to_clean >= strategy_buf_id)
-		{
-			/* on same pass, but ahead or at least not behind */
-			bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+			}
+			else if (next_passes[partition] == strategy_passes &&
+					 next_to_clean[partition] >= strategy_buf_id)
+			{
+				/* on same pass, but ahead or at least not behind */
+				bufs_to_lap = num_buffers - (next_to_clean[partition] - strategy_buf_id);
+#ifdef BGW_DEBUG
+				elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta, bufs_to_lap);
+#endif
+			}
+			else
+			{
+				/*
+				 * We're behind, so skip forward to the strategy point and
+				 * start cleaning from there.
+				 */
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta, bufs_to_lap);
+				elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+					 next_passes[partition], next_to_clean[partition],
+					 strategy_passes, strategy_buf_id,
+					 strategy_delta);
 #endif
+				next_to_clean[partition] = strategy_buf_id;
+				next_passes[partition] = strategy_passes;
+				bufs_to_lap = num_buffers;
+			}
 		}
 		else
 		{
 			/*
-			 * We're behind, so skip forward to the strategy point and start
-			 * cleaning from there.
+			 * Initializing at startup or after LRU scanning had been off.
+			 * Always start at the strategy point.
 			 */
 #ifdef BGW_DEBUG
-			elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
-				 next_passes, next_to_clean,
-				 strategy_passes, strategy_buf_id,
-				 strategy_delta);
+			elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+				 strategy_passes, strategy_buf_id);
 #endif
-			next_to_clean = strategy_buf_id;
-			next_passes = strategy_passes;
-			bufs_to_lap = NBuffers;
+			strategy_delta = 0;
+			next_to_clean[partition] = strategy_buf_id;
+			next_passes[partition] = strategy_passes;
+			bufs_to_lap = num_buffers;
 		}
-	}
-	else
-	{
-		/*
-		 * Initializing at startup or after LRU scanning had been off. Always
-		 * start at the strategy point.
-		 */
-#ifdef BGW_DEBUG
-		elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
-			 strategy_passes, strategy_buf_id);
-#endif
-		strategy_delta = 0;
-		next_to_clean = strategy_buf_id;
-		next_passes = strategy_passes;
-		bufs_to_lap = NBuffers;
-	}
 
-	/* Update saved info for next time */
-	prev_strategy_buf_id = strategy_buf_id;
-	prev_strategy_passes = strategy_passes;
-	saved_info_valid = true;
+		/* Update saved info for next time */
+		prev_strategy_buf_id[partition] = strategy_buf_id;
+		prev_strategy_passes[partition] = strategy_passes;
+
+		/*
+		 * NB: saved_info_valid is not updated here - it only becomes true
+		 * once all partitions have been processed, at the end of this loop.
+		 */
 
-	/*
-	 * Compute how many buffers had to be scanned for each new allocation, ie,
-	 * 1/density of reusable buffers, and track a moving average of that.
-	 *
-	 * If the strategy point didn't move, we don't update the density estimate
-	 */
-	if (strategy_delta > 0 && recent_alloc > 0)
-	{
-		scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
-		smoothed_density += (scans_per_alloc - smoothed_density) /
-			smoothing_samples;
-	}
+		/*
+		 * Compute how many buffers had to be scanned for each new
+		 * allocation, ie, 1/density of reusable buffers, and track a moving
+		 * average of that.
+		 *
+		 * If the strategy point didn't move, we don't update the density
+		 * estimate.
+		 */
+		if (strategy_delta > 0 && recent_alloc_partition > 0)
+		{
+			scans_per_alloc = (float) strategy_delta / (float) recent_alloc_partition;
+			smoothed_density += (scans_per_alloc - smoothed_density) /
+				smoothing_samples;
+		}
 
-	/*
-	 * Estimate how many reusable buffers there are between the current
-	 * strategy point and where we've scanned ahead to, based on the smoothed
-	 * density estimate.
-	 */
-	bufs_ahead = NBuffers - bufs_to_lap;
-	reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+		/*
+		 * Estimate how many reusable buffers there are between the current
+		 * strategy point and where we've scanned ahead to, based on the
+		 * smoothed density estimate.
+		 */
+		bufs_ahead = num_buffers - bufs_to_lap;
+		reusable_buffers_est = (float) bufs_ahead / smoothed_density;
 
-	/*
-	 * Track a moving average of recent buffer allocations. Here, rather than
-	 * a true average we want a fast-attack, slow-decline behavior: we
-	 * immediately follow any increase.
-	 */
-	if (smoothed_alloc <= (float) recent_alloc)
-		smoothed_alloc = recent_alloc;
-	else
-		smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
-			smoothing_samples;
+		/*
+		 * Track a moving average of recent buffer allocations. Here, rather
+		 * than a true average we want a fast-attack, slow-decline behavior:
+		 * we immediately follow any increase.
+		 */
+		if (smoothed_alloc <= (float) recent_alloc_partition)
+			smoothed_alloc = recent_alloc_partition;
+		else
+			smoothed_alloc += ((float) recent_alloc_partition - smoothed_alloc) /
+				smoothing_samples;
 
-	/* Scale the estimate by a GUC to allow more aggressive tuning. */
-	upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
+		/* Scale the estimate by a GUC to allow more aggressive tuning. */
+		upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
 
-	/*
-	 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
-	 * eventually underflow to zero, and the underflows produce annoying
-	 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
-	 * zero, there's no point in tracking smaller and smaller values of
-	 * smoothed_alloc, so just reset it to exactly zero to avoid this
-	 * syndrome. It will pop back up as soon as recent_alloc increases.
-	 */
-	if (upcoming_alloc_est == 0)
-		smoothed_alloc = 0;
+		/*
+		 * If recent_alloc remains at zero for many cycles, smoothed_alloc
+		 * will eventually underflow to zero, and the underflows produce
+		 * annoying kernel warnings on some platforms. Once
+		 * upcoming_alloc_est has gone to zero, there's no point in tracking
+		 * smaller and smaller values of smoothed_alloc, so just reset it to
+		 * exactly zero to avoid this syndrome. It will pop back up as soon
+		 * as recent_alloc increases.
+		 */
+		if (upcoming_alloc_est == 0)
+			smoothed_alloc = 0;
 
-	/*
-	 * Even in cases where there's been little or no buffer allocation
-	 * activity, we want to make a small amount of progress through the buffer
-	 * cache so that as many reusable buffers as possible are clean after an
-	 * idle period.
-	 *
-	 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
-	 * the BGW will be called during the scan_whole_pool time; slice the
-	 * buffer pool into that many sections.
-	 */
-	min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+		/*
+		 * Even in cases where there's been little or no buffer allocation
+		 * activity, we want to make a small amount of progress through the
+		 * buffer cache so that as many reusable buffers as possible are
+		 * clean after an idle period.
+		 *
+		 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many
+		 * times the BGW will be called during the scan_whole_pool time;
+		 * slice the buffer pool into that many sections.
+		 */
+		min_scan_buffers = (int) (num_buffers / (scan_whole_pool_milliseconds / BgWriterDelay));
 
-	if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
-	{
+		if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
+		{
 #ifdef BGW_DEBUG
-		elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
-			 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+			elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+				 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
 #endif
-		upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
-	}
-
-	/*
-	 * Now write out dirty reusable buffers, working forward from the
-	 * next_to_clean point, until we have lapped the strategy scan, or cleaned
-	 * enough buffers to match our estimate of the next cycle's allocation
-	 * requirements, or hit the bgwriter_lru_maxpages limit.
-	 */
+			upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+		}
 
-	num_to_scan = bufs_to_lap;
-	num_written = 0;
-	reusable_buffers = reusable_buffers_est;
+		/*
+		 * Now write out dirty reusable buffers, working forward from the
+		 * next_to_clean point, until we have lapped the strategy scan, or
+		 * cleaned enough buffers to match our estimate of the next cycle's
+		 * allocation requirements, or hit the bgwriter_lru_maxpages limit.
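+		 *
+		 * XXX Illustrative note (made-up numbers, not from the original
+		 * patch): both limits are per-partition here. The scan wraps within
+		 * [first_buffer, first_buffer + num_buffers), and the write limit
+		 * is bgwriter_lru_maxpages / num_partitions - e.g. with
+		 * bgwriter_lru_maxpages = 100 and 4 partitions, each partition may
+		 * write at most 25 pages per cycle. Note the integer division
+		 * yields 0 when there are more partitions than maxpages, which
+		 * probably wants a Max(1, ...) clamp eventually.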
+ */ - /* Execute the LRU scan */ - while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) - { - int sync_state = SyncOneBuffer(next_to_clean, true, - wb_context); + num_to_scan = bufs_to_lap; + num_written = 0; + reusable_buffers = reusable_buffers_est; - if (++next_to_clean >= NBuffers) + /* Execute the LRU scan */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { - next_to_clean = 0; - next_passes++; - } - num_to_scan--; + int sync_state = SyncOneBuffer(next_to_clean[partition], true, + wb_context); - if (sync_state & BUF_WRITTEN) - { - reusable_buffers++; - if (++num_written >= bgwriter_lru_maxpages) + if (++next_to_clean[partition] >= (first_buffer + num_buffers)) { - PendingBgWriterStats.maxwritten_clean++; - break; + next_to_clean[partition] = first_buffer; + next_passes[partition]++; + } + num_to_scan--; + + if (sync_state & BUF_WRITTEN) + { + reusable_buffers++; + if (++num_written >= (bgwriter_lru_maxpages / num_partitions)) + { + PendingBgWriterStats.maxwritten_clean++; + break; + } } + else if (sync_state & BUF_REUSABLE) + reusable_buffers++; } - else if (sync_state & BUF_REUSABLE) - reusable_buffers++; - } - PendingBgWriterStats.buf_written_clean += num_written; + PendingBgWriterStats.buf_written_clean += num_written; #ifdef BGW_DEBUG - elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", - recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead, - smoothed_density, reusable_buffers_est, upcoming_alloc_est, - bufs_to_lap - num_to_scan, - num_written, - reusable_buffers - reusable_buffers_est); + elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", + recent_alloc_partition, smoothed_alloc, strategy_delta, bufs_ahead, + smoothed_density, reusable_buffers_est, upcoming_alloc_est, + bufs_to_lap - num_to_scan, + num_written, + reusable_buffers - reusable_buffers_est); #endif - /* - * Consider the above scan as being like a new allocation scan. - * Characterize its density and update the smoothed one based on it. This - * effectively halves the moving average period in cases where both the - * strategy and the background writer are doing some useful scanning, - * which is helpful because a long memory isn't as desirable on the - * density estimates. - */ - new_strategy_delta = bufs_to_lap - num_to_scan; - new_recent_alloc = reusable_buffers - reusable_buffers_est; - if (new_strategy_delta > 0 && new_recent_alloc > 0) - { - scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; - smoothed_density += (scans_per_alloc - smoothed_density) / - smoothing_samples; + /* + * Consider the above scan as being like a new allocation scan. + * Characterize its density and update the smoothed one based on it. This + * effectively halves the moving average period in cases where both the + * strategy and the background writer are doing some useful scanning, + * which is helpful because a long memory isn't as desirable on the + * density estimates. 
+ */ + new_strategy_delta = bufs_to_lap - num_to_scan; + new_recent_alloc = reusable_buffers - reusable_buffers_est; + if (new_strategy_delta > 0 && new_recent_alloc > 0) + { + scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; + smoothed_density += (scans_per_alloc - smoothed_density) / + smoothing_samples; #ifdef BGW_DEBUG - elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", - new_recent_alloc, new_strategy_delta, - scans_per_alloc, smoothed_density); + elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", + new_recent_alloc, new_strategy_delta, + scans_per_alloc, smoothed_density); #endif + } + + /* hibernate if all partitions can hibernate */ + hibernate &= (bufs_to_lap == 0 && recent_alloc_partition == 0); } + /* now that we've scanned all partitions, mark the cached info as valid */ + saved_info_valid = true; + /* Return true if OK to hibernate */ - return (bufs_to_lap == 0 && recent_alloc == 0); + return hibernate; } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index e38e5c7ec3d..1827e052da7 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -63,17 +63,27 @@ typedef struct BufferStrategyFreelist #define MIN_FREELIST_PARTITIONS 4 /* - * The shared freelist control information. + * Information about one partition of the ClockSweep (on a subset of buffers). + * + * XXX Should be careful to align this to cachelines, etc. */ typedef struct { /* Spinlock: protects the values below */ - slock_t buffer_strategy_lock; + slock_t clock_sweep_lock; + + /* range for this clock weep partition */ + int32 firstBuffer; + int32 numBuffers; /* * Clock sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. + * + * XXX This is relative to firstBuffer, so needs to be offset properly. + * + * XXX firstBuffer + (nextVictimBuffer % numBuffers) */ pg_atomic_uint32 nextVictimBuffer; @@ -83,6 +93,15 @@ typedef struct */ uint32 completePasses; /* Complete cycles of the clock sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ +} ClockSweep; + +/* + * The shared freelist control information. + */ +typedef struct +{ + /* Spinlock: protects the values below */ + slock_t buffer_strategy_lock; /* * Bgworker process to be notified upon activity or -1 if none. See @@ -99,6 +118,9 @@ typedef struct int num_partitions_groups; /* effectively num of NUMA nodes */ int num_partitions_per_group; + /* clocksweep partitions */ + ClockSweep *sweeps; + BufferStrategyFreelist freelists[FLEXIBLE_ARRAY_MEMBER]; } BufferStrategyControl; @@ -152,6 +174,7 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state); static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); +static ClockSweep *ChooseClockSweep(void); /* * ClockSweepTick - Helper routine for StrategyGetBuffer() @@ -163,6 +186,7 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; + ClockSweep *sweep = ChooseClockSweep(); /* * Atomically move hand ahead one buffer - if there's several processes @@ -170,14 +194,14 @@ ClockSweepTick(void) * apparent order. 
 	 */
 	victim =
-		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+		pg_atomic_fetch_add_u32(&sweep->nextVictimBuffer, 1);
 
-	if (victim >= NBuffers)
+	if (victim >= sweep->numBuffers)
 	{
 		uint32		originalVictim = victim;
 
 		/* always wrap what we look up in BufferDescriptors */
-		victim = victim % NBuffers;
+		victim = victim % sweep->numBuffers;
 
 		/*
 		 * If we're the one that just caused a wraparound, force
@@ -203,19 +227,23 @@ ClockSweepTick(void)
 			 * could lead to an overflow of nextVictimBuffers, but that's
 			 * highly unlikely and wouldn't be particularly harmful.
 			 */
-			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			SpinLockAcquire(&sweep->clock_sweep_lock);
 
-			wrapped = expected % NBuffers;
+			wrapped = expected % sweep->numBuffers;
 
-			success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+			success = pg_atomic_compare_exchange_u32(&sweep->nextVictimBuffer,
 													 &expected, wrapped);
 			if (success)
-				StrategyControl->completePasses++;
-			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+				sweep->completePasses++;
+			SpinLockRelease(&sweep->clock_sweep_lock);
 		}
 	}
-	return victim;
+
+	/* XXX buffer IDs are 1-based, but we're calculating 0-based indexes */
+	Assert(BufferIsValid(1 + sweep->firstBuffer + (victim % sweep->numBuffers)));
+
+	return sweep->firstBuffer + victim;
 }
 
 /*
@@ -289,6 +317,28 @@ calculate_partition_index()
 	return index;
 }
 
+/*
+ * ChooseClockSweep
+ *		pick a clocksweep partition based on the NUMA node and CPU
+ *
+ * The number of clocksweep partitions may not match the number of NUMA
+ * nodes, but it should not be lower. Each partition should be mapped to
+ * a single NUMA node, but a node may have multiple partitions. If there
+ * are multiple partitions per node (all nodes have the same number of
+ * partitions), we pick the partition using the CPU.
+ *
+ * XXX Maybe we should make both the total and "per group" counts powers
+ * of two? That'd allow using shifts instead of divisions, which is
+ * cheaper. But how would that deal with an odd number of nodes?
+ */
+static ClockSweep *
+ChooseClockSweep(void)
+{
+	int			index = calculate_partition_index();
+
+	return &StrategyControl->sweeps[index];
+}
+
 /*
  * ChooseFreeList
 *		Pick the buffer freelist to use, depending on the CPU and NUMA node.
@@ -404,7 +454,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	 * the rate of buffer consumption. Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
 	 */
-	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+	pg_atomic_fetch_add_u32(&ChooseClockSweep()->numBufferAllocs, 1);
 
 	/*
 	 * First check, without acquiring the lock, whether there's buffers in the
@@ -475,13 +525,17 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	/*
 	 * Nothing on the freelist, so run the "clock sweep" algorithm
 	 *
-	 * XXX Should we also make this NUMA-aware, to only access buffers from
-	 * the same NUMA node? That'd probably mean we need to make the clock
-	 * sweep NUMA-aware, perhaps by having multiple clock sweeps, each for a
-	 * subset of buffers. But that also means each process could "sweep" only
-	 * a fraction of buffers, even if the other buffers are better candidates
-	 * for eviction. Would that also mean we'd have multiple bgwriters, one
-	 * for each node, or would one bgwriter handle all of that?
+	 * XXX Note that ClockSweepTick() is NUMA-aware: it only looks at
+	 * buffers from a single partition, mapped to the current NUMA node,
+	 * so eviction only touches buffers local to that node.
+	 *
+	 * XXX That also means each process "sweeps" only a fraction of the
+	 * buffers, even if other buffers are better candidates for eviction.
+	 * Maybe there should be some logic to "steal" buffers from other
+	 * freelists or other nodes?
+	 *
+	 * XXX Would that also mean we'd have multiple bgwriters, one for each
+	 * node, or would one bgwriter handle all of that?
 	 */
 	trycounter = NBuffers;
 	for (;;)
@@ -563,6 +617,41 @@ StrategyFreeBuffer(BufferDesc *buf)
 	SpinLockRelease(&freelist->freelist_lock);
 }
 
+/*
+ * StrategySyncPrepare -- prepare for sync of all partitions
+ *
+ * Determine the number of clocksweep partitions, and the count of recent
+ * buffer allocations (summed across all the partitions). This allows
+ * BgBufferSync to calculate the average number of allocations per partition
+ * for the next sync cycle. The per-partition alloc counts are reset as they
+ * are read while walking the partitions.
+ */
+void
+StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc)
+{
+	if (num_buf_alloc)
+		*num_buf_alloc = 0;
+	*num_parts = StrategyControl->num_partitions;
+
+	/*
+	 * We lock the partitions one by one, so the counts are not read exactly
+	 * in sync, but that should be fine. This is only used for heuristics
+	 * anyway.
+	 */
+	for (int i = 0; i < StrategyControl->num_partitions; i++)
+	{
+		ClockSweep *sweep = &StrategyControl->sweeps[i];
+
+		SpinLockAcquire(&sweep->clock_sweep_lock);
+		if (num_buf_alloc)
+		{
+			*num_buf_alloc += pg_atomic_exchange_u32(&sweep->numBufferAllocs, 0);
+		}
+		SpinLockRelease(&sweep->clock_sweep_lock);
+	}
+}
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
 *
@@ -570,37 +659,44 @@ StrategyFreeBuffer(BufferDesc *buf)
 * BgBufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
- * the higher-order bits of nextVictimBuffer) and the count of recent buffer
- * allocs if non-NULL pointers are passed. The alloc count is reset after
- * being read.
+ * the higher-order bits of nextVictimBuffer).
+ *
+ * This only considers a single clocksweep partition, as BgBufferSync looks
+ * at them one by one.
 */
 int
-StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+StrategySyncStart(int partition, uint32 *complete_passes,
+				  int *first_buffer, int *num_buffers)
 {
 	uint32		nextVictimBuffer;
 	int			result;
+	ClockSweep *sweep = &StrategyControl->sweeps[partition];
 
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
-	result = nextVictimBuffer % NBuffers;
+	Assert((partition >= 0) && (partition < StrategyControl->num_partitions));
+
+	SpinLockAcquire(&sweep->clock_sweep_lock);
+	nextVictimBuffer = pg_atomic_read_u32(&sweep->nextVictimBuffer);
+	result = nextVictimBuffer % sweep->numBuffers;
+
+	*first_buffer = sweep->firstBuffer;
+	*num_buffers = sweep->numBuffers;
 
 	if (complete_passes)
 	{
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = sweep->completePasses;
 
 		/*
 		 * Additionally add the number of wraparounds that happened before
 		 * completePasses could be incremented. C.f. ClockSweepTick().
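+		 *
+		 * XXX Illustrative example (made-up values): if sweep->numBuffers
+		 * is 1024 and nextVictimBuffer reads as 2100 because two backends
+		 * wrapped the counter but have not yet folded the wraparounds into
+		 * completePasses, this accounts for the 2100 / 1024 = 2 missing
+		 * passes.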
 		 */
-		*complete_passes += nextVictimBuffer / NBuffers;
+		*complete_passes += nextVictimBuffer / sweep->numBuffers;
 	}
+	SpinLockRelease(&sweep->clock_sweep_lock);
 
-	if (num_buf_alloc)
-	{
-		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
-	}
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-	return result;
+
+	/* XXX buffer IDs start at 1, but we're calculating 0-based indexes */
+	Assert(BufferIsValid(1 + sweep->firstBuffer + result));
+
+	return sweep->firstBuffer + result;
 }
 
 /*
@@ -696,6 +792,10 @@ StrategyShmemSize(void)
 	size = add_size(size, MAXALIGN(mul_size(sizeof(BufferStrategyFreelist),
 											num_partitions)));
 
+	/* size of clocksweep partitions (at least one per NUMA node) */
+	size = add_size(size, MAXALIGN(mul_size(sizeof(ClockSweep),
+											num_partitions)));
+
 	return size;
 }
 
@@ -714,6 +814,7 @@ StrategyInitialize(bool init)
 	int			num_partitions;
 	int			num_partitions_per_group;
+	char	   *ptr;
 
 	/* */
 	num_partitions = calculate_partition_count(strategy_nnodes);
 
@@ -736,7 +837,8 @@ StrategyInitialize(bool init)
 	StrategyControl = (BufferStrategyControl *)
 		ShmemInitStruct("Buffer Strategy Status",
 						MAXALIGN(offsetof(BufferStrategyControl, freelists)) +
-						MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions),
+						MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions) +
+						MAXALIGN(sizeof(ClockSweep) * num_partitions),
						&found);
 
 	if (!found)
@@ -758,12 +860,32 @@ StrategyInitialize(bool init)
 
 		SpinLockInit(&StrategyControl->buffer_strategy_lock);
 
-		/* Initialize the clock sweep pointer */
-		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
+		/* point the sweeps array right past the end of the freelists */
+		ptr = (char *) StrategyControl +
+			MAXALIGN(offsetof(BufferStrategyControl, freelists)) +
+			MAXALIGN(sizeof(BufferStrategyFreelist) * num_partitions);
+		StrategyControl->sweeps = (ClockSweep *) ptr;
 
-		/* Clear statistics */
-		StrategyControl->completePasses = 0;
-		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+		/* Initialize the clock sweep pointers (for all partitions) */
+		for (int i = 0; i < num_partitions; i++)
+		{
+			SpinLockInit(&StrategyControl->sweeps[i].clock_sweep_lock);
+
+			pg_atomic_init_u32(&StrategyControl->sweeps[i].nextVictimBuffer, 0);
+
+			/*
+			 * FIXME This may not be quite right: if NBuffers is not a
+			 * perfect multiple of numBuffers, the last partition will have
+			 * numBuffers set too high. buf_init handles this by tracking
+			 * the remaining number of buffers, and not overflowing.
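+			 *
+			 * XXX Example of the issue (made-up numbers, and assuming
+			 * numBuffers is computed by rounding NBuffers / num_partitions
+			 * up): NBuffers = 1000 split into 3 partitions gives numBuffers
+			 * = 334, so the last partition would claim indexes 668..1001
+			 * even though valid 0-based indexes end at 999.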
+ */ + StrategyControl->sweeps[i].numBuffers = numBuffers; + StrategyControl->sweeps[i].firstBuffer = (numBuffers * i); + + /* Clear statistics */ + StrategyControl->sweeps[i].completePasses = 0; + pg_atomic_init_u32(&StrategyControl->sweeps[i].numBufferAllocs, 0); + } /* No pending notification */ StrategyControl->bgwprocno = -1; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f7..b50f9458156 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -448,7 +448,9 @@ extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); -extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc); +extern void StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc); +extern int StrategySyncStart(int partition, uint32 *complete_passes, + int *first_buffer, int *num_buffers); extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); -- 2.49.0