From 6d807a3c85b77d7daf7b8345ab988b6f44fd4c87 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 11 Mar 2024 11:44:41 +1300 Subject: [PATCH v3 3/3] Improve bulk_write.c memory management. Instead of allocating buffers one at a time with palloc(), allocate an array full of them up front, and then manage them in a FIFO freelist. Aside from avoiding allocator overheads, this means that callers who write sequential blocks will tend to fill up sequential memory, which hopefully generates more efficient vectored writes. Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com --- src/backend/storage/smgr/bulk_write.c | 62 +++++++++++++++++++-------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index 2a566a80f60..d9c38ffa88b 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -36,6 +36,7 @@ #include "access/xloginsert.h" #include "access/xlogrecord.h" +#include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/bulk_write.h" @@ -45,9 +46,15 @@ #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID +typedef union BufferSlot +{ + PGIOAlignedBlock buffer; + dlist_node freelist_node; +} BufferSlot; + typedef struct PendingWrite { - BulkWriteBuffer buf; + BufferSlot *slot; BlockNumber blkno; bool page_std; } PendingWrite; @@ -57,6 +64,10 @@ typedef struct PendingWrite */ struct BulkWriteState { + /* Comes first so we can align it correctly. 
*/ + BufferSlot buffer_slots[MAX_PENDING_WRITES + 2]; + dlist_head buffer_slots_freelist; + /* Information about the target relation we're writing */ SMgrRelation smgr; ForkNumber forknum; @@ -71,8 +82,6 @@ struct BulkWriteState /* The RedoRecPtr at the time that the bulk operation started */ XLogRecPtr start_RedoRecPtr; - - MemoryContext memcxt; }; static void smgr_bulk_flush(BulkWriteState *bulkstate); @@ -98,7 +107,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) { BulkWriteState *state; - state = palloc(sizeof(BulkWriteState)); + state = palloc_aligned(sizeof(BulkWriteState), PG_IO_ALIGN_SIZE, 0); state->smgr = smgr; state->forknum = forknum; state->use_wal = use_wal; @@ -108,11 +117,11 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) state->start_RedoRecPtr = GetRedoRecPtr(); - /* - * Remember the memory context. We will use it to allocate all the - * buffers later. - */ - state->memcxt = CurrentMemoryContext; + /* Set up the free-list of buffers. */ + dlist_init(&state->buffer_slots_freelist); + for (int i = 0; i < lengthof(state->buffer_slots); ++i) + dlist_push_tail(&state->buffer_slots_freelist, + &state->buffer_slots[i].freelist_node); return state; } @@ -206,7 +215,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) for (int i = 0; i < npending; i++) { blknos[i] = pending_writes[i].blkno; - pages[i] = pending_writes[i].buf->data; + pages[i] = pending_writes[i].slot->buffer.data; /* * If any of the pages use !page_std, we log them all as such. @@ -231,7 +240,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) /* Prepare to write the first block. 
*/ blkno = pending_writes[i].blkno; - page = pending_writes[i].buf->data; + page = pending_writes[i].slot->buffer.data; PageSetChecksumInplace(page, blkno); pages[0] = page; nblocks = 1; @@ -268,7 +277,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) pending_writes[i + 1].blkno == blkno + nblocks && nblocks < max_nblocks) { - page = pending_writes[++i].buf->data; + page = pending_writes[++i].slot->buffer.data; PageSetChecksumInplace(page, pending_writes[i].blkno); pages[nblocks++] = page; } @@ -285,8 +294,14 @@ smgr_bulk_flush(BulkWriteState *bulkstate) smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno, pages, nblocks, true); } - for (int j = 0; j < nblocks; ++j) - pfree(pending_writes[i - j].buf->data); + /* + * Maintain FIFO ordering in the free list, so that users who write + * blocks in sequential order tend to get sequential chunks of buffer + * memory, which may be slightly more efficient for vectored writes. + */ + for (int j = i - nblocks + 1; j <= i; ++j) + dlist_push_tail(&bulkstate->buffer_slots_freelist, + &pending_writes[j].slot->freelist_node); } bulkstate->npending = 0; @@ -306,7 +321,7 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer PendingWrite *w; w = &bulkstate->pending_writes[bulkstate->npending++]; - w->buf = buf; + w->slot = (BufferSlot *) buf; w->blkno = blocknum; w->page_std = page_std; @@ -320,12 +335,21 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer * There is no function to free the buffer. When you pass it to * smgr_bulk_write(), it takes ownership and frees it when it's no longer * needed. - * - * This is currently implemented as a simple palloc, but could be implemented - using a ring buffer or larger chunks in the future, so don't rely on it. 
*/ BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate) { - return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0); + BufferSlot *slot; + + if (dlist_is_empty(&bulkstate->buffer_slots_freelist)) + { + smgr_bulk_flush(bulkstate); + if (dlist_is_empty(&bulkstate->buffer_slots_freelist)) + elog(ERROR, "too many bulk write buffers used but not yet written"); + } + + slot = dlist_head_element(BufferSlot, freelist_node, &bulkstate->buffer_slots_freelist); + dlist_pop_head_node(&bulkstate->buffer_slots_freelist); + + return &slot->buffer; } -- 2.39.2