From 6d807a3c85b77d7daf7b8345ab988b6f44fd4c87 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 11 Mar 2024 11:44:41 +1300 Subject: [PATCH v3 3/3] Improve bulk_write.c memory management. Instead of allocating buffers one at a time with palloc(), allocate an array full of them up front, and then manage them in a FIFO freelist. Aside from avoiding allocator overheads, this means that callers who write sequential blocks will tend to fill up sequential memory, which hopefully generates more efficient vectored writes. Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com --- src/backend/storage/smgr/bulk_write.c | 62 +++++++++++++++++++-------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index 2a566a80f60..d9c38ffa88b 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -36,6 +36,7 @@ #include "access/xloginsert.h" #include "access/xlogrecord.h" +#include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/bulk_write.h" @@ -45,9 +46,15 @@ #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID +typedef union BufferSlot +{ + PGIOAlignedBlock buffer; + dlist_node freelist_node; +} BufferSlot; + typedef struct PendingWrite { - BulkWriteBuffer buf; + BufferSlot *slot; BlockNumber blkno; bool page_std; } PendingWrite; @@ -57,6 +64,10 @@ typedef struct PendingWrite */ struct BulkWriteState { + /* Comes first so we can align it correctly. 
*/ + BufferSlot buffer_slots[MAX_PENDING_WRITES + 2]; + dlist_head buffer_slots_freelist; + /* Information about the target relation we're writing */ SMgrRelation smgr; ForkNumber forknum; @@ -71,8 +82,6 @@ struct BulkWriteState /* The RedoRecPtr at the time that the bulk operation started */ XLogRecPtr start_RedoRecPtr; - - MemoryContext memcxt; }; static void smgr_bulk_flush(BulkWriteState *bulkstate); @@ -98,7 +107,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) { BulkWriteState *state; - state = palloc(sizeof(BulkWriteState)); + state = palloc_aligned(sizeof(BulkWriteState), PG_IO_ALIGN_SIZE, 0); state->smgr = smgr; state->forknum = forknum; state->use_wal = use_wal; @@ -108,11 +117,11 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) state->start_RedoRecPtr = GetRedoRecPtr(); - /* - * Remember the memory context. We will use it to allocate all the - * buffers later. - */ - state->memcxt = CurrentMemoryContext; + /* Set up the free-list of buffers. */ + dlist_init(&state->buffer_slots_freelist); + for (int i = 0; i < lengthof(state->buffer_slots); ++i) + dlist_push_tail(&state->buffer_slots_freelist, + &state->buffer_slots[i].freelist_node); return state; } @@ -206,7 +215,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) for (int i = 0; i < npending; i++) { blknos[i] = pending_writes[i].blkno; - pages[i] = pending_writes[i].buf->data; + pages[i] = pending_writes[i].slot->buffer.data; /* * If any of the pages use !page_std, we log them all as such. @@ -231,7 +240,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) /* Prepare to write the first block. 
*/ blkno = pending_writes[i].blkno; - page = pending_writes[i].buf->data; + page = pending_writes[i].slot->buffer.data; PageSetChecksumInplace(page, blkno); pages[0] = page; nblocks = 1; @@ -268,7 +277,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) pending_writes[i + 1].blkno == blkno + nblocks && nblocks < max_nblocks) { - page = pending_writes[++i].buf->data; + page = pending_writes[++i].slot->buffer.data; PageSetChecksumInplace(page, pending_writes[i].blkno); pages[nblocks++] = page; } @@ -285,8 +294,14 @@ smgr_bulk_flush(BulkWriteState *bulkstate) smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno, pages, nblocks, true); } - for (int j = 0; j < nblocks; ++j) - pfree(pending_writes[i - j].buf->data); + /* + * Maintain FIFO ordering in the free list, so that users who write + * blocks in sequential order tend to get sequential chunks of buffer + * memory, which may be slightly more efficient for vectored writes. + */ + for (int j = i - nblocks + 1; j <= i; ++j) + dlist_push_tail(&bulkstate->buffer_slots_freelist, + &pending_writes[j].slot->freelist_node); } bulkstate->npending = 0; @@ -306,7 +321,7 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer PendingWrite *w; w = &bulkstate->pending_writes[bulkstate->npending++]; - w->buf = buf; + w->slot = (BufferSlot *) buf; w->blkno = blocknum; w->page_std = page_std; @@ -320,12 +335,21 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer * There is no function to free the buffer. When you pass it to * smgr_bulk_write(), it takes ownership and frees it when it's no longer * needed. - * - * This is currently implemented as a simple palloc, but could be implemented - using a ring buffer or larger chunks in the future, so don't rely on it. 
*/ BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate) { - return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0); + BufferSlot *slot; + + if (dlist_is_empty(&bulkstate->buffer_slots_freelist)) + { + smgr_bulk_flush(bulkstate); + if (dlist_is_empty(&bulkstate->buffer_slots_freelist)) + elog(ERROR, "too many bulk write buffers used but not yet written"); + } + + slot = dlist_head_element(BufferSlot, freelist_node, &bulkstate->buffer_slots_freelist); + dlist_pop_head_node(&bulkstate->buffer_slots_freelist); + + return &slot->buffer; } -- 2.39.2