From b3317ab787b6674ac411f67c1bcb4a23fbcffd4f Mon Sep 17 00:00:00 2001
From: Andrey Borodin
Date: Wed, 20 Aug 2025 23:24:36 +0500
Subject: [PATCH v2 1/2] btree: Add page merge during vacuum to reduce index
 bloat

Implement automatic merging of nearly-empty B-tree leaf pages during
vacuum.  When the occupied space on a leaf page falls below the
mergefactor threshold (default 5%, i.e. the page is at least 95%
empty), move the remaining tuples to the right sibling and delete the
source page, reducing index bloat.

Changes:
- Add mergefactor reloption (0-50%, default 5%) for configurable merge
  threshold
- Detect mergefactor-empty pages in btvacuumpage() for merge
  consideration
- Add merge logic in _bt_unlink_halfdead_page() with tuple movement
- Perform feasibility checks for right sibling space and level
- Handle WAL replay by reading tuples from target page before reinit
- Skip merge when vacuum deletions are pending to avoid WAL conflicts
- Update assertions to allow non-empty pages for merge scenarios

The implementation maintains crash safety through existing critical
sections and WAL logging, preserves B-tree sort order, and coordinates
with vacuum operations to prevent offset conflicts during standby
replay.
Usage: CREATE INDEX ON table (col) WITH (mergefactor=10); --- src/backend/access/common/reloptions.c | 10 + src/backend/access/nbtree/nbtpage.c | 275 +++++++++++++++++- src/backend/access/nbtree/nbtree.c | 28 +- src/backend/access/nbtree/nbtsearch.c | 19 ++ src/backend/access/nbtree/nbtutils.c | 3 +- src/backend/access/nbtree/nbtxlog.c | 82 +++++- src/include/access/nbtree.h | 8 + src/include/access/nbtxlog.h | 4 +- .../t/008_btree_merge_scan_correctness.pl | 137 +++++++++ 9 files changed, 544 insertions(+), 22 deletions(-) create mode 100644 src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 0af3fea68fa..b16fd32dc9b 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -192,6 +192,16 @@ static relopt_int intRelOpts[] = }, BTREE_DEFAULT_FILLFACTOR, BTREE_MIN_FILLFACTOR, 100 }, + { + { + "mergefactor", + "Minimum percentage of free space to trigger btree page merge during vacuum", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock /* since it applies only to vacuum + * operations */ + }, + BTREE_DEFAULT_MERGEFACTOR, 0, 50 + }, { { "fillfactor", diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c79dd38ee18..792bc52558b 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1805,6 +1805,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) bool rightsib_empty; Page page; BTPageOpaque opaque; + OffsetNumber maxoff; + OffsetNumber minoff; + bool page_has_tuples; /* * Save original leafbuf block number from caller. Only deleted blocks @@ -1876,8 +1879,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) /* * We can never delete rightmost pages nor root pages. While at it, - * check that page is empty, since it's possible that the leafbuf page - * was empty a moment ago, but has since had some inserts. 
+ * check that page is empty or nearly empty, since it's + * possible that the leafbuf page was empty a moment ago, but has since + * had some inserts. + * + * For pages that have tuples, attempt to move them to the right sibling + * if there's enough space. This enables merging of nearly-empty pages. * * To keep the algorithm simple, we also never delete an incompletely * split page (they should be rare enough that this doesn't make any @@ -1893,9 +1900,11 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) * we know we stepped right from a page that passed these tests, so * it's OK. */ - if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || - P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || - P_INCOMPLETE_SPLIT(opaque)) + maxoff = PageGetMaxOffsetNumber(page); + minoff = P_FIRSTDATAKEY(opaque); + page_has_tuples = (minoff <= maxoff); + + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_INCOMPLETE_SPLIT(opaque)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); @@ -1904,6 +1913,88 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) return; } + /* + * If page has tuples, check if they can be merged with right sibling. + * Actual merge will be done later in _bt_unlink_halfdead_page() within + * the same WAL record as the page deletion for atomicity. 
+ */ + if (page_has_tuples) + { + BlockNumber right_sibling = opaque->btpo_next; + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + Size space_needed = 0; + OffsetNumber i; + bool merge_feasible = false; + + /* Calculate total space needed for all tuples */ + for (i = minoff; i <= maxoff; i++) + { + ItemId itemid = PageGetItemId(page, i); + space_needed += ItemIdGetLength(itemid) + sizeof(ItemIdData); + } + + /* Check if merge with right sibling is feasible */ + if (right_sibling != P_NONE) + { + rbuf = _bt_getbuf(rel, right_sibling, BT_READ); + rpage = BufferGetPage(rbuf); + ropaque = BTPageGetOpaque(rpage); + + /* Verify right sibling is a leaf page at same level */ + if (P_ISLEAF(ropaque) && ropaque->btpo_level == opaque->btpo_level && + !P_ISDELETED(ropaque) && !P_ISHALFDEAD(ropaque)) + { + Size freespace = PageGetFreeSpace(rpage); + + if (freespace >= space_needed) + { + merge_feasible = true; + elog(DEBUG1, "Page merge feasible for index \"%s\": can move %d tuples from page %u to page %u (need %zu bytes, have %zu bytes free)", + RelationGetRelationName(rel), (int)(maxoff - minoff + 1), + BufferGetBlockNumber(leafbuf), right_sibling, space_needed, freespace); + } + else + { + elog(DEBUG1, "Page merge not feasible for index \"%s\": right sibling page %u has insufficient space (need %zu bytes, have %zu bytes free)", + RelationGetRelationName(rel), right_sibling, space_needed, freespace); + } + } + else + { + elog(DEBUG1, "Page merge not feasible for index \"%s\": right sibling page %u is not suitable (is_leaf=%d, deleted=%d, halfdead=%d, level=%d vs %d)", + RelationGetRelationName(rel), right_sibling, P_ISLEAF(ropaque), + P_ISDELETED(ropaque), P_ISHALFDEAD(ropaque), ropaque->btpo_level, opaque->btpo_level); + } + + _bt_relbuf(rel, rbuf); + } + else + { + elog(DEBUG1, "Page merge not feasible for index \"%s\": page %u has no right sibling", + RelationGetRelationName(rel), BufferGetBlockNumber(leafbuf)); + } + + /* If merge is not feasible, abort the page 
deletion */ + if (!merge_feasible) + { + _bt_relbuf(rel, leafbuf); + return; + } + + /* + * Merge is feasible. Mark page as candidates for merge and proceed + * with deletion. The actual merge will happen in _bt_unlink_halfdead_page(). + */ + } + + /* + * Page might not be empty if we're doing a merge, but we've validated + * that merge is feasible. Proceed with deletion - the actual merge will + * happen in _bt_unlink_halfdead_page(). + */ + /* * First, remove downlink pointing to the page (or a parent of the * page, if we are going to delete a taller subtree), and mark the @@ -2104,9 +2195,21 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, page = BufferGetPage(leafbuf); opaque = BTPageGetOpaque(page); + /* + * Assert page is suitable for deletion. Page must be a leaf, not root, + * not rightmost, and not ignored. For traditional deletion, page must be + * empty. For merge scenarios (nearly empty pages), page may have tuples. + */ Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && - P_ISLEAF(opaque) && !P_IGNORE(opaque) && - P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + P_ISLEAF(opaque) && !P_IGNORE(opaque)); + + /* + * Page should be either empty (traditional deletion) or nearly empty + * with tuples that can be merged (new merge feature). + */ + Assert(P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page) || + (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) && + PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100)); Assert(heaprel != NULL); /* @@ -2231,7 +2334,13 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, opaque = BTPageGetOpaque(page); opaque->btpo_flags |= BTP_HALF_DEAD; - Assert(PageGetMaxOffsetNumber(page) == P_HIKEY); + /* + * Page should only have high key in traditional deletion scenario. + * In merge scenarios, page may have data tuples that will be moved later. 
+ */ + Assert(PageGetMaxOffsetNumber(page) == P_HIKEY || + (PageGetMaxOffsetNumber(page) > P_HIKEY && + PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100)); MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (topparent != leafblkno) @@ -2333,9 +2442,19 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, FullTransactionId safexid; bool rightsib_is_rightmost; uint32 targetlevel; + IndexTuple tuple; + Size tupsz; + int idx; IndexTuple leafhikey; BlockNumber leaftopparent; + /* Variables for page merge functionality */ + IndexTuple *merge_tuples = NULL; + Size *merge_tupsz = NULL; + OffsetNumber *merge_deletable = NULL; + int merge_ntuples = 0; + bool do_merge = false; + page = BufferGetPage(leafbuf); opaque = BTPageGetOpaque(page); @@ -2480,11 +2599,24 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target == leafblkno) { - if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || - !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) + /* + * Validate leaf page status. Page must be leaf and half-dead. + * For traditional deletion, page must be empty. For merge scenarios, + * page may have data tuples that will be moved to right sibling. 
+ */ + if (!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); + /* Check if page is empty (traditional deletion) or suitable for merge */ + if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) + { + /* Page has tuples - validate it's suitable for merge */ + if (PageGetFreeSpace(page) < ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100) + elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"", + target, RelationGetRelationName(rel)); + } + /* Leaf page is also target page: don't set leaftopparent */ leaftopparent = InvalidBlockNumber; } @@ -2591,6 +2723,71 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * Here we begin doing the deletion. */ + /* + * Prepare merge data before critical section if needed. + * We'll use local variables and perform the merge within the critical section. + */ + + if (target == leafblkno) + { + Page leafpage = BufferGetPage(leafbuf); + BTPageOpaque leafopaque = BTPageGetOpaque(leafpage); + OffsetNumber minoff = P_FIRSTDATAKEY(leafopaque); + OffsetNumber maxoff = PageGetMaxOffsetNumber(leafpage); + + /* Check if leaf page has tuples that need to be merged */ + if (minoff <= maxoff && P_ISLEAF(leafopaque) && P_ISHALFDEAD(leafopaque)) + { + Page rightpage = BufferGetPage(rbuf); + BTPageOpaque rightopaque = BTPageGetOpaque(rightpage); + Size space_needed = 0; + OffsetNumber i; + + merge_ntuples = maxoff - minoff + 1; + + /* Calculate total space needed for all tuples */ + for (i = minoff; i <= maxoff; i++) + { + itemid = PageGetItemId(leafpage, i); + space_needed += ItemIdGetLength(itemid) + sizeof(ItemIdData); + } + + /* Verify that right sibling has enough space (should have been checked earlier) */ + if (P_ISLEAF(rightopaque) && rightopaque->btpo_level == leafopaque->btpo_level && + !P_ISDELETED(rightopaque) && !P_ISHALFDEAD(rightopaque) 
&& + PageGetFreeSpace(rightpage) >= space_needed) + { + elog(DEBUG1, "Performing page merge in index \"%s\": moving %d tuples from page %u to page %u", + RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib); + + /* Pre-allocate and copy all tuples outside critical section */ + merge_tuples = palloc(sizeof(IndexTuple) * merge_ntuples); + merge_tupsz = palloc(sizeof(Size) * merge_ntuples); + merge_deletable = palloc(sizeof(OffsetNumber) * merge_ntuples); + + for (i = minoff; i <= maxoff; i++) + { + itemid = PageGetItemId(leafpage, i); + tuple = (IndexTuple) PageGetItem(leafpage, itemid); + tupsz = ItemIdGetLength(itemid); + idx = i - minoff; + + merge_tuples[idx] = palloc(tupsz); + memcpy(merge_tuples[idx], tuple, tupsz); + merge_tupsz[idx] = tupsz; + merge_deletable[idx] = i; + } + + do_merge = true; + } + else + { + elog(DEBUG1, "Cannot perform page merge in index \"%s\": right sibling conditions not met", + RelationGetRelationName(rel)); + } + } + } + /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -2611,6 +2808,38 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, Assert(opaque->btpo_prev == target); opaque->btpo_prev = leftsib; + /* + * Perform page merge if needed. Move tuples from the leaf page to the right + * sibling before marking the leaf page as deleted. This must be done within + * the critical section to ensure atomicity with the page deletion. 
+ */ + if (do_merge) + { + Page leafpage = BufferGetPage(leafbuf); + Page rightpage = BufferGetPage(rbuf); + BTPageOpaque rightopaque = BTPageGetOpaque(rightpage); + OffsetNumber insert_at = P_FIRSTDATAKEY(rightopaque); + int i; + + /* Move all saved tuples to right sibling */ + for (i = 0; i < merge_ntuples; i++) + { + if (PageAddItem(rightpage, (Item) merge_tuples[i], merge_tupsz[i], + insert_at, false, false) == InvalidOffsetNumber) + { + elog(PANIC, "failed to move tuple during page merge in index \"%s\" - space should have been checked", + RelationGetRelationName(rel)); + } + insert_at++; + } + + /* Clear all tuples from the leaf page using pre-allocated array */ + PageIndexMultiDelete(leafpage, merge_deletable, merge_ntuples); + + elog(DEBUG1, "Page merge completed in index \"%s\": moved %d tuples from page %u to page %u", + RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib); + } + /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the @@ -2676,10 +2905,15 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + /* Always use REGBUF_STANDARD for target page to allow reading during replay */ + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); if (BufferIsValid(lbuf)) XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); - XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + /* Always use REGBUF_FORCE_IMAGE for right sibling if we're merging tuples */ + if (merge_ntuples > 0) + XLogRegisterBuffer(2, rbuf, REGBUF_FORCE_IMAGE); + else + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); if (target != leafblkno) XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); @@ -2694,8 +2928,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, xlrec.leafrightsib = leafrightsib; xlrec.leaftopparent = leaftopparent; + /* Note: merge information is inferred during WAL replay by 
checking target page content */ + XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage); + /* Note: merged tuple data is read directly from target page during WAL replay */ + if (BufferIsValid(metabuf)) { XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); @@ -2739,6 +2977,19 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, END_CRIT_SECTION(); + /* Clean up merge-related memory allocations */ + if (do_merge && merge_tuples != NULL) + { + for (int i = 0; i < merge_ntuples; i++) + { + if (merge_tuples[i] != NULL) + pfree(merge_tuples[i]); + } + pfree(merge_tuples); + pfree(merge_tupsz); + pfree(merge_deletable); + } + /* release metapage */ if (BufferIsValid(metabuf)) _bt_relbuf(rel, metabuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index fdff960c130..8294336ba7a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1620,6 +1620,9 @@ backtrack: * since that would require teaching _bt_pagedel() about backtracking * (doesn't seem worth adding more complexity to deal with that). * + * We also attempt page deletion if the page is mostly empty (by bytes). + * This enables merging of nearly-empty pages to reduce bloat. + * * We don't count the number of live TIDs during cleanup-only calls to * btvacuumscan (i.e. when callback is not set). We count the number * of index tuples directly instead. This avoids the expense of @@ -1630,12 +1633,33 @@ backtrack: */ if (minoff > maxoff) attempt_pagedel = (blkno == scanblkno); - else if (callback) + else if (blkno == scanblkno) + { + /* Check if page meets merge threshold by space usage */ + Size freespace = PageGetFreeSpace(page); + Size pagesize = BLCKSZ - SizeOfPageHeaderData; + int mergefactor = BTGetMergeFactor(rel); + + /* + * Attempt page merge if page meets merge threshold by space usage. 
+ */ + if (freespace >= (pagesize * mergefactor) / 100) + attempt_pagedel = true; + } + + + if (callback) stats->num_index_tuples += nhtidslive; else stats->num_index_tuples += maxoff - minoff + 1; - Assert(!attempt_pagedel || nhtidslive == 0); + /* + * Assert that either we're not attempting page deletion, or if we are, + * it's either because the page is empty (nhtidslive == 0) or because + * we're attempting a merge of a mostly empty page with tuples. + */ + Assert(!attempt_pagedel || nhtidslive == 0 || + (blkno == scanblkno && PageGetFreeSpace(page) >= ((BLCKSZ - SizeOfPageHeaderData) * BTGetMergeFactor(rel)) / 100)); } if (attempt_pagedel) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d69798795b4..6ff31f87033 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -21,6 +21,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/predicate.h" +#include "utils/injection_point.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -2174,6 +2175,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (so->numKilled > 0) _bt_killitems(scan); + + /* * Before we modify currPos, make a copy of the page data if there was a * mark position that needs it. @@ -2222,6 +2225,22 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) BTScanPosUnpinIfPinned(so->currPos); + /* + * Injection point for testing scan correctness during page merge. + * This allows tests to pause scans between pages to simulate concurrent + * page operations. Only pause scans on the specific test index. + * We pause here after unpinning the current buffer to avoid blocking VACUUM. 
+ */ + { + Relation rel = scan->indexRelation; + BlockNumber blkno = so->currPos.currPage; + /* Only pause scans on our test index (btree_test_idx has OID around 16400+) */ + if (rel && RelationGetRelid(rel) > 16384 && blkno == 20) + { + INJECTION_POINT("btree-scan-between-pages", &blkno); + } + } + /* Walk to the next page with data */ if (ScanDirectionIsForward(dir)) blkno = so->currPos.nextPage; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 9aed207995f..30b986f35be 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3649,7 +3649,8 @@ btoptions(Datum reloptions, bool validate) {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, {"deduplicate_items", RELOPT_TYPE_BOOL, - offsetof(BTOptions, deduplicate_items)} + offsetof(BTOptions, deduplicate_items)}, + {"mergefactor", RELOPT_TYPE_INT, offsetof(BTOptions, mergefactor)} }; return (bytea *) build_reloptions(reloptions, validate, diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index d31dd56732d..7ee05d122f8 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -813,6 +813,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) Buffer rightbuf; Page page; BTPageOpaque pageop; + IndexTuple *merge_tuples; + uint16 merge_ntuples; leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; @@ -823,6 +825,10 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* No leaftopparent for level 0 (leaf page) or level 1 target */ Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1); + /* Initialize merge variables */ + merge_tuples = NULL; + merge_ntuples = 0; + /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. 
In WAL replay, we at least lock @@ -847,12 +853,57 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) else leftbuf = InvalidBuffer; - /* Rewrite target page as empty deleted page */ - target = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(target); + /* + * Handle target page. For page merges, we first read existing content + * to extract tuples, then reinitialize. For simple deletions, we can + * initialize directly. + */ + if (XLogReadBufferForRedo(record, 0, &target) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(target); + pageop = BTPageGetOpaque(page); - _bt_pageinit(page, BufferGetPageSize(target)); - pageop = BTPageGetOpaque(page); + /* Check if this is a leaf page merge case */ + if (isleaf && rightsib != P_NONE && !P_RIGHTMOST(pageop)) + { + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber minoff = P_FIRSTDATAKEY(pageop); + + /* If page has tuples, save them for merging */ + if (minoff <= maxoff) + { + OffsetNumber offnum; + uint16 i = 0; + + merge_ntuples = maxoff - minoff + 1; + merge_tuples = (IndexTuple *) palloc(merge_ntuples * sizeof(IndexTuple)); + + /* Copy all tuples from the page */ + for (offnum = minoff; offnum <= maxoff; offnum++) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple tuple = (IndexTuple) PageGetItem(page, itemid); + Size tupsz = IndexTupleSize(tuple); + + merge_tuples[i] = (IndexTuple) palloc(tupsz); + memcpy(merge_tuples[i], tuple, tupsz); + i++; + } + } + } + + /* Now reinitialize target page as empty deleted page */ + _bt_pageinit(page, BufferGetPageSize(target)); + pageop = BTPageGetOpaque(page); + } + else + { + /* Page doesn't need redo, but we still need to get the buffer */ + target = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(target); + _bt_pageinit(page, BufferGetPageSize(target)); + pageop = BTPageGetOpaque(page); + } pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; @@ -865,17 +916,36 @@ btree_xlog_unlink_page(uint8 info, 
XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(target); - /* Fix left-link of right sibling */ + /* Fix left-link of right sibling and replay merged tuples if any */ if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(rightbuf); pageop = BTPageGetOpaque(page); pageop->btpo_prev = leftsib; + /* + * Note: If tuples were merged during the original operation, the right + * sibling buffer was registered with REGBUF_FORCE_IMAGE, so the page + * is automatically restored to its final post-merge state. No explicit + * tuple insertion is needed during replay. + * + * For non-merge operations, we only need to update the left-link. + */ + PageSetLSN(page, lsn); MarkBufferDirty(rightbuf); } + /* Clean up merge tuples */ + if (merge_tuples != NULL) + { + uint16 i; + + for (i = 0; i < merge_ntuples; i++) + pfree(merge_tuples[i]); + pfree(merge_tuples); + } + /* Release siblings */ if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e709d2e0afe..6fd1bcd142d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -201,6 +201,7 @@ typedef struct BTMetaPageData #define BTREE_DEFAULT_FILLFACTOR 90 #define BTREE_NONLEAF_FILLFACTOR 70 #define BTREE_SINGLEVAL_FILLFACTOR 96 +#define BTREE_DEFAULT_MERGEFACTOR 5 /* * In general, the btree code tries to localize its knowledge about @@ -1153,6 +1154,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ float8 vacuum_cleanup_index_scale_factor; /* deprecated */ bool deduplicate_items; /* Try to deduplicate items? */ + int mergefactor; /* page merge factor in percent (0..100) */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -1168,6 +1170,12 @@ typedef struct BTOptions relation->rd_rel->relam == BTREE_AM_OID), \ ((relation)->rd_options ? 
\ ((BTOptions *) (relation)->rd_options)->deduplicate_items : true)) +#define BTGetMergeFactor(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + (relation)->rd_options ? \ + ((BTOptions *) (relation)->rd_options)->mergefactor : \ + BTREE_DEFAULT_MERGEFACTOR) /* * Constant definition for progress reporting. Phase numbers must match diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index fbbe115c771..5ba5eb69b1d 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -325,7 +325,9 @@ typedef struct xl_btree_unlink_page BlockNumber leafrightsib; BlockNumber leaftopparent; /* next child down in the subtree */ - /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ + /* + * xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META + */ } xl_btree_unlink_page; #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber)) diff --git a/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl new file mode 100644 index 00000000000..abb40f6a4ff --- /dev/null +++ b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl @@ -0,0 +1,137 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +# This test verifies scan correctness during B-tree page merge operations. +# It demonstrates the race condition where moving tuples between pages during +# merge can cause forward scans to see duplicates and backward scans to miss tuples. 
+ +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Check if injection points are available +if (!defined($ENV{enable_injection_points}) || $ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('btree_merge_scan_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'shared_preload_libraries = \'injection_points\''); +$node->append_conf('postgresql.conf', 'autovacuum = off'); +$node->start; + +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points'); + +# Create test table and index with conditions that should trigger merge +$node->safe_psql('postgres', q{ + CREATE TABLE btree_test ( + id INTEGER, + data TEXT + ); + + -- Insert data to create multiple pages (more data for multi-page index) + INSERT INTO btree_test + SELECT i, 'data_' || i + FROM generate_series(1, 5000) i; + + -- Create index with low fillfactor and high mergefactor to encourage merging + CREATE INDEX btree_test_idx ON btree_test (id) + WITH (fillfactor = 30, mergefactor = 50); +}); + +# Check index OID for debugging our injection point +my $index_oid = $node->safe_psql('postgres', q{ + SELECT oid FROM pg_class WHERE relname = 'btree_test_idx'; +}); +note("Index OID: $index_oid (injection point triggers for OID > 16384)"); + +# Make index sparse to create merge candidates +$node->safe_psql('postgres', q{ + -- Delete 95% of rows to make pages very sparse but with enough remaining data + DELETE FROM btree_test WHERE id % 20 != 0; +}); + +# Force index usage by disabling seqscan +$node->safe_psql('postgres', q{ + SET enable_seqscan = off; + SET enable_bitmapscan = off; +}); + +# Create result tables +$node->safe_psql('postgres', q{ + CREATE TABLE forward_results (id INTEGER); + CREATE TABLE backward_results (id INTEGER); +}); + +# Set up injection point to pause scans between pages 
+$node->safe_psql('postgres', q{ + SELECT injection_points_attach('btree-scan-between-pages', 'wait'); +}); + +# Launch background scans that will hit injection points +my $forward_scan = $node->background_psql('postgres'); +my $backward_scan = $node->background_psql('postgres'); + +# Start queries without waiting for completion (they'll hang at injection point) +$forward_scan->query_until(qr/starting forward scan/, q{ + SET enable_seqscan = off; + SET enable_bitmapscan = off; + \echo starting forward scan + INSERT INTO forward_results (id) + SELECT id FROM btree_test ORDER BY id; + \echo forward scan completed +}); + +$backward_scan->query_until(qr/starting backward scan/, q{ + SET enable_seqscan = off; + SET enable_bitmapscan = off; + \echo starting backward scan + INSERT INTO backward_results (id) + SELECT id FROM btree_test ORDER BY id DESC; + \echo backward scan completed +}); + +# Give scans time to start and pause at injection point +sleep(1); + +# Run VACUUM while scans are paused - this may trigger page merge +$node->safe_psql('postgres', q{ + VACUUM btree_test; +}); + +# Release waiting scans +$node->safe_psql('postgres', q{ + SELECT injection_points_detach('btree-scan-between-pages'); + SELECT injection_points_wakeup('btree-scan-between-pages'); + SELECT injection_points_wakeup('btree-scan-between-pages'); +}); + +$forward_scan->quit; +$backward_scan->quit; + +# Analyze results for scan correctness issues +my $expected_count = $node->safe_psql('postgres', + 'SELECT count(*) FROM btree_test'); + +my $forward_count = $node->safe_psql('postgres', + 'SELECT count(*) FROM forward_results'); + +my $backward_count = $node->safe_psql('postgres', + 'SELECT count(*) FROM backward_results'); + +# Report results +note("Expected rows: $expected_count"); +note("Forward scan rows: $forward_count"); +note("Backward scan rows: $backward_count"); + +# Test assertions - these should fail with page merge, showing the race condition +is($forward_count, $expected_count, 
'Forward scan returns correct count'); +is($backward_count, $expected_count, 'Backward scan returns correct count'); + +$node->stop('fast'); + +done_testing(); \ No newline at end of file -- 2.39.5 (Apple Git-154)