From 435f9b4d82e70665603a7c0eb19c9065010eb5d0 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Sun, 7 Feb 2021 19:24:03 -0800
Subject: [PATCH v5 2/3] Recycle pages deleted during same VACUUM.

Author: Peter Geoghegan
Discussion: https://postgr.es/m/CAH2-Wzk76_P=67iUscb1UN44-gyZL-KgpsXbSxq_bdcMa7Q+wQ@mail.gmail.com
---
 src/include/access/nbtree.h         |  38 +++++++++-
 src/backend/access/nbtree/README    |  31 ++++++++
 src/backend/access/nbtree/nbtpage.c |  90 ++++++++++++++------
 src/backend/access/nbtree/nbtree.c  | 111 ++++++++++++++++++++++----
 4 files changed, 229 insertions(+), 41 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 901b6f4dc8..5c197fc5c1 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -280,7 +280,8 @@ BTPageGetDeleteXid(Page page)
  * Is an existing page recyclable?
  *
  * This exists to centralize the policy on which deleted pages are now safe to
- * re-use.
+ * re-use.  The _bt_newly_deleted_pages_recycle() optimization behaves more
+ * aggressively, though it has certain known limitations.
  *
  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
  * them here (caller is responsible for that case themselves).  Caller might
@@ -313,6 +314,39 @@ BTPageIsRecyclable(Page page)
 	return false;
 }
 
+/*
+ * BTVacState is nbtree.c state used during VACUUM.  It is exported for use by
+ * page deletion related code in nbtpage.c.
+ */
+typedef struct BTPendingRecycle
+{
+	BlockNumber blkno;
+	FullTransactionId safexid;
+} BTPendingRecycle;
+
+typedef struct BTVacState
+{
+	/*
+	 * VACUUM operation state
+	 */
+	IndexVacuumInfo *info;
+	IndexBulkDeleteResult *stats;
+	IndexBulkDeleteCallback callback;
+	void	   *callback_state;
+	BTCycleId	cycleid;
+
+	/*
+	 * Page deletion state for VACUUM
+	 */
+	MemoryContext pagedelcontext;
+	BTPendingRecycle *deleted;
+	bool		grow;
+	bool		full;
+	uint32		ndeletedspace;
+	uint64		maxndeletedspace;
+	uint32		ndeleted;
+} BTVacState;
+
 /*
  * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
  * page.  The high key is not a tuple that is used to visit the heap.  It is
@@ -1182,7 +1216,7 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
 									  Relation heapRel,
 									  TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
+extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
 
 /*
  * prototypes for functions in nbtsearch.c
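To make the intended use of the two structs above concrete, here is a
minimal standalone sketch (simplified stand-in types, not part of the
patch; the real bookkeeping appears in nbtpage.c below).  The point is
that remembering just (blkno, safexid) at deletion time is enough: the
recycling pass never has to read the deleted pages a second time.

    #include <stdint.h>

    typedef uint32_t BlockNumber;
    typedef struct { uint64_t value; } FullTransactionId;

    typedef struct
    {
        BlockNumber       blkno;
        FullTransactionId safexid;
    } BTPendingRecycleSketch;

    /*
     * Pages are appended in deletion order.  Because safexid is taken
     * from an advancing transaction ID counter at deletion time, the
     * array ends up sorted by safexid as a free side-effect.
     */
    static void
    remember_deleted_page(BTPendingRecycleSketch *deleted, uint32_t *ndeleted,
                          BlockNumber blkno, FullTransactionId safexid)
    {
        deleted[*ndeleted].blkno = blkno;
        deleted[*ndeleted].safexid = safexid;
        (*ndeleted)++;
    }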
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 46d49bf025..265814ea46 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -430,6 +430,37 @@ whenever it is subsequently taken from the FSM for reuse.  The deleted
 page's contents will be overwritten by the split operation (it will become
 the new right sibling page).
 
+Prior to PostgreSQL 14, VACUUM was only able to recycle pages that were
+deleted by a previous VACUUM operation (VACUUM typically placed all pages
+deleted by the previous VACUUM into the FSM, though there was and is no
+guarantee of that).  This had the obvious disadvantage of creating
+uncertainty about when and how pages get recycled, especially with bursty
+workloads.  It was also naive, even within the constraints of the design:
+there is no reason to expect that a deleted page will take long to become
+recyclable.  It's convenient to use XIDs to implement the drain technique,
+but that is totally unrelated to any of the other things that VACUUM needs
+to do with XIDs.
+
+VACUUM operations now consider whether it's possible to recycle any pages
+that the same operation deleted, after the physical scan of the index --
+the last point at which it's convenient to perform one final check.  This
+changes nothing about the basic design, and so it might still not be
+possible to recycle any pages at that time (e.g., there might not be even
+a single new transaction between an index page deletion and the end of
+the VACUUM).  But we have little to lose and plenty to gain by trying.
+We only need to keep around a little information about recently deleted
+pages in local memory.  We don't even have to access the deleted pages a
+second time.
+
+Currently VACUUM delays considering the possibility of recycling its own
+recently deleted pages until the end of its btbulkdelete scan (or until
+the end of btvacuumcleanup when there were no tuples to delete in the
+index).  It would be slightly more effective to defer the recycling
+check for btbulkdelete's page deletions until btvacuumcleanup, simply
+because more time would have passed.  The current approach works well
+enough in practice, especially where it matters most: with a large
+index, where recycling pages sooner rather than later really pays off.
+
 Fastpath For Index Insertion
 ----------------------------
 
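The flow described in the README paragraphs above can be reduced to a
short standalone sketch (the stub predicates stand in for the real
visibility machinery and FSM calls, so this is an illustration, not
part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t BlockNumber;
    typedef struct { uint64_t value; } FullTransactionId;

    typedef struct
    {
        BlockNumber       blkno;
        FullTransactionId safexid;
    } PendingRecycle;

    /* stand-in for the GlobalVisCheckRemovableFullXid() test */
    extern bool xid_old_enough_to_recycle(FullTransactionId safexid);
    /* stand-in for RecordFreeIndexPage() */
    extern void put_page_in_fsm(BlockNumber blkno);

    /*
     * Post-scan recycling pass.  Entries were appended in deletion
     * order, so safexid values never decrease: once one entry is too
     * recent to be safe, all later entries must be too, and the loop
     * can stop at the first failure.
     */
    static uint32_t
    recycle_newly_deleted(const PendingRecycle *deleted, uint32_t ndeleted)
    {
        uint32_t    nrecycled = 0;

        for (uint32_t i = 0; i < ndeleted; i++)
        {
            if (!xid_old_enough_to_recycle(deleted[i].safexid))
                break;
            put_page_in_fsm(deleted[i].blkno);
            nrecycled++;
        }
        return nrecycled;
    }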
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 8ae16428d7..55395c87c1 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -50,7 +50,7 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
 									 BlockNumber scanblkno,
 									 bool *rightsib_empty,
-									 uint32 *ndeleted);
+									 BTVacState *vstate);
 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
 									BTStack stack,
 									Buffer *subtreeparent,
@@ -1761,20 +1761,22 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
  * should never pass a buffer containing an existing deleted page here.  The
  * lock and pin on caller's buffer will be dropped before we return.
  *
- * Returns the number of pages successfully deleted (zero if page cannot
- * be deleted now; could be more than one if parent or right sibling pages
- * were deleted too).  Note that this does not include pages that we delete
- * that the btvacuumscan scan has yet to reach; they'll get counted later
- * instead.
+ * Maintains bulk delete stats for caller, which are taken from vstate.  We
+ * need to cooperate closely with caller here so that the whole VACUUM
+ * operation reliably avoids any double counting of subsidiary-to-leafbuf
+ * pages that we delete in passing.  If such pages happen to be from a block
+ * number that is ahead of the current scanblkno position, then caller is
+ * expected to count them directly later on.  It's simpler for us to
+ * understand caller's requirements than it would be for caller to understand
+ * when or how a deleted page became deleted after the fact.
  *
  * NOTE: this leaks memory.  Rather than trying to clean up everything
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
-uint32
-_bt_pagedel(Relation rel, Buffer leafbuf)
+void
+_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 {
-	uint32		ndeleted = 0;
 	BlockNumber rightsib;
 	bool		rightsib_empty;
 	Page		page;
@@ -1782,7 +1784,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 
 	/*
 	 * Save original leafbuf block number from caller.  Only deleted blocks
-	 * that are <= scanblkno get counted in ndeleted return value.
+	 * that are <= scanblkno are added to the bulk delete stats'
+	 * pages_deleted count.
 	 */
 	BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
@@ -1844,7 +1847,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 							RelationGetRelationName(rel))));
 
 			_bt_relbuf(rel, leafbuf);
-			return ndeleted;
+			return;
 		}
 
 		/*
@@ -1874,7 +1877,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 			Assert(!P_ISHALFDEAD(opaque));
 
 			_bt_relbuf(rel, leafbuf);
-			return ndeleted;
+			return;
 		}
 
 		/*
@@ -1923,8 +1926,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 			if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
 			{
 				ReleaseBuffer(leafbuf);
-				Assert(ndeleted == 0);
-				return ndeleted;
+				return;
 			}
 
 			/* we need an insertion scan key for the search, so build one */
@@ -1965,7 +1967,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 			if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
 			{
 				_bt_relbuf(rel, leafbuf);
-				return ndeleted;
+				return;
 			}
 		}
 
@@ -1980,7 +1982,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 		{
 			/* Check for interrupts in _bt_unlink_halfdead_page */
 			if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
-										  &rightsib_empty, &ndeleted))
+										  &rightsib_empty, vstate))
 			{
 				/*
 				 * _bt_unlink_halfdead_page should never fail, since we
@@ -1991,7 +1993,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 				 * lock and pin on leafbuf for us.
 				 */
 				Assert(false);
-				return ndeleted;
+				return;
 			}
 		}
 
@@ -2027,8 +2029,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf)
 
 		leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
 	}
-
-	return ndeleted;
 }
 
 /*
@@ -2263,9 +2263,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  */
 static bool
 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
-						 bool *rightsib_empty, uint32 *ndeleted)
+						 bool *rightsib_empty, BTVacState *vstate)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
+	IndexBulkDeleteResult *stats = vstate->stats;
 	BlockNumber leafleftsib;
 	BlockNumber leafrightsib;
 	BlockNumber target;
@@ -2673,12 +2674,64 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	_bt_relbuf(rel, buf);
 
 	/*
-	 * If btvacuumscan won't revisit this page in a future btvacuumpage call
-	 * and count it as deleted then, we count it as deleted by current
-	 * btvacuumpage call
+	 * Maintain pages_deleted in a way that takes into account how
+	 * btvacuumpage() will count deleted pages that have yet to become
+	 * scanblkno -- only count the page here when it won't get that treatment
+	 * later on.
 	 */
 	if (target <= scanblkno)
-		(*ndeleted)++;
+		stats->pages_deleted++;
+
+	/*
+	 * Maintain the array of pages that were deleted during the current
+	 * btvacuumscan() call.  We may well be able to recycle them in a separate
+	 * pass at the end of the current btvacuumscan().
+	 *
+	 * Need to respect the work_mem/maxndeletedspace limit on the size of the
+	 * deleted array.  Our strategy when the array can no longer grow within
+	 * the bounds of work_mem is simple: keep earlier entries (which are
+	 * likelier to be recyclable in the end), but stop saving new entries.
+	 */
+	if (vstate->full)
+		return true;
+
+	if (vstate->ndeleted >= vstate->ndeletedspace)
+	{
+		uint64		newndeletedspace;
+
+		if (!vstate->grow)
+		{
+			vstate->full = true;
+			return true;
+		}
+
+		newndeletedspace = vstate->ndeletedspace * 2;
+		if (newndeletedspace > vstate->maxndeletedspace)
+		{
+			newndeletedspace = vstate->maxndeletedspace;
+			vstate->grow = false;
+		}
+
+		/*
+		 * Defend against the case where the work_mem cap was already reached
+		 * exactly: then there is no room for this entry either
+		 */
+		if (newndeletedspace == vstate->ndeletedspace)
+		{
+			vstate->full = true;
+			return true;
+		}
+
+		vstate->ndeletedspace = newndeletedspace;
+
+		vstate->deleted =
+			repalloc(vstate->deleted,
+					 sizeof(BTPendingRecycle) * vstate->ndeletedspace);
+	}
+
+	vstate->deleted[vstate->ndeleted].blkno = target;
+	vstate->deleted[vstate->ndeleted].safexid = safexid;
+	vstate->ndeleted++;
 
 	return true;
 }
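The growth policy used by _bt_unlink_halfdead_page() above can be
isolated into a small standalone helper (simplified types, not part of
the patch).  Note the equality guard: when the work_mem cap is exactly
the current allocation, doubling gets clamped back to the old size, so
the only safe choice is to mark the array full and discard the entry:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    typedef struct
    {
        void       *deleted;        /* entry array */
        size_t      entrysize;      /* size of one entry */
        uint32_t    ndeleted;       /* entries used */
        uint32_t    ndeletedspace;  /* entries allocated */
        uint64_t    maxndeletedspace;   /* cap derived from work_mem */
        bool        grow;           /* may the array still grow? */
        bool        full;           /* stopped saving new entries? */
    } PendingArray;

    /* Returns false when the next entry must be discarded */
    static bool
    ensure_space_for_entry(PendingArray *a)
    {
        if (a->full)
            return false;
        if (a->ndeleted < a->ndeletedspace)
            return true;
        if (!a->grow)
        {
            a->full = true;
            return false;
        }

        uint64_t    newspace = (uint64_t) a->ndeletedspace * 2;

        if (newspace > a->maxndeletedspace)
        {
            newspace = a->maxndeletedspace;
            a->grow = false;    /* this is the last enlargement */
        }
        if (newspace == a->ndeletedspace)
        {
            a->full = true;     /* cap reached exactly; no room left */
            return false;
        }
        a->ndeletedspace = (uint32_t) newspace;
        a->deleted = realloc(a->deleted, a->entrysize * a->ndeletedspace);
        return true;
    }

Earlier entries are kept in preference to later ones because they have
the oldest safexid values, and are therefore the entries most likely to
actually be recyclable by the end of the VACUUM.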
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b5a674d9e0..b18022936b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -21,7 +21,9 @@
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/relscan.h"
+#include "access/table.h"
 #include "access/xlog.h"
+#include "catalog/index.h"
 #include "commands/progress.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
@@ -32,23 +34,13 @@
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/index_selfuncs.h"
 #include "utils/memutils.h"
 
 
-/* Working state needed by btvacuumpage */
-typedef struct
-{
-	IndexVacuumInfo *info;
-	IndexBulkDeleteResult *stats;
-	IndexBulkDeleteCallback callback;
-	void	   *callback_state;
-	BTCycleId	cycleid;
-	MemoryContext pagedelcontext;
-} BTVacState;
-
 /*
  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
  *
@@ -868,6 +860,68 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 	return false;
 }
 
+/*
+ * _bt_newly_deleted_pages_recycle() -- Are _bt_pagedel pages recyclable now?
+ *
+ * Note that we assume that the array is ordered by safexid.  No further
+ * entries can be safe to recycle once we encounter the first non-recyclable
+ * entry in the deleted array.
+ */
+static inline void
+_bt_newly_deleted_pages_recycle(Relation rel, BTVacState *vstate)
+{
+	IndexBulkDeleteResult *stats = vstate->stats;
+	Relation	heapRel;
+
+	/*
+	 * Recompute VACUUM XID boundaries.
+	 *
+	 * We don't actually care about the oldest non-removable XID.  Computing
+	 * the oldest such XID has a useful side-effect: it updates the procarray
+	 * state that tracks the XID horizon.  This is not just an optimization;
+	 * it's essential.  It allows the GlobalVisCheckRemovableFullXid() calls
+	 * we make here to notice if and when safexid values from pages that this
+	 * same VACUUM operation deleted become sufficiently old for recycling to
+	 * take place safely.
+	 */
+	GetOldestNonRemovableTransactionId(NULL);
+
+	/*
+	 * Use the heap relation for GlobalVisCheckRemovableFullXid() calls (don't
+	 * pass a NULL rel argument).
+	 *
+	 * This is an optimization; it allows us to be much more aggressive in
+	 * cases involving logical decoding (unless this happens to be a system
+	 * catalog).  We don't simply use BTPageIsRecyclable().
+	 *
+	 * XXX: The BTPageIsRecyclable() criteria create problems for this
+	 * optimization: the safexid test is applied a second time within
+	 * _bt_getbuf() (via its own BTPageIsRecyclable() call).  Consequently,
+	 * _bt_getbuf() may believe that it is still unsafe to recycle a page that
+	 * we know to be recycle safe -- in which case the page is unnecessarily
+	 * discarded.
+	 *
+	 * We should get around to fixing this _bt_getbuf() issue some day.  For
+	 * now we can still proceed in the hope that BTPageIsRecyclable() will
+	 * catch up with us before _bt_getbuf() ever reaches the page.
+	 */
+	heapRel = table_open(IndexGetRelation(RelationGetRelid(rel), false),
+						 AccessShareLock);
+	for (int i = 0; i < vstate->ndeleted; i++)
+	{
+		BlockNumber blkno = vstate->deleted[i].blkno;
+		FullTransactionId safexid = vstate->deleted[i].safexid;
+
+		if (!GlobalVisCheckRemovableFullXid(heapRel, safexid))
+			break;
+
+		RecordFreeIndexPage(rel, blkno);
+		stats->pages_free++;
+	}
+
+	table_close(heapRel, AccessShareLock);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
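The function relies on the ordering assumption stated in its header
comment.  A hypothetical assertion helper along these lines (not in the
patch) could make that invariant executable under USE_ASSERT_CHECKING:

    #ifdef USE_ASSERT_CHECKING
    /*
     * Verify that the deleted array is sorted by safexid.  Pages are
     * appended in deletion order, and each safexid comes from an
     * advancing transaction ID counter, so this should always hold.
     */
    static bool
    _bt_pending_recycle_sorted(BTVacState *vstate)
    {
        for (uint32 i = 1; i < vstate->ndeleted; i++)
        {
            if (FullTransactionIdPrecedes(vstate->deleted[i].safexid,
                                          vstate->deleted[i - 1].safexid))
                return false;
        }
        return true;
    }
    #endif

Something like Assert(_bt_pending_recycle_sorted(vstate)) at the top of
_bt_newly_deleted_pages_recycle() would then document why stopping at
the first non-recyclable entry is safe.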
For + * now we can still proceed in the hopes that BTPageIsRecyclable() will + * catch up with us before _bt_getbuf() ever reaches the page. + */ + heapRel = table_open(IndexGetRelation(RelationGetRelid(rel), false), + AccessShareLock); + for (int i = 0; i < vstate->ndeleted; i++) + { + BlockNumber blkno = vstate->deleted[i].blkno; + FullTransactionId safexid = vstate->deleted[i].safexid; + + if (!GlobalVisCheckRemovableFullXid(heapRel, safexid)) + break; + + RecordFreeIndexPage(rel, blkno); + stats->pages_free++; + } + + table_close(heapRel, AccessShareLock); +} + /* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells @@ -953,6 +1007,14 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) * _bt_vacuum_needs_cleanup() to force the next VACUUM to proceed with a * btvacuumscan() call. * + * Note: Prior to PostgreSQL 14, we were completely reliant on the next + * VACUUM operation taking care of recycling whatever pages the current + * VACUUM operation found to be empty and then deleted. It is now usually + * possible for _bt_newly_deleted_pages_recycle() to recycle all of the + * pages that any given VACUUM operation deletes, as part of the same + * VACUUM operation. As a result, it is rare for num_delpages to actually + * exceed 0, including with indexes where page deletions are frequent. + * * Note: We must delay the _bt_set_cleanup_info() call until this late * stage of VACUUM (the btvacuumcleanup() phase), to keep num_heap_tuples * accurate. The btbulkdelete()-time num_heap_tuples value is generally @@ -1041,6 +1103,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, "_bt_pagedel", ALLOCSET_DEFAULT_SIZES); + /* Allocate _bt_newly_deleted_pages_recycle related information */ + vstate.ndeletedspace = 512; + vstate.grow = true; + vstate.full = false; + vstate.maxndeletedspace = ((work_mem * 1024L) / sizeof(BTPendingRecycle)); + vstate.maxndeletedspace = Min(vstate.maxndeletedspace, MaxBlockNumber); + vstate.maxndeletedspace = Max(vstate.maxndeletedspace, vstate.ndeletedspace); + vstate.ndeleted = 0; + vstate.deleted = palloc(sizeof(BTPendingRecycle) * vstate.ndeletedspace); + /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing @@ -1109,7 +1181,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * * Note that if no recyclable pages exist, we don't bother vacuuming the * FSM at all. + * + * Before vacuuming the FSM, try to make the most of the pages we + * ourselves deleted: see if they can be recycled already (try to avoid + * waiting until the next VACUUM operation to recycle). Our approach is + * to check the local array of pages that were newly deleted during this + * VACUUM. */ + if (vstate.ndeleted > 0) + _bt_newly_deleted_pages_recycle(rel, &vstate); + + pfree(vstate.deleted); + if (stats->pages_free > 0) IndexFreeSpaceMapVacuum(rel); } @@ -1448,12 +1531,10 @@ backtrack: oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); /* - * We trust the _bt_pagedel return value because it does not include - * any page that a future call here from btvacuumscan is expected to - * count. There will be no double-counting. 
+ * _bt_pagedel maintains the bulk delete stats on our behalf */ Assert(blkno == scanblkno); - stats->pages_deleted += _bt_pagedel(rel, buf); + _bt_pagedel(rel, buf, vstate); MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ -- 2.27.0