From 2c200ceb43899301fd0a6ad079aa9d4d48c24afb Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 7 Jul 2025 17:30:14 -0400 Subject: [PATCH v3 12/13] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum and COPY FREEZE marked pages as all-visible or all-frozen. Supporting this requires passing information about whether the relation is modified from the executor down to the scan descriptor. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. --- src/backend/access/heap/heapam.c | 15 +++++- src/backend/access/heap/heapam_handler.c | 17 ++++++- src/backend/access/heap/pruneheap.c | 59 +++++++++++++++++------ src/backend/access/index/indexam.c | 46 ++++++++++++++++++ src/backend/access/table/tableam.c | 39 +++++++++++++-- src/backend/executor/execMain.c | 4 ++ src/backend/executor/execUtils.c | 2 + src/backend/executor/nodeBitmapHeapscan.c | 6 ++- src/backend/executor/nodeIndexscan.c | 17 ++++--- src/backend/executor/nodeSeqscan.c | 17 +++++-- src/backend/storage/ipc/procarray.c | 12 +++++ src/include/access/genam.h | 11 +++++ src/include/access/heapam.h | 24 +++++++-- src/include/access/relscan.h | 6 +++ src/include/access/tableam.h | 30 +++++++++++- src/include/nodes/execnodes.h | 17 +++++++ src/include/utils/snapmgr.h | 1 + 17 files changed, 285 insertions(+), 38 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 48f7b84156a..50b0d169d54 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -560,6 +560,7 @@ heap_prepare_pagescan(TableScanDesc sscan) int lines; bool all_visible; bool check_serializable; + bool allow_vmset; Assert(BufferGetBlockNumber(buffer) == block); @@ 
-570,7 +571,9 @@ heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + allow_vmset = sscan->rs_flags & SO_ALLOW_VM_SET; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, + allow_vmset ? &scan->rs_vmbuffer : NULL, allow_vmset); /* * We must hold share lock on the buffer content while examining tuple @@ -1236,6 +1239,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1274,6 +1278,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1306,6 +1316,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. 
*/ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index cb4bc35c93e..fb450c5a84f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel) hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; return &hscan->xs_base; } @@ -99,6 +100,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -138,7 +145,9 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + scan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer, + !scan->modifies_base_rel); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2471,6 +2480,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + bool allow_vmset = false; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2517,7 +2527,10 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_rd, buffer); + allow_vmset = scan->rs_flags & SO_ALLOW_VM_SET; + heap_page_prune_opt(scan->rs_rd, buffer, + allow_vmset ? 
&hscan->rs_vmbuffer : NULL, + allow_vmset); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index f6509695e3a..af23008ddf7 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -158,6 +158,7 @@ typedef struct bool all_visible; bool all_frozen; TransactionId visibility_cutoff_xid; + TransactionId oldest_xmin; } PruneState; /* Local functions */ @@ -203,9 +204,13 @@ static bool identify_and_fix_vm_corruption(Relation relation, * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If allow_vmset is true, it is okay for pruning to set the visibility map if + * the page is all visible. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer, bool allow_vmset) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -260,6 +265,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) if (!ConditionalLockBufferForCleanup(buffer)) return; + /* Caller should not pass a vmbuffer if allow_vmset is false. */ + Assert(allow_vmset || vmbuffer == NULL); + /* * Now that we have buffer lock, get accurate information about the * page's free space, and recheck the heuristic about whether to @@ -269,6 +277,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer) { OffsetNumber dummy_off_loc; PruneFreezeResult presult; + int options = 0; + + if (allow_vmset) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + options = HEAP_PAGE_PRUNE_UPDATE_VM; + } /* * For now, pass mark_unused_now as false regardless of whether or @@ -276,8 +291,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * that during on-access pruning with the current implementation. */ heap_page_prune_and_freeze(relation, buffer, false, - InvalidBuffer, - vistest, 0, + vmbuffer ? 
*vmbuffer : InvalidBuffer, + vistest, options, NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); /* @@ -467,6 +482,10 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; prstate.freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; prstate.update_vm = (options & HEAP_PAGE_PRUNE_UPDATE_VM) != 0; + if (cutoffs) + prstate.oldest_xmin = cutoffs->OldestXmin; + else + prstate.oldest_xmin = OldestXminFromGlobalVisState(vistest); prstate.cutoffs = cutoffs; /* @@ -877,6 +896,20 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, */ if (prstate.update_vm) { + /* + * If this is on-access and we aren't actually pruning, don't set the + * VM if doing so would newly dirty the heap page or, if the page is + * already dirty, if the WAL record emitted would have to contain an + * FPI of the heap page. This should rarely happen, as we only attempt + * on-access pruning when pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + !do_prune && !do_freeze && + (!BufferIsDirty(buffer) || XLogCheckBufferNeedsBackup(buffer))) + { + /* Don't update the VM */ + } + /* * Clear any VM corruption. This does not need to be in a critical * section, so we do it first. If PD_ALL_VISIBLE is incorrectly set, @@ -885,9 +918,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * of VM corruption, so we don't have to worry about the extra * performance overhead. 
*/ - if (identify_and_fix_vm_corruption(relation, - blockno, buffer, page, - blk_known_av, prstate.lpdead_items, vmbuffer)) + else if (identify_and_fix_vm_corruption(relation, + blockno, buffer, page, + blk_known_av, prstate.lpdead_items, vmbuffer)) { /* If we fix corruption, don't update the VM further */ } @@ -1013,7 +1046,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, */ else if (do_freeze) { - conflict_xid = prstate.cutoffs->OldestXmin; + conflict_xid = prstate.oldest_xmin; TransactionIdRetreat(conflict_xid); } @@ -1071,12 +1104,10 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, TransactionId debug_cutoff; bool debug_all_frozen; - Assert(cutoffs); - Assert(prstate.lpdead_items == 0); if (!heap_page_is_all_visible(relation, buffer, - cutoffs->OldestXmin, + prstate.oldest_xmin, &debug_all_frozen, &debug_cutoff, off_loc)) Assert(false); @@ -1136,9 +1167,8 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * vacuuming the relation. OldestXmin is used for freezing determination * and we cannot freeze dead tuples' xmaxes. */ - if (prstate->cutoffs && - TransactionIdIsValid(prstate->cutoffs->OldestXmin) && - NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin)) + if (TransactionIdIsValid(prstate->oldest_xmin) && + NormalTransactionIdPrecedes(dead_after, prstate->oldest_xmin)) return HEAPTUPLE_DEAD; /* @@ -1607,8 +1637,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb * could use GlobalVisTestIsRemovableXid instead, if a * non-freezing caller wanted to set the VM bit. 
*/ - Assert(prstate->cutoffs); - if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) + if (!TransactionIdPrecedes(xmin, prstate->oldest_xmin)) { prstate->all_visible = false; break; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da..d803c307517 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -279,6 +279,32 @@ index_beginscan(Relation heapRelation, return scan; } +/* + * Similar to index_beginscan(), but allows the caller to indicate whether the + * query modifies the underlying base relation. This is used when the caller + * wants to attempt marking pages in the base relation as all-visible in the + * visibility map during on-access pruning. + */ +IndexScanDesc +index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan(heapRelation, + indexRelation, + snapshot, + instrument, + nkeys, norderbys); + + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + + return scan; +} + /* * index_beginscan_bitmap - start a scan of an index with amgetbitmap * @@ -610,6 +636,26 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, return scan; } +/* + * Parallel version of index_beginscan_vmset() + */ +IndexScanDesc +index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan_parallel(heaprel, indexrel, + instrument, + nkeys, norderbys, + pscan); + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + return scan; +} + /* ---------------- * index_getnext_tid - get the next TID from a scan * diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 
a56c5eceb14..67dbf99f5b5 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -49,6 +49,10 @@ char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; bool synchronize_seqscans = true; +/* Helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() */ +static TableScanDesc table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags); + /* ---------------------------------------------------------------------------- * Slot functions. @@ -162,12 +166,14 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } } -TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +/* + * Common helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() + */ +static TableScanDesc +table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_SEQSCAN | - SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -188,6 +194,31 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) pscan, flags); } +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + +/* + * Parallel version of table_beginscan_vmset() + */ +TableScanDesc +table_beginscan_parallel_vmset(Relation relation, ParallelTableScanDesc pscan, + bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + /* ---------------------------------------------------------------------------- * Index scan related functions. 
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 0391798dd2c..065676eb7cf 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -917,6 +917,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) break; } + /* If it has a rowmark, the relation is modified */ + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rc->rti); + /* Check that relation is a legal target for marking */ if (relation) CheckValidRowMarkRel(relation, rc->markType); diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index fdc65c2b42b..28a06dcd244 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -893,6 +893,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, estate->es_result_relations = (ResultRelInfo **) palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); estate->es_result_relations[rti - 1] = resultRelInfo; + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rti); /* * Saving in the list allows to avoid needlessly traversing the whole diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index bf24f3d7fe0..2c57bc7ac49 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -109,7 +109,8 @@ BitmapTableScanSetup(BitmapHeapScanState *node) table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + node->modifies_rel); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; @@ -360,6 +361,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->initialized = false; scanstate->pstate = NULL; scanstate->recheck = true; + scanstate->modifies_rel = + bms_is_member(node->scan.scanrelid, + estate->es_modified_relids); /* * Miscellaneous initialization diff --git a/src/backend/executor/nodeIndexscan.c 
b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe62..f91c6b17620 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -106,12 +106,13 @@ IndexNext(IndexScanState *node) * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - &node->iss_Instrument, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + scandesc = index_beginscan_vmset(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + &node->iss_Instrument, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + node->iss_ModifiesBaseRel); node->iss_ScanDesc = scandesc; @@ -935,6 +936,10 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ss_currentRelation = currentRelation; indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + indexstate->iss_ModifiesBaseRel = + bms_is_member(node->scan.scanrelid, + estate->es_modified_relids); + /* * get the scan type from the relation descriptor. */ diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index ed35c58c2c3..cded7f15703 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -69,9 +69,9 @@ SeqNext(SeqScanState *node) * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + scandesc = table_beginscan_vmset(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, node->modifies_rel); node->ss.ss_currentScanDesc = scandesc; } @@ -237,6 +237,10 @@ ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) node->scan.scanrelid, eflags); + scanstate->modifies_rel = + bms_is_member(node->scan.scanrelid, + estate->es_modified_relids); + /* and create slot with the appropriate rowtype */ ExecInitScanTupleSlot(estate, &scanstate->ss, RelationGetDescr(scanstate->ss.ss_currentRelation), @@ -370,7 +374,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, pscan, + node->modifies_rel); } /* ---------------------------------------------------------------- @@ -403,5 +408,7 @@ ExecSeqScanInitializeWorker(SeqScanState *node, pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, + pscan, + node->modifies_rel); } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e5b945a9ee3..01d2bda3f72 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -4133,6 +4133,18 @@ GlobalVisTestFor(Relation rel) return state; } +/* + * Returns maybe_needed as a 32-bit TransactionId. Can be used in callers that + * need to compare transaction IDs to a single value and are okay with using + * the more conservative boundary. 
+ */ +TransactionId +OldestXminFromGlobalVisState(GlobalVisState *state) +{ + return XidFromFullTransactionId(state->maybe_needed); +} + + /* * Return true if it's worth updating the accurate maybe_needed boundary. * diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f..bf272c2c37f 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -180,6 +180,11 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_heap_rel); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -206,6 +211,12 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); + +extern IndexScanDesc index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_rel); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 0b9bb1c9b13..46ea8b8455c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -94,6 +94,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block to this buffer. 
+ */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -116,8 +123,18 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block to + * this buffer. + */ + Buffer xs_vmbuffer; } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -374,7 +391,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel, /* in heap/pruneheap.c */ struct GlobalVisState; -extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer, bool allow_vmset); extern void heap_page_prune_and_freeze(Relation relation, Buffer buffer, bool blk_known_av, Buffer vmbuffer, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0..f496e0b4939 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -121,6 +121,12 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + /* + * Some optimizations can only be performed if the query does not modify + * the underlying relation. Track that here. 
+ */ + bool modifies_base_rel; } IndexFetchTableData; struct IndexScanInstrumentation; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 1c9e802a6b1..0e986d8ef72 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -62,6 +62,8 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* whether or not scan should attempt to set the VM */ + SO_ALLOW_VM_SET = 1 << 10, } ScanOptions; /* @@ -876,6 +878,25 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +/* + * Similar to table_beginscan(), but allows the caller to indicate whether the + * query modifies the relation. This is used when the caller wants to attempt + * marking pages in the relation as all-visible in the visibility map during + * on-access pruning. + */ +static inline TableScanDesc +table_beginscan_vmset(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. 
@@ -913,10 +934,13 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, struct ScanKeyData *key) + int nkeys, struct ScanKeyData *key, bool modifies_rel) { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1125,6 +1149,10 @@ extern void table_parallelscan_initialize(Relation rel, extern TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan); +extern TableScanDesc table_beginscan_parallel_vmset(Relation relation, + ParallelTableScanDesc pscan, + bool modifies_rel); + /* * Restart a parallel scan. Call this in the leader process. Caller is * responsible for making sure that all workers have finished the scan diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index e107d6e5f81..1d0b374b652 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -680,6 +680,12 @@ typedef struct EState * ExecDoInitialPruning() */ const char *es_sourceText; /* Source text from QueryDesc */ + /* + * RT indexes of relations modified by the query either through + * UPDATE/DELETE/INSERT/MERGE or SELECT FOR UPDATE + */ + Bitmapset *es_modified_relids; + JunkFilter *es_junkFilter; /* top-level junk filter, if any */ /* If query can insert/delete tuples, the command ID to mark them with */ @@ -1631,6 +1637,13 @@ typedef struct SeqScanState { ScanState ss; /* its first field is NodeTag */ Size pscan_len; /* size of parallel heap scan descriptor */ + + /* + * Whether or not the query modifies the relation scanned by this node. + * This is used to avoid the overhead of optimizations that are only + * effective for tables not modified by the query. 
+ */ + bool modifies_rel; } SeqScanState; /* ---------------- @@ -1702,6 +1715,7 @@ typedef struct * OrderByTypByVals is the datatype of order by expression pass-by-value? * OrderByTypLens typlens of the datatypes of order by expressions * PscanLen size of parallel index scan descriptor + * ModifiesBaseRel true if query modifies base relation * ---------------- */ typedef struct IndexScanState @@ -1731,6 +1745,7 @@ typedef struct IndexScanState bool *iss_OrderByTypByVals; int16 *iss_OrderByTypLens; Size iss_PscanLen; + bool iss_ModifiesBaseRel; } IndexScanState; /* ---------------- @@ -1888,6 +1903,7 @@ typedef struct SharedBitmapHeapInstrumentation * pstate shared state for parallel bitmap scan * sinstrument statistics for parallel workers * recheck do current page's tuples need recheck + * modifies_rel does the query modify the base relation * ---------------- */ typedef struct BitmapHeapScanState @@ -1900,6 +1916,7 @@ typedef struct BitmapHeapScanState ParallelBitmapHeapState *pstate; SharedBitmapHeapInstrumentation *sinstrument; bool recheck; + bool modifies_rel; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index d346be71642..fcb10b8d136 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -101,6 +101,7 @@ extern bool GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid extern bool GlobalVisTestIsRemovableFullXid(GlobalVisState *state, FullTransactionId fxid); extern bool GlobalVisCheckRemovableXid(Relation rel, TransactionId xid); extern bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid); +extern TransactionId OldestXminFromGlobalVisState(GlobalVisState *state); /* * Utility functions for implementing visibility routines in table AMs. -- 2.43.0