From f962eee2760f7f0927a318ac05b55e48eea3cec0 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 29 Jul 2025 14:34:30 -0400 Subject: [PATCH v21 11/12] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum and COPY FREEZE marked pages as all-visible or all-frozen. Supporting this requires passing information about whether the relation is modified from the executor down to the scan descriptor. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. Author: Melanie Plageman Reviewed-by: Kirill Reshke Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- contrib/pgrowlocks/pgrowlocks.c | 2 +- src/backend/access/brin/brin.c | 3 +- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/heap/heapam.c | 15 +++- src/backend/access/heap/heapam_handler.c | 22 ++++-- src/backend/access/heap/pruneheap.c | 69 +++++++++++++++---- src/backend/access/index/genam.c | 4 +- src/backend/access/index/indexam.c | 6 +- src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/access/table/tableam.c | 8 ++- src/backend/commands/constraint.c | 2 +- src/backend/commands/copyto.c | 2 +- src/backend/commands/tablecmds.c | 4 +- src/backend/commands/typecmds.c | 4 +- src/backend/executor/execIndexing.c | 2 +- src/backend/executor/execMain.c | 4 ++ src/backend/executor/execReplication.c | 8 +-- src/backend/executor/execUtils.c | 2 + src/backend/executor/nodeBitmapHeapscan.c | 9 ++- src/backend/executor/nodeIndexonlyscan.c | 2 +- src/backend/executor/nodeIndexscan.c | 11 ++- src/backend/executor/nodeSeqscan.c | 26 ++++++- src/backend/partitioning/partbounds.c | 2 +- src/backend/utils/adt/selfuncs.c | 2 +- 
src/include/access/genam.h | 3 +- src/include/access/heapam.h | 30 +++++++- src/include/access/tableam.h | 19 ++--- src/include/nodes/execnodes.h | 6 ++ .../t/035_standby_logical_decoding.pl | 3 +- 29 files changed, 210 insertions(+), 65 deletions(-) diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index f88269332b6..27f01d8055f 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -114,7 +114,7 @@ pgrowlocks(PG_FUNCTION_ARGS) RelationGetRelationName(rel)); /* Scan the relation */ - scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL, 0); hscan = (HeapScanDesc) scan; attinmeta = TupleDescGetAttInMetadata(rsinfo->setDesc); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index cb3331921cb..b9613787b85 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2842,7 +2842,8 @@ _brin_parallel_scan_and_build(BrinBuildState *state, indexInfo->ii_Concurrent = brinshared->isconcurrent; scan = table_beginscan_parallel(heap, - ParallelTableScanFromBrinShared(brinshared)); + ParallelTableScanFromBrinShared(brinshared), + 0); reltuples = table_index_build_scan(heap, index, indexInfo, true, true, brinbuildCallbackParallel, state, scan); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c2b879b2bf6..147844690a1 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2058,7 +2058,8 @@ _gin_parallel_scan_and_build(GinBuildState *state, indexInfo->ii_Concurrent = ginshared->isconcurrent; scan = table_beginscan_parallel(heap, - ParallelTableScanFromGinBuildShared(ginshared)); + ParallelTableScanFromGinBuildShared(ginshared), + 0); reltuples = table_index_build_scan(heap, index, indexInfo, true, progress, ginBuildCallbackParallel, state, scan); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c 
index 2bff37e03b5..ae53e311ce1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -555,6 +555,7 @@ heap_prepare_pagescan(TableScanDesc sscan) Buffer buffer = scan->rs_cbuf; BlockNumber block = scan->rs_cblock; Snapshot snapshot; + Buffer *vmbuffer = NULL; Page page; int lines; bool all_visible; @@ -569,7 +570,9 @@ heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + if (sscan->rs_flags & SO_HINT_REL_READ_ONLY) + vmbuffer = &scan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1246,6 +1249,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1284,6 +1288,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1316,6 +1326,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. 
*/ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb6..27e3498f5f4 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -79,12 +79,14 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, uint32 flags) { IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; + hscan->modifies_base_rel = !(flags & SO_HINT_REL_READ_ONLY); return &hscan->xs_base; } @@ -99,6 +101,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -138,7 +146,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + hscan->modifies_base_rel ? 
NULL : &hscan->xs_vmbuffer); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -753,7 +762,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0, 0); index_rescan(indexScan, NULL, 0, NULL, 0); } else @@ -762,7 +771,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); - tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); + tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL, 0); heapScan = (HeapScanDesc) tableScan; indexScan = NULL; @@ -2471,6 +2480,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + Buffer *vmbuffer = NULL; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2517,7 +2527,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_rd, buffer); + if (scan->rs_flags & SO_HINT_REL_READ_ONLY) + vmbuffer = &hscan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 90270081acd..124722f1778 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -203,7 +203,9 @@ static bool heap_page_will_set_vis(Relation relation, Buffer heap_buf, Buffer vmbuffer, bool blk_known_av, - const PruneState *prstate, + PruneReason reason, + bool do_prune, bool do_freeze, + PruneState *prstate, uint8 *vmflags, bool *do_set_pd_vis); @@ -218,9 +220,13 @@ static bool heap_page_will_set_vis(Relation relation, * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if + * the page is all-visible. We will take care of pinning and, if needed, + * reading in the page of the visibility map. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -297,6 +303,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer) .vistest = vistest,.cutoffs = NULL }; + if (vmbuffer) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + params.options = HEAP_PAGE_PRUNE_UPDATE_VIS; + params.vmbuffer = *vmbuffer; + } + heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc, NULL, NULL); @@ -785,6 +798,9 @@ heap_page_will_freeze(Relation relation, Buffer buffer, * have examined this page’s VM bits (e.g., VACUUM in the previous * heap_vac_scan_next_block() call) and can pass that along. 
* + * This should be called only after do_freeze has been decided (and do_prune + * has been set), as these factor into our heuristic-based decision. + * * Returns true if one or both VM bits should be set, along with the desired * flags in *vmflags. Also indicates via do_set_pd_vis whether PD_ALL_VISIBLE * should be set on the heap page. @@ -795,7 +811,9 @@ heap_page_will_set_vis(Relation relation, Buffer heap_buf, Buffer vmbuffer, bool blk_known_av, - const PruneState *prstate, + PruneReason reason, + bool do_prune, bool do_freeze, + PruneState *prstate, uint8 *vmflags, bool *do_set_pd_vis) { @@ -811,6 +829,23 @@ heap_page_will_set_vis(Relation relation, return false; } + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. This situation + * should be rare, as on-access pruning is only attempted when + * pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + prstate->all_visible && + !do_prune && !do_freeze && + (!BufferIsDirty(heap_buf) || XLogCheckBufferNeedsBackup(heap_buf))) + { + prstate->all_visible = prstate->all_frozen = false; + return false; + } + if (prstate->all_visible && !PageIsAllVisible(heap_page)) *do_set_pd_vis = true; @@ -834,6 +869,11 @@ heap_page_will_set_vis(Relation relation, * page-level bit is clear. However, it's possible that in vacuum the bit * got cleared after heap_vac_scan_next_block() was called, so we must * recheck with buffer lock before concluding that the VM is corrupt. + * + * This will never trigger for on-access pruning because it couldn't have + * done a previous visibility map lookup and thus will always pass + * blk_known_av as false. A future vacuum will have to take care of fixing + * the corruption. 
*/ else if (blk_known_av && !PageIsAllVisible(heap_page) && visibilitymap_get_status(relation, heap_blk, &vmbuffer) != 0) @@ -994,6 +1034,14 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, prstate.ndead > 0 || prstate.nunused > 0; + /* + * Even if we don't prune anything, if we found a new value for the + * pd_prune_xid field or the page was marked full, we will update the hint + * bit. + */ + do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page); + /* * After processing all the live tuples on the page, if the newest xmin * amongst them is not visible to everyone, the page cannot be @@ -1004,14 +1052,6 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, !GlobalVisXidVisibleToAll(prstate.vistest, prstate.visibility_cutoff_xid)) prstate.all_visible = prstate.all_frozen = false; - /* - * Even if we don't prune anything, if we found a new value for the - * pd_prune_xid field or the page was marked full, we will update the hint - * bit. - */ - do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || - PageIsFull(page); - /* * Decide if we want to go ahead with freezing according to the freeze * plans we prepared, or not. @@ -1054,6 +1094,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, */ do_set_vm = heap_page_will_set_vis(params->relation, blockno, buffer, vmbuffer, params->blk_known_av, + params->reason, do_prune, do_freeze, &prstate, &new_vmbits, &do_set_pd_vis); /* We should only set the VM if PD_ALL_VISIBLE is set or will be */ @@ -2340,7 +2381,7 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, /* * Calculate the conflict horizon for the whole XLOG_HEAP2_PRUNE_VACUUM_SCAN - * record. + * or XLOG_HEAP2_PRUNE_ON_ACCESS record. 
*/ static TransactionId get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, @@ -2410,8 +2451,8 @@ get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, * - Reaping: During vacuum phase III, items that are already LP_DEAD are * marked as unused. * - * - VM updates: After vacuum phases I and III, the heap page may be marked - * all-visible and all-frozen. + * - VM updates: After vacuum phases I and III and on-access, the heap page + * may be marked all-visible and all-frozen. * * These changes all happen together, so we use a single WAL record for them * all. diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0cb27af1310..1e7992dbeb3 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -447,7 +447,7 @@ systable_beginscan(Relation heapRelation, } sysscan->iscan = index_beginscan(heapRelation, irel, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -708,7 +708,7 @@ systable_beginscan_ordered(Relation heapRelation, } sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0492d92d23b..b5523cf2ab1 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -257,7 +257,7 @@ index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys) + int nkeys, int norderbys, uint32 flags) { IndexScanDesc scan; @@ -284,7 +284,7 @@ index_beginscan(Relation heapRelation, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = 
table_index_fetch_begin(heapRelation, flags); return scan; } @@ -615,7 +615,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, 0); return scan; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 454adaee7dc..02ab0233e59 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1925,7 +1925,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + ParallelTableScanFromBTShared(btshared), 0); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 5e41404937e..558c4497993 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -50,6 +50,7 @@ char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; bool synchronize_seqscans = true; + /* ---------------------------------------------------------------------------- * Slot functions. 
* ---------------------------------------------------------------------------- @@ -163,10 +164,11 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan, uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_SEQSCAN | + + flags |= SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -217,7 +219,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, 0); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 3497a8221f2..97c8278e36d 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, 0); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index cef452584e5..22b453dc617 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -1156,7 +1156,7 @@ CopyRelationTo(CopyToState cstate, Relation rel, Relation root_rel, uint64 *proc AttrMap *map = NULL; TupleTableSlot *root_slot = NULL; - scandesc = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + scandesc = table_beginscan(rel, GetActiveSnapshot(), 0, NULL, 0); slot = table_slot_create(rel, NULL); /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c 
index 23ebaa3f230..66c418059fe 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6345,7 +6345,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap) * checking all the constraints. */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(oldrel, snapshot, 0, NULL); + scan = table_beginscan(oldrel, snapshot, 0, NULL, 0); /* * Switch to per-tuple memory context and reset it for each tuple @@ -13730,7 +13730,7 @@ validateForeignKeyConstraint(char *conname, */ snapshot = RegisterSnapshot(GetLatestSnapshot()); slot = table_slot_create(rel, NULL); - scan = table_beginscan(rel, snapshot, 0, NULL); + scan = table_beginscan(rel, snapshot, 0, NULL, 0); perTupCxt = AllocSetContextCreate(CurrentMemoryContext, "validateForeignKeyConstraint", diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 5979580139f..35560ac60d9 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -3154,7 +3154,7 @@ validateDomainNotNullConstraint(Oid domainoid) /* Scan all tuples in this relation */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(testrel, snapshot, 0, NULL); + scan = table_beginscan(testrel, snapshot, 0, NULL, 0); slot = table_slot_create(testrel, NULL); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3235,7 +3235,7 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmod /* Scan all tuples in this relation */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(testrel, snapshot, 0, NULL); + scan = table_beginscan(testrel, snapshot, 0, NULL, 0); slot = table_slot_create(testrel, NULL); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 401606f840a..4e39ac00f30 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ 
-815,7 +815,7 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0, 0); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 27c9eec697b..0630a5af79e 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -916,6 +916,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) break; } + /* If it has a rowmark, the relation is modified */ + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rc->rti); + /* Check that relation is a legal target for marking */ if (relation) CheckValidRowMarkRel(relation, rc->markType); diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index def32774c90..473d236e551 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -204,7 +204,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0, 0); retry: found = false; @@ -382,7 +382,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, /* Start a heap scan. */ InitDirtySnapshot(snap); - scan = table_beginscan(rel, &snap, 0, NULL); + scan = table_beginscan(rel, &snap, 0, NULL, 0); scanslot = table_slot_create(rel, NULL); retry: @@ -601,7 +601,7 @@ RelationFindDeletedTupleInfoSeq(Relation rel, TupleTableSlot *searchslot, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). 
*/ - scan = table_beginscan(rel, SnapshotAny, 0, NULL); + scan = table_beginscan(rel, SnapshotAny, 0, NULL, 0); scanslot = table_slot_create(rel, NULL); table_rescan(scan, NULL); @@ -665,7 +665,7 @@ RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). */ - scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0, 0); index_rescan(scan, skey, skey_attoff, NULL, 0); diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index fdc65c2b42b..28a06dcd244 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -893,6 +893,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, estate->es_result_relations = (ResultRelInfo **) palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); estate->es_result_relations[rti - 1] = resultRelInfo; + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rti); /* * Saving in the list allows to avoid needlessly traversing the whole diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index bf24f3d7fe0..0d854db51a1 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -105,11 +105,18 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + uint32 flags = 0; + + if (!bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids)) + flags = SO_HINT_REL_READ_ONLY; + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + flags); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 
f464cca9507..87b04b1b88e 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -94,7 +94,7 @@ IndexOnlyNext(IndexOnlyScanState *node) estate->es_snapshot, &node->ioss_Instrument, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, 0); node->ioss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index f36929deec3..90f929ce741 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -102,6 +102,12 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + uint32 flags = 0; + + if (!bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids)) + flags = SO_HINT_REL_READ_ONLY; + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. @@ -111,7 +117,8 @@ IndexNext(IndexScanState *node) estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + flags); node->iss_ScanDesc = scandesc; @@ -207,7 +214,7 @@ IndexNextWithReorder(IndexScanState *node) estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, 0); node->iss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 94047d29430..4d0cbb9dee4 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -65,13 +65,20 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { + uint32 flags = 0; + + if (!bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids)) + flags = SO_HINT_REL_READ_ONLY; + /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, - 0, NULL); + 0, NULL, flags); + node->ss.ss_currentScanDesc = scandesc; } @@ -367,14 +374,20 @@ ExecSeqScanInitializeDSM(SeqScanState *node, { EState *estate = node->ss.ps.state; ParallelTableScanDesc pscan; + uint32 flags = 0; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + if (!bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids)) + flags = SO_HINT_REL_READ_ONLY; + node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel(node->ss.ss_currentRelation, pscan, + flags); } /* ---------------------------------------------------------------- @@ -404,8 +417,15 @@ ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + uint32 flags = 0; + + if (!bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids)) + flags = SO_HINT_REL_READ_ONLY; pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel(node->ss.ss_currentRelation, + pscan, + flags); } diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 8ba038c5ef4..d3b340ee2a7 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -3370,7 +3370,7 @@ check_default_partition_contents(Relation parent, Relation default_rel, econtext = GetPerTupleExprContext(estate); snapshot = RegisterSnapshot(GetLatestSnapshot()); tupslot = table_slot_create(part_rel, &estate->es_tupleTable); - scan = table_beginscan(part_rel, snapshot, 0, NULL); + scan = table_beginscan(part_rel, snapshot, 0, NULL, 0); /* * 
Switch to per-tuple memory context and reset it for each tuple diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 540aa9628d7..28434146eba 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7100,7 +7100,7 @@ get_actual_variable_endpoint(Relation heapRel, index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, NULL, - 1, 0); + 1, 0, 0); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9200a22bd9f..2f9e9ea6318 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -177,7 +177,7 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys); + int nkeys, int norderbys, uint32 flags); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -204,6 +204,7 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); + extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 2b6a521e4ea..1e3df54628b 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -95,6 +95,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block to this buffer. 
+ */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -117,8 +124,24 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block to + * this buffer. + */ + Buffer xs_vmbuffer; + + /* + * Some optimizations can only be performed if the query does not modify + * the underlying relation. Track that here. + */ + bool modifies_base_rel; } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -417,7 +440,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); /* in heap/pruneheap.c */ -extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer); extern void heap_page_prune_and_freeze(PruneFreezeParams *params, PruneFreezeResult *presult, OffsetNumber *off_loc, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index e16bf025692..0042636463f 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -63,6 +63,8 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* set if the query doesn't modify the rel */ + SO_HINT_REL_READ_ONLY = 1 << 10, } ScanOptions; /* @@ -420,7 +422,7 @@ typedef struct TableAmRoutine * * Tuples for an index scan can then be fetched via index_fetch_tuple. 
*/ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, uint32 flags); /* * Reset index fetch. Typically this will release cross index fetch @@ -874,9 +876,9 @@ extern TupleTableSlot *table_slot_create(Relation relation, List **reglist); */ static inline TableScanDesc table_beginscan(Relation rel, Snapshot snapshot, - int nkeys, ScanKeyData *key) + int nkeys, ScanKeyData *key, uint32 flags) { - uint32 flags = SO_TYPE_SEQSCAN | + flags |= SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); @@ -919,9 +921,9 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, ScanKeyData *key) + int nkeys, ScanKeyData *key, uint32 flags) { - uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + flags |= SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); @@ -1128,7 +1130,8 @@ extern void table_parallelscan_initialize(Relation rel, * Caller must hold a suitable lock on the relation. */ extern TableScanDesc table_beginscan_parallel(Relation relation, - ParallelTableScanDesc pscan); + ParallelTableScanDesc pscan, + uint32 flags); /* * Restart a parallel scan. Call this in the leader process. Caller is @@ -1154,9 +1157,9 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). 
*/ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel) +table_index_fetch_begin(Relation rel, uint32 flags) { - return rel->rd_tableam->index_fetch_begin(rel); + return rel->rd_tableam->index_fetch_begin(rel, flags); } /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 18ae8f0d4bb..0c3b0d60168 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -676,6 +676,12 @@ typedef struct EState * ExecDoInitialPruning() */ const char *es_sourceText; /* Source text from QueryDesc */ + /* + * RT indexes of relations modified by the query either through + * UPDATE/DELETE/INSERT/MERGE or SELECT FOR UPDATE + */ + Bitmapset *es_modified_relids; + JunkFilter *es_junkFilter; /* top-level junk filter, if any */ /* If query can insert/delete tuples, the command ID to mark them with */ diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index ebe2fae1789..bdd9f0a62cd 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -296,6 +296,7 @@ wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 autovacuum = off +hot_standby_feedback = on }); $node_primary->dump_info; $node_primary->start; @@ -748,7 +749,7 @@ check_pg_recvlogical_stderr($handle, $logstart = -s $node_standby->logfile; reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', - 'no_conflict_', 0, 1); + 'no_conflict_', 1, 0); # This should not trigger a conflict wait_until_vacuum_can_remove( -- 2.43.0