From 8df8cf1d9c5baa8d07e623e80dfaeb5ff4b25228 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 29 Jul 2025 14:34:30 -0400 Subject: [PATCH v15 22/23] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum marked pages as all-visible or all-frozen. Supporting this requires passing information about whether the relation is modified from the executor down to the scan descriptor. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. --- src/backend/access/heap/heapam.c | 15 +++- src/backend/access/heap/heapam_handler.c | 15 +++- src/backend/access/heap/pruneheap.c | 89 ++++++++++++++----- src/backend/access/index/indexam.c | 46 ++++++++++ src/backend/access/table/tableam.c | 39 +++++++- src/backend/executor/execMain.c | 4 + src/backend/executor/execUtils.c | 2 + src/backend/executor/nodeBitmapHeapscan.c | 7 +- src/backend/executor/nodeIndexscan.c | 18 ++-- src/backend/executor/nodeSeqscan.c | 24 +++-- src/include/access/genam.h | 11 +++ src/include/access/heapam.h | 24 ++++- src/include/access/relscan.h | 6 ++ src/include/access/tableam.h | 30 ++++++- src/include/nodes/execnodes.h | 6 ++ .../t/035_standby_logical_decoding.pl | 3 +- 16 files changed, 292 insertions(+), 47 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 14a2996b9ee..6181e355aaf 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -555,6 +555,7 @@ heap_prepare_pagescan(TableScanDesc sscan) Buffer buffer = scan->rs_cbuf; BlockNumber block = scan->rs_cblock; Snapshot snapshot; + Buffer *vmbuffer = NULL; Page page; int lines; bool all_visible; @@ -569,7 +570,9 @@ heap_prepare_pagescan(TableScanDesc 
sscan) /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + if (sscan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &scan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1246,6 +1249,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1284,6 +1288,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1316,6 +1326,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. 
*/ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb6..f05b9e4968d 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel) hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; return &hscan->xs_base; } @@ -99,6 +100,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -138,7 +145,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + scan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2471,6 +2479,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + Buffer *vmbuffer = NULL; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2517,7 +2526,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_rd, buffer); + if (scan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &hscan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index e64addfdf5d..0d8fea346c5 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -45,6 +45,8 @@ typedef struct bool mark_unused_now; /* whether to attempt freezing tuples */ bool attempt_freeze; + /* whether or not to attempt updating the VM */ + bool attempt_update_vm; const struct VacuumCutoffs *cutoffs; /*------------------------------------------------------- @@ -185,9 +187,13 @@ static void page_verify_redirects(Page page); * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if + * the page is all visible. We will take care of pinning and, if needed, + * reading in the page of the visibility map. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -251,6 +257,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer) { OffsetNumber dummy_off_loc; PruneFreezeResult presult; + int options = 0; + + if (vmbuffer) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + options = HEAP_PAGE_PRUNE_UPDATE_VIS; + } /* * For now, pass mark_unused_now as false regardless of whether or @@ -258,8 +271,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * that during on-access pruning with the current implementation. */ heap_page_prune_and_freeze(relation, buffer, - InvalidBuffer, false, - PRUNE_ON_ACCESS, 0, NULL, + vmbuffer ? 
*vmbuffer : InvalidBuffer, + false, /* blk_known_av */ + PRUNE_ON_ACCESS, options, NULL, vistest, &presult, &dummy_off_loc, NULL, NULL); /* @@ -443,6 +457,8 @@ heap_page_will_set_vis(Relation relation, Buffer heap_buf, Buffer vmbuffer, bool blk_known_av, + PruneReason reason, + bool do_prune, bool do_freeze, PruneState *prstate, uint8 *vmflags, bool *do_set_pd_vis) @@ -450,6 +466,32 @@ heap_page_will_set_vis(Relation relation, Page heap_page = BufferGetPage(heap_buf); bool do_set_vm = false; + *do_set_pd_vis = false; + + if (!prstate->attempt_update_vm) + { + Assert(!prstate->all_visible && !prstate->all_frozen); + Assert(*vmflags == 0); + return false; + } + + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. This situation + * should be rare, as on-access pruning is only attempted when + * pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + prstate->all_visible && + !do_prune && !do_freeze && + (!BufferIsDirty(heap_buf) || XLogCheckBufferNeedsBackup(heap_buf))) + { + prstate->all_visible = prstate->all_frozen = false; + return false; + } + if (prstate->all_visible && !PageIsAllVisible(heap_page)) *do_set_pd_vis = true; @@ -473,6 +515,9 @@ heap_page_will_set_vis(Relation relation, * page-level bit is clear. However, it's possible that in vacuum the bit * got cleared after heap_vac_scan_next_block() was called, so we must * recheck with buffer lock before concluding that the VM is corrupt. + * + * XXX: This will never trigger for on-access pruning because it passes + * blk_known_av as false. Should we remove that condition here? 
*/ else if (blk_known_av && !PageIsAllVisible(heap_page) && visibilitymap_get_status(relation, heap_blk, &vmbuffer) != 0) @@ -615,6 +660,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.vistest = vistest; prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; prstate.attempt_freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; + prstate.attempt_update_vm = (options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0; prstate.cutoffs = cutoffs; /* @@ -692,7 +738,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.all_visible = true; prstate.all_frozen = true; } - else if ((options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0) + else if (prstate.attempt_update_vm) { prstate.all_visible = true; prstate.all_frozen = false; @@ -906,6 +952,14 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.ndead > 0 || prstate.nunused > 0; + /* + * Even if we don't prune anything, if we found a new value for the + * pd_prune_xid field or the page was marked full, we will update the hint + * bit. + */ + do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page); + /* * After processing all the live tuples on the page, if the newest xmin * amongst them is not visible to everyone, the page cannot be @@ -916,14 +970,6 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, !GlobalVisXidVisibleToAll(prstate.vistest, prstate.visibility_cutoff_xid)) prstate.all_visible = prstate.all_frozen = false; - /* - * Even if we don't prune anything, if we found a new value for the - * pd_prune_xid field or the page was marked full, we will update the hint - * bit. - */ - do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || - PageIsFull(page); - /* * Decide if we want to go ahead with freezing according to the freeze * plans we prepared, or not. 
@@ -951,8 +997,6 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, if (prstate.lpdead_items > 0) prstate.all_visible = prstate.all_frozen = false; - Assert(!prstate.all_frozen || prstate.all_visible); - /* * Determine whether or not to set the page level PD_ALL_VISIBLE and the * visibility map bits based on information from the VM and from @@ -968,12 +1012,12 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * As such, it is possible to only update the VM when PD_ALL_VISIBLE is * already set. */ - do_set_pd_vis = false; - do_set_vm = false; - if ((options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0) - do_set_vm = heap_page_will_set_vis(relation, - blockno, buffer, vmbuffer, blk_known_av, - &prstate, &new_vmbits, &do_set_pd_vis); + do_set_vm = heap_page_will_set_vis(relation, + blockno, buffer, vmbuffer, blk_known_av, + reason, do_prune, do_freeze, + &prstate, &new_vmbits, &do_set_pd_vis); + + Assert(!prstate.all_frozen || prstate.all_visible); /* Lock vmbuffer before entering a critical section */ if (do_set_vm) @@ -1134,7 +1178,6 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, bool debug_all_frozen; Assert(prstate.lpdead_items == 0); - Assert(prstate.cutoffs); if (!heap_page_is_all_visible(relation, buffer, prstate.vistest, @@ -2299,8 +2342,8 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * - Reaping: During vacuum phase III, items that are already LP_DEAD are * marked as unused. * - * - VM updates: After vacuum phases I and III, the heap page may be marked - * all-visible and all-frozen. + * - VM updates: After vacuum phases I and III and on-access, the heap page + * may be marked all-visible and all-frozen. * * These changes all happen together, so we use a single WAL record for them * all. 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0492d92d23b..8d582a8eafd 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -289,6 +289,32 @@ index_beginscan(Relation heapRelation, return scan; } +/* + * Similar to index_beginscan(), but allows the caller to indicate whether the + * query modifies the underlying base relation. This is used when the caller + * wants to attempt marking pages in the base relation as all-visible in the + * visibility map during on-access pruning. + */ +IndexScanDesc +index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan(heapRelation, + indexRelation, + snapshot, + instrument, + nkeys, norderbys); + + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + + return scan; +} + /* * index_beginscan_bitmap - start a scan of an index with amgetbitmap * @@ -620,6 +646,26 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, return scan; } +/* + * Parallel version of index_beginscan_vmset() + */ +IndexScanDesc +index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan_parallel(heaprel, indexrel, + instrument, + nkeys, norderbys, + pscan); + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + return scan; +} + /* ---------------- * index_getnext_tid - get the next TID from a scan * diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14..67dbf99f5b5 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -49,6 +49,10 @@ char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; bool 
synchronize_seqscans = true; +/* Helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() */ +static TableScanDesc table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags); + /* ---------------------------------------------------------------------------- * Slot functions. @@ -162,12 +166,14 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } } -TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +/* + * Common helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() + */ +static TableScanDesc +table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_SEQSCAN | - SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -188,6 +194,31 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) pscan, flags); } +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + +/* + * Parallel version of table_beginscan_vmset() + */ +TableScanDesc +table_beginscan_parallel_vmset(Relation relation, ParallelTableScanDesc pscan, + bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + /* ---------------------------------------------------------------------------- * Index scan related functions. 
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 831c55ce787..15be318fd41 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -916,6 +916,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) break; } + /* If it has a rowmark, the relation is modified */ + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rc->rti); + /* Check that relation is a legal target for marking */ if (relation) CheckValidRowMarkRel(relation, rc->markType); diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index fdc65c2b42b..28a06dcd244 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -893,6 +893,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, estate->es_result_relations = (ResultRelInfo **) palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); estate->es_result_relations[rti - 1] = resultRelInfo; + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rti); /* * Saving in the list allows to avoid needlessly traversing the whole diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index bf24f3d7fe0..af6db9f7919 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -105,11 +105,16 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + modifies_rel); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe62..c2ffbd3b08e 100644 --- a/src/backend/executor/nodeIndexscan.c +++ 
b/src/backend/executor/nodeIndexscan.c @@ -102,16 +102,22 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + + bool modifies_base_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - &node->iss_Instrument, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + scandesc = index_beginscan_vmset(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + &node->iss_Instrument, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + modifies_base_rel); node->iss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 94047d29430..fd69275c181 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -65,13 +65,18 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + scandesc = table_beginscan_vmset(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, modifies_rel); + node->ss.ss_currentScanDesc = scandesc; } @@ -366,6 +371,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt) { EState *estate = node->ss.ps.state; + bool modifies_rel; ParallelTableScanDesc pscan; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); @@ -373,8 +379,11 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + modifies_rel = bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, pscan, + modifies_rel); } /* ---------------------------------------------------------------- @@ -404,8 +413,13 @@ ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, + pscan, + modifies_rel); } diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 0831c33b038..87827127d96 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -174,6 +174,11 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + 
int nkeys, int norderbys, bool modifies_heap_rel); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -200,6 +205,12 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); + +extern IndexScanDesc index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_rel); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 34ee323a423..9dcf8d29496 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -94,6 +94,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block to this buffer. + */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -116,8 +123,18 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block to + * this buffer. 
+ */ + Buffer xs_vmbuffer; } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -363,7 +380,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel, /* in heap/pruneheap.c */ struct GlobalVisState; -extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer); extern void heap_page_prune_and_freeze(Relation relation, Buffer buffer, Buffer vmbuffer, bool blk_known_av, PruneReason reason, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0..f496e0b4939 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -121,6 +121,12 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + /* + * Some optimizations can only be performed if the query does not modify + * the underlying relation. Track that here. + */ + bool modifies_base_rel; } IndexFetchTableData; struct IndexScanInstrumentation; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 77eb41eb6dc..6f5d4f9bb65 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -62,6 +62,8 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* whether or not scan should attempt to set the VM */ + SO_ALLOW_VM_SET = 1 << 10, } ScanOptions; /* @@ -881,6 +883,25 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +/* + * Similar to table_beginscan(), but allows the caller to indicate whether the + * query modifies the relation. This is used when the caller wants to attempt + * marking pages in the relation as all-visible in the visibility map during + * on-access pruning. 
+ */ +static inline TableScanDesc +table_beginscan_vmset(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. @@ -918,10 +939,13 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, struct ScanKeyData *key) + int nkeys, struct ScanKeyData *key, bool modifies_rel) { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1130,6 +1154,10 @@ extern void table_parallelscan_initialize(Relation rel, extern TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan); +extern TableScanDesc table_beginscan_parallel_vmset(Relation relation, + ParallelTableScanDesc pscan, + bool modifies_rel); + /* * Restart a parallel scan. Call this in the leader process. 
Caller is * responsible for making sure that all workers have finished the scan diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3a920cc7d17..c854be93436 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -676,6 +676,12 @@ typedef struct EState * ExecDoInitialPruning() */ const char *es_sourceText; /* Source text from QueryDesc */ + /* + * RT indexes of relations modified by the query: result relations + * (INSERT/UPDATE/DELETE/MERGE) and row-marked ones (e.g. SELECT FOR UPDATE) + */ + Bitmapset *es_modified_relids; + JunkFilter *es_junkFilter; /* top-level junk filter, if any */ /* If query can insert/delete tuples, the command ID to mark them with */ diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index c9c182892cf..f5c0c65b260 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -296,6 +296,7 @@ wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 autovacuum = off +hot_standby_feedback = on }); $node_primary->dump_info; $node_primary->start; @@ -745,7 +746,7 @@ check_pg_recvlogical_stderr($handle, $logstart = -s $node_standby->logfile; reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', - 'no_conflict_', 0, 1); + 'no_conflict_', 1, 0); # This should not trigger a conflict wait_until_vacuum_can_remove( -- 2.43.0