From f7cb1704e5716def42f8b0cdcbb6c390525c4cff Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 29 Jul 2025 14:34:30 -0400 Subject: [PATCH v13 17/20] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum marked pages as all-visible or all-frozen. Supporting this requires passing information about whether the relation is modified from the executor down to the scan descriptor. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. --- src/backend/access/heap/heapam.c | 15 ++++- src/backend/access/heap/heapam_handler.c | 15 ++++- src/backend/access/heap/pruneheap.c | 67 ++++++++++++++----- src/backend/access/index/indexam.c | 46 +++++++++++++ src/backend/access/table/tableam.c | 39 +++++++++-- src/backend/executor/execMain.c | 4 ++ src/backend/executor/execUtils.c | 2 + src/backend/executor/nodeBitmapHeapscan.c | 7 +- src/backend/executor/nodeIndexscan.c | 18 +++-- src/backend/executor/nodeSeqscan.c | 24 +++++-- src/include/access/genam.h | 11 +++ src/include/access/heapam.h | 24 ++++++- src/include/access/relscan.h | 6 ++ src/include/access/tableam.h | 30 ++++++++- src/include/nodes/execnodes.h | 6 ++ .../t/035_standby_logical_decoding.pl | 3 +- 16 files changed, 277 insertions(+), 40 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index dfa9d5a460d..eedc7cb07bf 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -556,6 +556,7 @@ heap_prepare_pagescan(TableScanDesc sscan) Buffer buffer = scan->rs_cbuf; BlockNumber block = scan->rs_cblock; Snapshot snapshot; + Buffer *vmbuffer = NULL; Page page; int lines; bool all_visible; @@ -570,7 +571,9 @@ 
heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + if (sscan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &scan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1247,6 +1250,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1285,6 +1289,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1317,6 +1327,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. 
*/ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb6..f05b9e4968d 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel) hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; return &hscan->xs_base; } @@ -99,6 +100,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -138,7 +145,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + scan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2471,6 +2479,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + Buffer *vmbuffer = NULL; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2517,7 +2526,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_rd, buffer); + if (scan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &hscan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 4f4a0af1f04..7523b936769 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -198,9 +198,13 @@ static bool identify_and_fix_vm_corruption(Relation relation, * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if + * the page is all visible. We will take care of pinning and, if needed, + * reading in the page of the visibility map. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -264,6 +268,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer) { OffsetNumber dummy_off_loc; PruneFreezeResult presult; + int options = 0; + + if (vmbuffer) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + options = HEAP_PAGE_PRUNE_UPDATE_VM; + } /* * For now, pass mark_unused_now as false regardless of whether or @@ -271,9 +282,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * that during on-access pruning with the current implementation. */ heap_page_prune_and_freeze(relation, buffer, false, - InvalidBuffer, - vistest, 0, - NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); + vmbuffer ? *vmbuffer : InvalidBuffer, + vistest, options, + NULL, &presult, PRUNE_ON_ACCESS, + &dummy_off_loc, NULL, NULL); /* * Report the number of tuples reclaimed to pgstats. 
This is @@ -519,12 +531,17 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * all-frozen for use in opportunistic freezing and to update the VM if * the caller requests it. * - * Currently, only VACUUM attempts freezing and setting the VM bits. But - * other callers could do either one. The visibility bookkeeping is - * required for opportunistic freezing (in addition to setting the VM - * bits) because we only consider opportunistically freezing tuples if the - * whole page would become all-frozen or if the whole page will be frozen - * except for dead tuples that will be removed by vacuum. + * Currently, only VACUUM attempts freezing. But other callers could. The + * visibility bookkeeping is required for opportunistic freezing (in + * addition to setting the VM bits) because we only consider + * opportunistically freezing tuples if the whole page would become + * all-frozen or if the whole page will be frozen except for dead tuples + * that will be removed by vacuum. But if consider_update_vm is false, + * we'll not set the VM even if the page is discovered to be all-visible. + * + * If only HEAP_PAGE_PRUNE_UPDATE_VM is passed and not + * HEAP_PAGE_PRUNE_FREEZE, prstate.all_frozen must be initialized to false + * because we will not call heap_prepare_freeze_tuple() on each tuple. * * If only updating the VM, we must initialize all_frozen to false, as * heap_prepare_freeze_tuple() will not be called for each tuple on the @@ -536,7 +553,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * whether or not to freeze but before deciding whether or not to update * the VM so that we don't set the VM bit incorrectly. * - * If not freezing or updating the VM, we otherwise avoid the extra + * If not freezing and not updating the VM, we avoid the extra * bookkeeping. Initializing all_visible to false allows skipping the work * to update them in heap_prune_record_unchanged_lp_normal(). 
*/ @@ -885,12 +902,30 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.all_frozen = false; } + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. This situation + * should be rare, as on-access pruning is only attempted when + * pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + prstate.consider_update_vm && + prstate.all_visible && + !do_prune && !do_freeze && + (!BufferIsDirty(buffer) || XLogCheckBufferNeedsBackup(buffer))) + { + prstate.consider_update_vm = false; + prstate.all_visible = prstate.all_frozen = false; + } + Assert(!prstate.all_frozen || prstate.all_visible); /* - * Handle setting visibility map bit based on information from the VM (as - * of last heap_vac_scan_next_block() call), and from all_visible and - * all_frozen variables. + * Handle setting visibility map bit based on information from the VM (if + * provided, e.g. by vacuum from the last heap_vac_scan_next_block() + * call), and from all_visible and all_frozen variables. */ if (prstate.consider_update_vm) { @@ -2284,8 +2319,8 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * - Reaping: During vacuum phase III, items that are already LP_DEAD are * marked as unused. * - * - VM updates: After vacuum phases I and III, the heap page may be marked - * all-visible and all-frozen. + * - VM updates: After vacuum phases I and III and on-access, the heap page + * may be marked all-visible and all-frozen. * * These changes all happen together, so we use a single WAL record for them * all. 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 86d11f4ec79..4603ece09bd 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -289,6 +289,32 @@ index_beginscan(Relation heapRelation, return scan; } +/* + * Similar to index_beginscan(), but allows the caller to indicate whether the + * query modifies the underlying base relation. This is used when the caller + * wants to attempt marking pages in the base relation as all-visible in the + * visibility map during on-access pruning. + */ +IndexScanDesc +index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan(heapRelation, + indexRelation, + snapshot, + instrument, + nkeys, norderbys); + + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + + return scan; +} + /* * index_beginscan_bitmap - start a scan of an index with amgetbitmap * @@ -620,6 +646,26 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, return scan; } +/* + * Parallel version of index_beginscan_vmset() + */ +IndexScanDesc +index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan_parallel(heaprel, indexrel, + instrument, + nkeys, norderbys, + pscan); + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + return scan; +} + /* ---------------- * index_getnext_tid - get the next TID from a scan * diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14..67dbf99f5b5 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -49,6 +49,10 @@ char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; bool 
synchronize_seqscans = true; +/* Helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() */ +static TableScanDesc table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags); + /* ---------------------------------------------------------------------------- * Slot functions. @@ -162,12 +166,14 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } } -TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +/* + * Common helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() + */ +static TableScanDesc +table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_SEQSCAN | - SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -188,6 +194,31 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) pscan, flags); } +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + +/* + * Parallel version of table_beginscan_vmset() + */ +TableScanDesc +table_beginscan_parallel_vmset(Relation relation, ParallelTableScanDesc pscan, + bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + /* ---------------------------------------------------------------------------- * Index scan related functions. 
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index ff12e2e1364..2e0474c948a 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -916,6 +916,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) break; } + /* If it has a rowmark, the relation is modified */ + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rc->rti); + /* Check that relation is a legal target for marking */ if (relation) CheckValidRowMarkRel(relation, rc->markType); diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index fdc65c2b42b..28a06dcd244 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -893,6 +893,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, estate->es_result_relations = (ResultRelInfo **) palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); estate->es_result_relations[rti - 1] = resultRelInfo; + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rti); /* * Saving in the list allows to avoid needlessly traversing the whole diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index bf24f3d7fe0..af6db9f7919 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -105,11 +105,16 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + modifies_rel); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe62..c2ffbd3b08e 100644 --- a/src/backend/executor/nodeIndexscan.c +++ 
b/src/backend/executor/nodeIndexscan.c @@ -102,16 +102,22 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + + bool modifies_base_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - &node->iss_Instrument, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + scandesc = index_beginscan_vmset(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + &node->iss_Instrument, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + modifies_base_rel); node->iss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 94047d29430..fd69275c181 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -65,13 +65,18 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + scandesc = table_beginscan_vmset(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, modifies_rel); + node->ss.ss_currentScanDesc = scandesc; } @@ -366,6 +371,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt) { EState *estate = node->ss.ps.state; + bool modifies_rel; ParallelTableScanDesc pscan; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); @@ -373,8 +379,11 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + modifies_rel = bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, pscan, + modifies_rel); } /* ---------------------------------------------------------------- @@ -404,8 +413,13 @@ ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, + pscan, + modifies_rel); } diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f..bf272c2c37f 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -180,6 +180,11 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + 
int nkeys, int norderbys, bool modifies_base_rel); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -206,6 +211,12 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); + +extern IndexScanDesc index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_base_rel); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index fcd882cb03b..2210a5e0a79 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -94,6 +94,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block into this buffer. + */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -116,8 +123,18 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block + * into this buffer. 
+ */ + Buffer xs_vmbuffer; } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -374,7 +391,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel, /* in heap/pruneheap.c */ struct GlobalVisState; -extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer); extern void heap_page_prune_and_freeze(Relation relation, Buffer buffer, bool blk_known_av, Buffer vmbuffer, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0..f496e0b4939 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -121,6 +121,12 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + /* + * Some optimizations can only be performed if the query does not modify + * the underlying relation. Track that here. + */ + bool modifies_base_rel; } IndexFetchTableData; struct IndexScanInstrumentation; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index b2ce35e2a34..e31c21cf8eb 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -62,6 +62,8 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* whether or not scan should attempt to set the VM */ + SO_ALLOW_VM_SET = 1 << 10, } ScanOptions; /* @@ -881,6 +883,25 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +/* + * Similar to table_beginscan(), but allows the caller to indicate whether the + * query modifies the relation. This is used when the caller wants to attempt + * marking pages in the relation as all-visible in the visibility map during + * on-access pruning. 
+ */ +static inline TableScanDesc +table_beginscan_vmset(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. @@ -918,10 +939,13 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, struct ScanKeyData *key) + int nkeys, struct ScanKeyData *key, bool modifies_rel) { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1130,6 +1154,10 @@ extern void table_parallelscan_initialize(Relation rel, extern TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan); +extern TableScanDesc table_beginscan_parallel_vmset(Relation relation, + ParallelTableScanDesc pscan, + bool modifies_rel); + /* * Restart a parallel scan. Call this in the leader process. 
Caller is * responsible for making sure that all workers have finished the scan diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index de782014b2d..839c1be1d7c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -678,6 +678,12 @@ typedef struct EState * ExecDoInitialPruning() */ const char *es_sourceText; /* Source text from QueryDesc */ + /* + * RT indexes of relations modified by the query either through + * UPDATE/DELETE/INSERT/MERGE or SELECT FOR UPDATE + */ + Bitmapset *es_modified_relids; + JunkFilter *es_junkFilter; /* top-level junk filter, if any */ /* If query can insert/delete tuples, the command ID to mark them with */ diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index c9c182892cf..f5c0c65b260 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -296,6 +296,7 @@ wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 autovacuum = off +hot_standby_feedback = on }); $node_primary->dump_info; $node_primary->start; @@ -745,7 +746,7 @@ check_pg_recvlogical_stderr($handle, $logstart = -s $node_standby->logfile; reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', - 'no_conflict_', 0, 1); + 'no_conflict_', 1, 0); # This should not trigger a conflict wait_until_vacuum_can_remove( -- 2.43.0