From 5be0a1763837c29e605d751b259d8a5f9336bf85 Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Fri, 14 Jun 2024 10:30:28 -0400
Subject: [PATCH v22 14/19] BitmapHeapScan: Push prefetch code into heap AM

In order to completely remove the layering violation in bitmap table
scan code, we must avoid using the VM for skipping prefetches as well.
To accomplish this, push prefetch code down into the heap
implementation of the bitmap table scan AM functions. This fixes
another layering violation related to prefetching mentioned in
tableam.h.

This commit moves prefetch-related members from the BitmapHeapScanState
to the BitmapHeapScanDesc and localizes the prefetch functions to heap
AM code.

heapam_scan_bitmap_next_block() no longer needs to take blockno as a
parameter because it was only being used in generic bitmap table scan
code to check if the prefetch block was falling behind the current
block.
---
 src/backend/access/heap/heapam.c          |  23 ++-
 src/backend/access/heap/heapam_handler.c  | 217 +++++++++++++---------
 src/backend/executor/nodeBitmapHeapscan.c | 130 ++++---------
 src/include/access/heapam.h               |  28 +--
 src/include/access/relscan.h              |   4 +
 src/include/access/tableam.h              |  42 ++---
 src/include/nodes/execnodes.h             |  14 --
 7 files changed, 235 insertions(+), 223 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 88da737d149..9847ce2c3aa 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1230,7 +1230,8 @@ heap_endscan(TableScanDesc sscan)
 }
 
 BitmapTableScanDesc *
-heap_beginscan_bm(Relation relation, Snapshot snapshot, uint32 flags)
+heap_beginscan_bm(Relation relation, Snapshot snapshot, uint32 flags,
+				  int prefetch_maximum)
 {
 	BitmapHeapScanDesc *scan;
 
@@ -1264,8 +1265,15 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, uint32 flags)
 	scan->vis_ntuples = 0;
 
 	scan->vmbuffer = InvalidBuffer;
+	scan->pvmbuffer = InvalidBuffer;
 	scan->empty_tuples_pending = 0;
 
+	scan->prefetch_maximum = prefetch_maximum;
+
+	/* Only used for serial BHS */
+	scan->prefetch_target = -1;
+	scan->prefetch_pages = 0;
+
 	return (BitmapTableScanDesc *) scan;
 }
 
@@ -1288,6 +1296,12 @@ heap_rescan_bm(BitmapTableScanDesc *sscan)
 
 	scan->cblock = InvalidBlockNumber;
 
+	if (BufferIsValid(scan->pvmbuffer))
+	{
+		ReleaseBuffer(scan->pvmbuffer);
+		scan->pvmbuffer = InvalidBuffer;
+	}
+
 	/*
 	 * Reset empty_tuples_pending, a field only used by bitmap heap scan, to
 	 * avoid incorrectly emitting NULL-filled tuples from a previous scan on
@@ -1299,6 +1313,10 @@ heap_rescan_bm(BitmapTableScanDesc *sscan)
 
 	scan->ctup.t_data = NULL;
 	ItemPointerSetInvalid(&scan->ctup.t_self);
+
+	/* Only used for serial BHS */
+	scan->prefetch_target = -1;
+	scan->prefetch_pages = 0;
 }
 
 void
@@ -1312,6 +1330,9 @@ heap_endscan_bm(BitmapTableScanDesc *sscan)
 	if (BufferIsValid(scan->vmbuffer))
 		ReleaseBuffer(scan->vmbuffer);
 
+	if (BufferIsValid(scan->pvmbuffer))
+		ReleaseBuffer(scan->pvmbuffer);
+
 	/*
 	 * decrement relation reference count and free scan descriptor storage
 	 */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 51aaff6c6de..e06054369d2 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -54,6 +54,9 @@ static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   HeapTuple tuple,
 								   OffsetNumber tupoffset);
 
+static inline void BitmapPrefetch(BitmapHeapScanDesc *scan);
+static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanDesc *scan);
+static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanDesc *scan); static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); static const TableAmRoutine heapam_methods; @@ -2115,19 +2118,19 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, /* * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target */ -void -BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) +static inline void +BitmapPrefetch(BitmapHeapScanDesc *scan) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + ParallelBitmapHeapState *pstate = scan->base.pstate; if (pstate == NULL) { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; + TBMIterator *prefetch_iterator = &scan->base.prefetch_iterator; if (!prefetch_iterator->exhausted) { - while (node->prefetch_pages < node->prefetch_target) + while (scan->prefetch_pages < scan->prefetch_target) { TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); bool skip_fetch; @@ -2138,8 +2141,8 @@ BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) tbm_end_iterate(prefetch_iterator); break; } - node->prefetch_pages++; - node->pfblockno = tbmpre->blockno; + scan->prefetch_pages++; + scan->pfblock = tbmpre->blockno; /* * If we expect not to have to actually read this heap page, @@ -2147,14 +2150,14 @@ BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) * logic normally. (Would it be better not to increment * prefetch_pages?) */ - skip_fetch = (!(scan->flags & SO_NEED_TUPLES) && + skip_fetch = (!(scan->base.flags & SO_NEED_TUPLES) && !tbmpre->recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, + VM_ALL_VISIBLE(scan->base.rel, tbmpre->blockno, - &node->pvmbuffer)); + &scan->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rel, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->base.rel, MAIN_FORKNUM, tbmpre->blockno); } } @@ -2163,7 +2166,7 @@ BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) if (pstate->prefetch_pages < pstate->prefetch_target) { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; + TBMIterator *prefetch_iterator = &scan->base.prefetch_iterator; if (!prefetch_iterator->exhausted) { @@ -2196,17 +2199,17 @@ BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) break; } - node->pfblockno = tbmpre->blockno; + scan->pfblock = tbmpre->blockno; /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (!(scan->flags & SO_NEED_TUPLES) && + skip_fetch = (!(scan->base.flags & SO_NEED_TUPLES) && !tbmpre->recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, + VM_ALL_VISIBLE(scan->base.rel, tbmpre->blockno, - &node->pvmbuffer)); + &scan->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rel, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->base.rel, MAIN_FORKNUM, tbmpre->blockno); } } } @@ -2220,27 +2223,27 @@ BitmapPrefetch(BitmapHeapScanState *node, BitmapTableScanDesc *scan) * iterator in prefetch_pages. For each block the main iterator returns, we * decrement prefetch_pages. 
*/ -void -BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) +static inline void +BitmapAdjustPrefetchIterator(BitmapHeapScanDesc *scan) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + ParallelBitmapHeapState *pstate = scan->base.pstate; TBMIterateResult *tbmpre; if (pstate == NULL) { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; + TBMIterator *prefetch_iterator = &scan->base.prefetch_iterator; - if (node->prefetch_pages > 0) + if (scan->prefetch_pages > 0) { /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; + scan->prefetch_pages--; } else if (!prefetch_iterator->exhausted) { /* Do not let the prefetch iterator get behind the main one */ tbmpre = tbm_iterate(prefetch_iterator); - node->pfblockno = tbmpre ? tbmpre->blockno : InvalidBlockNumber; + scan->pfblock = tbmpre ? tbmpre->blockno : InvalidBlockNumber; } return; } @@ -2254,9 +2257,9 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) * Note that moving the call site of BitmapAdjustPrefetchIterator() * exacerbates the effects of this bug. */ - if (node->prefetch_maximum > 0) + if (scan->prefetch_maximum > 0) { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; + TBMIterator *prefetch_iterator = &scan->base.prefetch_iterator; SpinLockAcquire(&pstate->mutex); if (pstate->prefetch_pages > 0) @@ -2280,7 +2283,7 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) if (!prefetch_iterator->exhausted) { tbmpre = tbm_iterate(prefetch_iterator); - node->pfblockno = tbmpre ? tbmpre->blockno : InvalidBlockNumber; + scan->pfblock = tbmpre ? tbmpre->blockno : InvalidBlockNumber; } } } @@ -2295,33 +2298,33 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) * page/tuple, then to one after the second tuple is fetched, then * it doubles as later pages are fetched. */ -void -BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) +static inline void +BitmapAdjustPrefetchTarget(BitmapHeapScanDesc *scan) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + ParallelBitmapHeapState *pstate = scan->base.pstate; if (pstate == NULL) { - if (node->prefetch_target >= node->prefetch_maximum) + if (scan->prefetch_target >= scan->prefetch_maximum) /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; + else if (scan->prefetch_target >= scan->prefetch_maximum / 2) + scan->prefetch_target = scan->prefetch_maximum; + else if (scan->prefetch_target > 0) + scan->prefetch_target *= 2; else - node->prefetch_target++; + scan->prefetch_target++; return; } /* Do an unlocked check first to save spinlock acquisitions. 
*/ - if (pstate->prefetch_target < node->prefetch_maximum) + if (pstate->prefetch_target < scan->prefetch_maximum) { SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) + if (pstate->prefetch_target >= scan->prefetch_maximum) /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; + else if (pstate->prefetch_target >= scan->prefetch_maximum / 2) + pstate->prefetch_target = scan->prefetch_maximum; else if (pstate->prefetch_target > 0) pstate->prefetch_target *= 2; else @@ -2331,34 +2334,36 @@ BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) #endif /* USE_PREFETCH */ } + /* ------------------------------------------------------------------------ * Executor related callbacks for the heap AM * ------------------------------------------------------------------------ */ static bool -heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan, - BlockNumber *blockno, bool *recheck, +heapam_scan_bitmap_next_block(BitmapTableScanDesc *sscan, + bool *recheck, long *lossy_pages, long *exact_pages) { - BitmapHeapScanDesc *hscan = (BitmapHeapScanDesc *) scan; + BitmapHeapScanDesc *scan = (BitmapHeapScanDesc *) sscan; BlockNumber block; Buffer buffer; Snapshot snapshot; int ntup; TBMIterateResult *tbmres; - hscan->vis_idx = 0; - hscan->vis_ntuples = 0; + scan->vis_idx = 0; + scan->vis_ntuples = 0; - *blockno = InvalidBlockNumber; *recheck = true; + BitmapAdjustPrefetchIterator(scan); + do { CHECK_FOR_INTERRUPTS(); - tbmres = tbm_iterate(&scan->iterator); + tbmres = tbm_iterate(&scan->base.iterator); if (tbmres == NULL) return false; @@ -2371,10 +2376,10 @@ heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan, * isolation though, as we need to examine all invisible tuples * reachable by the index. */ - } while (!IsolationIsSerializable() && tbmres->blockno >= hscan->nblocks); + } while (!IsolationIsSerializable() && tbmres->blockno >= scan->nblocks); /* Got a valid block */ - *blockno = tbmres->blockno; + block = tbmres->blockno; *recheck = tbmres->recheck; /* @@ -2382,37 +2387,35 @@ heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan, * heap, the bitmap entries don't need rechecking, and all tuples on the * page are visible to our transaction. */ - if (!(scan->flags & SO_NEED_TUPLES) && + if (!(scan->base.flags & SO_NEED_TUPLES) && !tbmres->recheck && - VM_ALL_VISIBLE(scan->rel, tbmres->blockno, &hscan->vmbuffer)) + VM_ALL_VISIBLE(scan->base.rel, tbmres->blockno, &scan->vmbuffer)) { /* can't be lossy in the skip_fetch case */ Assert(tbmres->ntuples >= 0); - Assert(hscan->empty_tuples_pending >= 0); + Assert(scan->empty_tuples_pending >= 0); - hscan->empty_tuples_pending += tbmres->ntuples; + scan->empty_tuples_pending += tbmres->ntuples; return true; } - block = tbmres->blockno; - /* * Acquire pin on the target heap page, trading in any pin we held before. */ - hscan->cbuf = ReleaseAndReadBuffer(hscan->cbuf, - scan->rel, - block); - hscan->cblock = block; - buffer = hscan->cbuf; - snapshot = scan->snapshot; + scan->cbuf = ReleaseAndReadBuffer(scan->cbuf, + scan->base.rel, + block); + scan->cblock = block; + buffer = scan->cbuf; + snapshot = scan->base.snapshot; ntup = 0; /* * Prune and repair fragmentation for the whole page, if possible. 
 	 */
-	heap_page_prune_opt(scan->rel, buffer);
+	heap_page_prune_opt(scan->base.rel, buffer);
 
 	/*
 	 * We must hold share lock on the buffer content while examining tuple
@@ -2440,9 +2443,9 @@ heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan,
 			HeapTupleData heapTuple;
 
 			ItemPointerSet(&tid, block, offnum);
-			if (heap_hot_search_buffer(&tid, scan->rel, buffer, snapshot,
+			if (heap_hot_search_buffer(&tid, scan->base.rel, buffer, snapshot,
 									   &heapTuple, NULL, true))
-				hscan->vis_tuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
+				scan->vis_tuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
 		}
 	}
 	else
@@ -2466,16 +2469,16 @@ heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan,
 				continue;
 			loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
 			loctup.t_len = ItemIdGetLength(lp);
-			loctup.t_tableOid = scan->rel->rd_id;
+			loctup.t_tableOid = scan->base.rel->rd_id;
 			ItemPointerSet(&loctup.t_self, block, offnum);
 			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
 			if (valid)
 			{
-				hscan->vis_tuples[ntup++] = offnum;
-				PredicateLockTID(scan->rel, &loctup.t_self, snapshot,
+				scan->vis_tuples[ntup++] = offnum;
+				PredicateLockTID(scan->base.rel, &loctup.t_self, snapshot,
 								 HeapTupleHeaderGetXmin(loctup.t_data));
 			}
-			HeapCheckForSerializableConflictOut(valid, scan->rel, &loctup,
+			HeapCheckForSerializableConflictOut(valid, scan->base.rel, &loctup,
 												buffer, snapshot);
 		}
 	}
@@ -2483,17 +2486,29 @@ heapam_scan_bitmap_next_block(BitmapTableScanDesc *scan,
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 	Assert(ntup <= MaxHeapTuplesPerPage);
-	hscan->vis_ntuples = ntup;
+	scan->vis_ntuples = ntup;
 
 	if (tbmres->ntuples < 0)
 		(*lossy_pages)++;
 	else
 		(*exact_pages)++;
 
+	/*
+	 * If serial, we can error out if the prefetch block doesn't stay
+	 * ahead of the current block.
+	 */
+	if (scan->base.pstate == NULL &&
+		!scan->base.prefetch_iterator.exhausted &&
+		scan->pfblock < block)
+		elog(ERROR, "prefetch and main iterators are out of sync");
+
+	/* Adjust the prefetch target */
+	BitmapAdjustPrefetchTarget(scan);
+
 	/*
 	 * Return true to indicate that a valid block was found and the bitmap is
 	 * not exhausted. If there are no visible tuples on this page,
-	 * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
+	 * scan->vis_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
 	 * return false returning control to this function to advance to the next
 	 * block in the bitmap.
 	 */
@@ -2501,51 +2516,83 @@
 static bool
-heapam_scan_bitmap_next_tuple(BitmapTableScanDesc *scan,
+heapam_scan_bitmap_next_tuple(BitmapTableScanDesc *sscan,
 							  TupleTableSlot *slot)
 {
-	BitmapHeapScanDesc *hscan = (BitmapHeapScanDesc *) scan;
+	BitmapHeapScanDesc *scan = (BitmapHeapScanDesc *) sscan;
+	ParallelBitmapHeapState *pstate = sscan->pstate;
 	OffsetNumber targoffset;
 	Page		page;
 	ItemId		lp;
 
-	if (hscan->empty_tuples_pending > 0)
+	if (scan->empty_tuples_pending > 0)
 	{
 		/*
 		 * If we don't have to fetch the tuple, just return nulls.
 		 */
		ExecStoreAllNullTuple(slot);
-		hscan->empty_tuples_pending--;
+		scan->empty_tuples_pending--;
+		BitmapPrefetch(scan);
 		return true;
 	}
 
 	/*
 	 * Out of range?
If so, nothing more to look at on this page */ - if (hscan->vis_idx < 0 || hscan->vis_idx >= hscan->vis_ntuples) + if (scan->vis_idx < 0 || scan->vis_idx >= scan->vis_ntuples) return false; - targoffset = hscan->vis_tuples[hscan->vis_idx]; - page = BufferGetPage(hscan->cbuf); +#ifdef USE_PREFETCH + + /* + * Try to prefetch at least a few pages even before we get to the second + * page if we don't stop reading after the first tuple. + */ + if (!pstate) + { + if (scan->prefetch_target < scan->prefetch_maximum) + scan->prefetch_target++; + } + else if (pstate->prefetch_target < scan->prefetch_maximum) + { + /* take spinlock while updating shared state */ + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_target < scan->prefetch_maximum) + pstate->prefetch_target++; + SpinLockRelease(&pstate->mutex); + } +#endif /* USE_PREFETCH */ + + /* + * We issue prefetch requests *after* fetching the current page to try to + * avoid having prefetching interfere with the main I/O. Also, this should + * happen only when we have determined there is still something to do on + * the current page, else we may uselessly prefetch the same page we are + * just about to request for real. + */ + BitmapPrefetch(scan); + + targoffset = scan->vis_tuples[scan->vis_idx]; + page = BufferGetPage(scan->cbuf); lp = PageGetItemId(page, targoffset); Assert(ItemIdIsNormal(lp)); - hscan->ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - hscan->ctup.t_len = ItemIdGetLength(lp); - hscan->ctup.t_tableOid = scan->rel->rd_id; - ItemPointerSet(&hscan->ctup.t_self, hscan->cblock, targoffset); + scan->ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + scan->ctup.t_len = ItemIdGetLength(lp); + scan->ctup.t_tableOid = sscan->rel->rd_id; + ItemPointerSet(&scan->ctup.t_self, scan->cblock, targoffset); - pgstat_count_heap_fetch(scan->rel); + pgstat_count_heap_fetch(sscan->rel); /* * Set up the result slot to point to this tuple. Note that the slot * acquires a pin on the buffer. */ - ExecStoreBufferHeapTuple(&hscan->ctup, + ExecStoreBufferHeapTuple(&scan->ctup, slot, - hscan->cbuf); + scan->cbuf); - hscan->vis_idx++; + scan->vis_idx++; return true; } diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index b7b03a767cb..e3b8e8baa6a 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -37,8 +37,6 @@ #include -/* XXX: temporary include only for review purposes */ -#include "access/heapam.h" #include "access/relscan.h" #include "access/tableam.h" #include "access/visibilitymap.h" @@ -81,17 +79,21 @@ BitmapHeapNext(BitmapHeapScanState *node) /* * If we haven't yet performed the underlying index scan, do it, and begin * the iteration over the bitmap. - * - * For prefetching, we use *two* iterators, one for the pages we are - * actually scanning and another that runs ahead of the first for - * prefetching. node->prefetch_pages tracks exactly how many pages ahead - * the prefetch iterator is. Also, node->prefetch_target tracks the - * desired prefetch distance, which starts small and increases up to the - * node->prefetch_maximum. This is to avoid doing a lot of prefetching in - * a scan that stops after a few tuples because of a LIMIT. */ if (!node->initialized) { + int prefetch_maximum = 0; + + /* + * Maximum number of prefetches for the tablespace if configured, + * otherwise the current value of the effective_io_concurrency GUC. 
+ */ +#ifdef USE_PREFETCH + Relation rel = node->ss.ss_currentRelation; + + prefetch_maximum = get_tablespace_io_concurrency(rel->rd_rel->reltablespace); +#endif + if (!pstate) { node->tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); @@ -110,6 +112,11 @@ BitmapHeapNext(BitmapHeapScanState *node) if (!node->tbm || !IsA(node->tbm, TIDBitmap)) elog(ERROR, "unrecognized result from subplan"); + /* + * Two iterators are used -- one for the pages being scanned and + * one for the blocks being prefetched. + */ + /* * Prepare to iterate over the TBM. This will return the * dsa_pointer of the iterator state which will be used by @@ -117,7 +124,7 @@ BitmapHeapNext(BitmapHeapScanState *node) */ pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm); #ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) + if (prefetch_maximum > 0) { pstate->prefetch_iterator = tbm_prepare_shared_iterate(node->tbm); @@ -127,14 +134,6 @@ BitmapHeapNext(BitmapHeapScanState *node) BitmapDoneInitializingSharedState(pstate); } -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - tbm_begin_iterate(&node->prefetch_iterator, node->tbm, dsa, - pstate ? - pstate->prefetch_iterator : - InvalidDsaPointer); -#endif /* USE_PREFETCH */ - /* * If this is the first scan of the underlying table, create the table * scan descriptor and begin the scan. @@ -156,7 +155,9 @@ BitmapHeapNext(BitmapHeapScanState *node) scan = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, - need_tuples); + pstate, + need_tuples, + prefetch_maximum); node->scandesc = scan; node->scan_in_progress = true; } @@ -173,6 +174,23 @@ BitmapHeapNext(BitmapHeapScanState *node) pstate->tbmiterator : InvalidDsaPointer); + /* + * We use *two* iterators, one for the pages we are actually scanning + * and another that runs ahead of the first for prefetching. + * scan->prefetch_pages tracks exactly how many pages ahead the + * prefetch iterator is. Also, scan->prefetch_target tracks the + * desired prefetch distance, which starts small and increases up to + * the prefetch_maximum. This is to avoid doing a lot of prefetching + * in a scan that stops after a few tuples because of a LIMIT. + */ +#ifdef USE_PREFETCH + if (prefetch_maximum > 0) + tbm_begin_iterate(&scan->prefetch_iterator, node->tbm, dsa, + pstate ? + pstate->prefetch_iterator : + InvalidDsaPointer); +#endif /* USE_PREFETCH */ + node->initialized = true; goto new_page; @@ -188,37 +206,6 @@ BitmapHeapNext(BitmapHeapScanState *node) CHECK_FOR_INTERRUPTS(); -#ifdef USE_PREFETCH - - /* - * Try to prefetch at least a few pages even before we get to the - * second page if we don't stop reading after the first tuple. - */ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ - - /* - * We issue prefetch requests *after* fetching the current page to - * try to avoid having prefetching interfere with the main I/O. - * Also, this should happen only when we have determined there is - * still something to do on the current page, else we may - * uselessly prefetch the same page we are just about to request - * for real. 
- */ - BitmapPrefetch(node, scan); - /* * If we are using lossy info, we have to recheck the qual * conditions at every tuple. @@ -241,23 +228,9 @@ BitmapHeapNext(BitmapHeapScanState *node) new_page: - BitmapAdjustPrefetchIterator(node); - - if (!table_scan_bitmap_next_block(scan, &node->blockno, &node->recheck, + if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->lossy_pages, &node->exact_pages)) break; - - /* - * If private, we can error out if the the prefetch block doesn't stay - * ahead of the current block. - */ - if (node->pstate == NULL && - !node->prefetch_iterator.exhausted && - node->pfblockno < node->blockno) - elog(ERROR, "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d", node->pfblockno, node->blockno); - - /* Adjust the prefetch target */ - BitmapAdjustPrefetchTarget(node); } /* @@ -323,20 +296,11 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) PlanState *outerPlan = outerPlanState(node); /* release bitmaps and buffers if any */ - tbm_end_iterate(&node->prefetch_iterator); if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); node->tbm = NULL; node->initialized = false; - node->pvmbuffer = InvalidBuffer; node->recheck = true; - node->blockno = InvalidBlockNumber; - node->pfblockno = InvalidBlockNumber; - /* Only used for serial BHS */ - node->prefetch_pages = 0; - node->prefetch_target = -1; ExecScanReScan(&node->ss); @@ -374,17 +338,15 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) if (scanDesc) { tbm_end_iterate(&scanDesc->iterator); + tbm_end_iterate(&scanDesc->prefetch_iterator); table_endscan_bm(scanDesc); } /* * release bitmaps and buffers if any */ - tbm_end_iterate(&node->prefetch_iterator); if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); } /* ---------------------------------------------------------------- @@ -417,17 +379,12 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; scanstate->tbm = NULL; - scanstate->pvmbuffer = InvalidBuffer; scanstate->exact_pages = 0; scanstate->lossy_pages = 0; - scanstate->prefetch_pages = 0; - scanstate->prefetch_target = -1; scanstate->initialized = false; scanstate->scan_in_progress = false; scanstate->pstate = NULL; scanstate->recheck = true; - scanstate->blockno = InvalidBlockNumber; - scanstate->pfblockno = InvalidBlockNumber; /* * Miscellaneous initialization @@ -467,13 +424,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->bitmapqualorig = ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); - /* - * Maximum number of prefetches for the tablespace if configured, - * otherwise the current value of the effective_io_concurrency GUC. 
- */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); - scanstate->ss.ss_currentRelation = currentRelation; /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 92f7dd7390c..285c46db54a 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -20,8 +20,6 @@ #include "access/skey.h" #include "access/table.h" /* for backward compatibility */ #include "access/tableam.h" -/* XXX: temporary include so lift and shift works */ -#include "nodes/execnodes.h" #include "nodes/lockoptions.h" #include "nodes/primnodes.h" #include "storage/bufpage.h" @@ -116,6 +114,17 @@ typedef struct BitmapHeapScanDesc BlockNumber cblock; /* current block # in scan, if any */ + /* used to validate pf stays ahead of current block */ + BlockNumber pfblock; + + /* maximum value for prefetch_target */ + int prefetch_maximum; + + /* Current target for prefetch distance */ + int prefetch_target; + /* # pages prefetch iterator is ahead of current */ + int prefetch_pages; + /* * These fields are only used for bitmap scans for the "skip fetch" * optimization. Bitmap scans needing no fields from the heap may skip @@ -123,7 +132,11 @@ typedef struct BitmapHeapScanDesc * block reported by the bitmap to determine how many NULL-filled tuples * to return. They are common to parallel and serial BitmapHeapScans */ + + /* page of VM containing info for current block */ Buffer vmbuffer; + /* page of VM containing info for prefetch block */ + Buffer pvmbuffer; int empty_tuples_pending; } BitmapHeapScanDesc; @@ -317,7 +330,8 @@ extern void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, extern void heap_endscan(TableScanDesc sscan); extern BitmapTableScanDesc *heap_beginscan_bm(Relation relation, - Snapshot snapshot, uint32 flags); + Snapshot snapshot, uint32 flags, + int prefetch_maximum); extern void heap_rescan_bm(BitmapTableScanDesc *sscan); void heap_endscan_bm(BitmapTableScanDesc *sscan); @@ -388,14 +402,6 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); -/* in heapam_handler.c */ -extern void BitmapPrefetch(BitmapHeapScanState *node, - BitmapTableScanDesc *scan); - -extern void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node); - -extern void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); - /* in heap/pruneheap.c */ struct GlobalVisState; extern void heap_page_prune_opt(Relation relation, Buffer buffer); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 086fce35a8b..079bce61da8 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -57,10 +57,14 @@ typedef struct BitmapTableScanDesc Relation rel; /* heap relation descriptor */ struct SnapshotData *snapshot; /* snapshot to see */ + struct ParallelBitmapHeapState *pstate; + /* * Members common to Parallel and Serial BitmapTableScans */ TBMIterator iterator; + /* iterator for prefetching ahead of current page */ + TBMIterator prefetch_iterator; /* * Information about type and behaviour of the scan, a bitmask of members diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index f7f099b6449..a00b9bd296a 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -354,7 +354,7 @@ typedef struct TableAmRoutine */ BitmapTableScanDesc *(*scan_begin_bm) (Relation rel, Snapshot snapshot, - uint32 flags); + uint32 flags, int prefetch_maximum); void 
(*scan_rescan_bm) (BitmapTableScanDesc *scan); @@ -808,33 +808,26 @@ typedef struct TableAmRoutine */ /* - * Prepare to fetch / check / return tuples from `blockno` as part of a - * bitmap table scan. `scan` was started via table_beginscan_bm(). Return - * false if the bitmap is exhausted and true otherwise. + * Prepare to fetch / check / return tuples from as part of a bitmap table + * scan. `scan` was started via table_beginscan_bm(). Return false if the + * bitmap is exhausted and true otherwise. * - * This will typically read and pin the target block, and do the necessary - * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might - * make sense to perform tuple visibility checks at this time). + * This will typically read and pin a block, and do the necessary work to + * allow scan_bitmap_next_tuple() to return tuples (e.g. depending on the + * table AM, it might make sense to perform tuple visibility checks at + * this time). * * `lossy_pages` is incremented if the bitmap is lossy for the selected * block; otherwise, `exact_pages` is incremented. * - * XXX: Currently this may only be implemented if the AM uses md.c as its - * storage manager, and uses ItemPointer->ip_blkid in a manner that maps - * blockids directly to the underlying storage. nodeBitmapHeapscan.c - * performs prefetching directly using that interface. This probably - * needs to be rectified at a later point. - * - * XXX: Currently this may only be implemented if the AM uses the - * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to - * perform prefetching. This probably needs to be rectified at a later - * point. + * Prefetching future blocks indicated in the bitmap is left to the table + * AM. * * Optional callback, but either both scan_bitmap_next_block and * scan_bitmap_next_tuple need to exist, or neither. 
*/ bool (*scan_bitmap_next_block) (BitmapTableScanDesc *scan, - BlockNumber *blockno, bool *recheck, + bool *recheck, long *lossy_pages, long *exact_pages); /* @@ -968,14 +961,19 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline BitmapTableScanDesc * table_beginscan_bm(Relation rel, Snapshot snapshot, - bool need_tuple) + struct ParallelBitmapHeapState *pstate, + bool need_tuple, + int prefetch_maximum) { + BitmapTableScanDesc *result; uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; if (need_tuple) flags |= SO_NEED_TUPLES; - return rel->rd_tableam->scan_begin_bm(rel, snapshot, flags); + result = rel->rd_tableam->scan_begin_bm(rel, snapshot, flags, prefetch_maximum); + result->pstate = pstate; + return result; } /* @@ -2004,7 +2002,7 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, */ static inline bool table_scan_bitmap_next_block(BitmapTableScanDesc *scan, - BlockNumber *blockno, bool *recheck, + bool *recheck, long *lossy_pages, long *exact_pages) { @@ -2017,7 +2015,7 @@ table_scan_bitmap_next_block(BitmapTableScanDesc *scan, elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); return scan->rel->rd_tableam->scan_bitmap_next_block(scan, - blockno, recheck, + recheck, lossy_pages, exact_pages); } diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0f34bac4baf..6b1c3bbe6fd 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1796,19 +1796,12 @@ typedef struct ParallelBitmapHeapState * scandesc current scan descriptor for scan (NULL if none) * bitmapqualorig execution state for bitmapqualorig expressions * tbm bitmap obtained from child index scan(s) - * pvmbuffer buffer for visibility-map lookups of prefetched pages * exact_pages total number of exact pages retrieved * lossy_pages total number of lossy pages retrieved - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance - * prefetch_maximum maximum value for prefetch_target * initialized is node is ready to iterate * scan_in_progress is this a rescan * pstate shared state for parallel bitmap scan * recheck do current page's tuples need recheck - * blockno used to validate pf and current block in sync - * pfblockno used to validate pf stays ahead of current block * ---------------- */ typedef struct BitmapHeapScanState @@ -1817,19 +1810,12 @@ typedef struct BitmapHeapScanState struct BitmapTableScanDesc *scandesc; ExprState *bitmapqualorig; TIDBitmap *tbm; - Buffer pvmbuffer; long exact_pages; long lossy_pages; - int prefetch_pages; - int prefetch_target; - int prefetch_maximum; bool initialized; bool scan_in_progress; - TBMIterator prefetch_iterator; ParallelBitmapHeapState *pstate; bool recheck; - BlockNumber blockno; - BlockNumber pfblockno; } BitmapHeapScanState; /* ---------------- -- 2.34.1
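
Illustration (not part of the patch): the snippet below is a minimal,
hypothetical sketch of the serial-scan prefetch-distance ramp-up that this
patch relocates into the heap AM. Only the growth rule is taken from the
serial branch of BitmapAdjustPrefetchTarget(); the surrounding harness
(adjust_prefetch_target(), the prefetch_maximum of 10, and main()) is
invented for illustration, and the extra per-tuple bump performed in
heapam_scan_bitmap_next_tuple() is omitted.

#include <stdio.h>

/* Growth rule mirrored from the serial branch of BitmapAdjustPrefetchTarget() */
static void
adjust_prefetch_target(int *prefetch_target, int prefetch_maximum)
{
	if (*prefetch_target >= prefetch_maximum)
		 /* don't increase any further */ ;
	else if (*prefetch_target >= prefetch_maximum / 2)
		*prefetch_target = prefetch_maximum;
	else if (*prefetch_target > 0)
		*prefetch_target *= 2;
	else
		(*prefetch_target)++;
}

int
main(void)
{
	int		prefetch_maximum = 10;	/* stand-in for the tablespace IO concurrency */
	int		prefetch_target = -1;	/* initial value set in heap_beginscan_bm() */

	/* one adjustment per block returned by the main iterator */
	for (int block = 0; block < 8; block++)
	{
		adjust_prefetch_target(&prefetch_target, prefetch_maximum);
		printf("block %d: prefetch_target = %d\n", block, prefetch_target);
	}
	return 0;
}

With prefetch_maximum = 10 this prints a target sequence of 0, 1, 2, 4, 8,
10, 10, 10, i.e. the distance starts small and doubles up to the maximum, as
described in the comments the patch moves into heapam_handler.c.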