From 2e23e94b413f3f812887e9168e3c36e680c76354 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 20 Feb 2025 12:30:26 -0500 Subject: [PATCH v31 2/4] Separate TBM[Shared|Private]Iterator and TBMIterateResult Remove the TBMIterateResult member from the TBMPrivateIterator and TBMSharedIterator and make tbm_[shared_|private_]iterate() take a TBMIterateResult as a parameter. This allows tidbitmap API users to manage multiple TBMIterateResults per scan. This is required for bitmap heap scan to use the read stream API, with which there may be multiple I/Os in flight at once, each one with a TBMIterateResult. Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/d4bb26c9-fe07-439e-ac53-c0e244387e01%40vondra.me --- src/backend/access/gin/ginget.c | 39 +++++----- src/backend/access/gin/ginscan.c | 2 +- src/backend/access/heap/heapam_handler.c | 32 ++++----- src/backend/executor/nodeBitmapHeapscan.c | 39 +++++----- src/backend/nodes/tidbitmap.c | 88 ++++++++++++----------- src/include/access/gin_private.h | 7 +- src/include/nodes/tidbitmap.h | 6 +- 7 files changed, 110 insertions(+), 103 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 9fbe178ad47..ea8ca593890 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -332,7 +332,7 @@ restartScanEntry: entry->list = NULL; entry->nlist = 0; entry->matchBitmap = NULL; - entry->matchResult = NULL; + entry->matchResult.blockno = InvalidBlockNumber; entry->reduceResult = false; entry->predictNumberResult = 0; @@ -824,20 +824,19 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, { /* * If we've exhausted all items on this block, move to next block - * in the bitmap. + * in the bitmap. tbm_private_iterate() sets matchResult.blockno + * to InvalidBlockNumber when the bitmap is exhausted. 
*/ - while (entry->matchResult == NULL || - (entry->matchResult->ntuples >= 0 && - entry->offset >= entry->matchResult->ntuples) || - entry->matchResult->blockno < advancePastBlk || + while ((!BlockNumberIsValid(entry->matchResult.blockno)) || + (entry->matchResult.ntuples >= 0 && + entry->offset >= entry->matchResult.ntuples) || + entry->matchResult.blockno < advancePastBlk || (ItemPointerIsLossyPage(&advancePast) && - entry->matchResult->blockno == advancePastBlk)) + entry->matchResult.blockno == advancePastBlk)) { - entry->matchResult = - tbm_private_iterate(entry->matchIterator); - - if (entry->matchResult == NULL) + if (!tbm_private_iterate(entry->matchIterator, &entry->matchResult)) { + Assert(!BlockNumberIsValid(entry->matchResult.blockno)); ItemPointerSetInvalid(&entry->curItem); tbm_end_private_iterate(entry->matchIterator); entry->matchIterator = NULL; @@ -850,13 +849,13 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * tbm_extract_page_tuple() will set ntuples to the correct * number. */ - if (entry->matchResult->ntuples == -2) - tbm_extract_page_tuple(entry->matchResult, + if (entry->matchResult.ntuples == -2) + tbm_extract_page_tuple(&entry->matchResult, entry->matchTupleOffsets); /* * Reset counter to the beginning of entry->matchResult. Note: - * entry->offset is still greater than matchResult->ntuples if + * entry->offset is still greater than matchResult.ntuples if * matchResult is lossy. So, on next call we will get next * result from TIDBitmap. */ @@ -869,10 +868,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * We're now on the first page after advancePast which has any * items on it. If it's a lossy result, return that. 
*/ - if (entry->matchResult->ntuples < 0) + if (entry->matchResult.ntuples < 0) { ItemPointerSetLossyPage(&entry->curItem, - entry->matchResult->blockno); + entry->matchResult.blockno); /* * We might as well fall out of the loop; we could not @@ -886,17 +885,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * Not a lossy page. Skip over any offsets <= advancePast, and * return that. */ - if (entry->matchResult->blockno == advancePastBlk) + if (entry->matchResult.blockno == advancePastBlk) { /* * First, do a quick check against the last offset on the * page. If that's > advancePast, so are all the other * offsets, so just go back to the top to get the next page. */ - if (entry->matchTupleOffsets[entry->matchResult->ntuples - 1] <= + if (entry->matchTupleOffsets[entry->matchResult.ntuples - 1] <= advancePastOff) { - entry->offset = entry->matchResult->ntuples; + entry->offset = entry->matchResult.ntuples; continue; } @@ -906,7 +905,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, } ItemPointerSet(&entry->curItem, - entry->matchResult->blockno, + entry->matchResult.blockno, entry->matchTupleOffsets[entry->offset]); entry->offset++; diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index 7d1e6615260..625140fdf25 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -106,7 +106,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, ItemPointerSetMin(&scanEntry->curItem); scanEntry->matchBitmap = NULL; scanEntry->matchIterator = NULL; - scanEntry->matchResult = NULL; + scanEntry->matchResult.blockno = InvalidBlockNumber; scanEntry->list = NULL; scanEntry->nlist = 0; scanEntry->offset = InvalidOffsetNumber; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 408d4b44240..058470d0b0f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2126,7 +2126,7 @@ 
heapam_scan_bitmap_next_block(TableScanDesc scan, Buffer buffer; Snapshot snapshot; int ntup; - TBMIterateResult *tbmres; + TBMIterateResult tbmres; TBMOffsets offsets; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); @@ -2141,17 +2141,15 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, { CHECK_FOR_INTERRUPTS(); - tbmres = tbm_iterate(&scan->st.rs_tbmiterator); - - if (tbmres == NULL) + if (!tbm_iterate(&scan->st.rs_tbmiterator, &tbmres)) return false; /* * Exact pages need their tuple offsets extracted. * tbm_extract_page_tuple() will set ntuples to the correct number. */ - if (tbmres->ntuples == -2) - tbm_extract_page_tuple(tbmres, offsets); + if (tbmres.ntuples == -2) + tbm_extract_page_tuple(&tbmres, offsets); /* * Ignore any claimed entries past what we think is the end of the @@ -2162,11 +2160,11 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, * reachable by the index. */ } while (!IsolationIsSerializable() && - tbmres->blockno >= hscan->rs_nblocks); + tbmres.blockno >= hscan->rs_nblocks); /* Got a valid block */ - *blockno = tbmres->blockno; - *recheck = tbmres->recheck; + *blockno = tbmres.blockno; + *recheck = tbmres.recheck; /* * We can skip fetching the heap page if we don't need any fields from the @@ -2174,19 +2172,19 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, * page are visible to our transaction. */ if (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmres->recheck && - VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer)) + !tbmres.recheck && + VM_ALL_VISIBLE(scan->rs_rd, tbmres.blockno, &bscan->rs_vmbuffer)) { /* can't be lossy in the skip_fetch case */ - Assert(tbmres->ntuples >= 0); + Assert(tbmres.ntuples >= 0); Assert(bscan->rs_empty_tuples_pending >= 0); - bscan->rs_empty_tuples_pending += tbmres->ntuples; + bscan->rs_empty_tuples_pending += tbmres.ntuples; return true; } - block = tbmres->blockno; + block = tbmres.blockno; /* * Acquire pin on the target heap page, trading in any pin we held before. 
@@ -2215,7 +2213,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, /* * We need two separate strategies for lossy and non-lossy cases. */ - if (tbmres->ntuples >= 0) + if (tbmres.ntuples >= 0) { /* * Bitmap is non-lossy, so we just look through the offsets listed in @@ -2224,7 +2222,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, */ int curslot; - for (curslot = 0; curslot < tbmres->ntuples; curslot++) + for (curslot = 0; curslot < tbmres.ntuples; curslot++) { OffsetNumber offnum = offsets[curslot]; ItemPointerData tid; @@ -2276,7 +2274,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Assert(ntup <= MaxHeapTuplesPerPage); hscan->rs_ntuples = ntup; - if (tbmres->ntuples >= 0) + if (tbmres.ntuples >= 0) (*exact_pages)++; else (*lossy_pages)++; diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index be0d24d901b..3b4ea0f6144 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -317,7 +317,7 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) { #ifdef USE_PREFETCH ParallelBitmapHeapState *pstate = node->pstate; - TBMIterateResult *tbmpre; + TBMIterateResult tbmpre; if (pstate == NULL) { @@ -330,9 +330,8 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) } else if (!tbm_exhausted(prefetch_iterator)) { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? tbmpre->blockno : - InvalidBlockNumber; + tbm_iterate(prefetch_iterator, &tbmpre); + node->prefetch_blockno = tbmpre.blockno; } return; } @@ -371,9 +370,8 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) */ if (!tbm_exhausted(prefetch_iterator)) { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? 
tbmpre->blockno : - InvalidBlockNumber; + tbm_iterate(prefetch_iterator, &tbmpre); + node->prefetch_blockno = tbmpre.blockno; } } } @@ -441,17 +439,18 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) { while (node->prefetch_pages < node->prefetch_target) { - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + TBMIterateResult tbmpre; bool skip_fetch; - if (tbmpre == NULL) + if (!tbm_iterate(prefetch_iterator, &tbmpre)) { /* No more pages to prefetch */ + Assert(!BlockNumberIsValid(tbmpre.blockno)); tbm_end_iterate(prefetch_iterator); break; } node->prefetch_pages++; - node->prefetch_blockno = tbmpre->blockno; + node->prefetch_blockno = tbmpre.blockno; /* * If we expect not to have to actually read this heap page, @@ -460,13 +459,13 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) * prefetch_pages?) */ skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && + !tbmpre.recheck && VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, + tbmpre.blockno, &node->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); } } @@ -481,7 +480,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) { while (1) { - TBMIterateResult *tbmpre; + TBMIterateResult tbmpre; bool do_prefetch = false; bool skip_fetch; @@ -500,25 +499,25 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) if (!do_prefetch) return; - tbmpre = tbm_iterate(prefetch_iterator); - if (tbmpre == NULL) + if (!tbm_iterate(prefetch_iterator, &tbmpre)) { + Assert(!BlockNumberIsValid(tbmpre.blockno)); /* No more pages to prefetch */ tbm_end_iterate(prefetch_iterator); break; } - node->prefetch_blockno = tbmpre->blockno; + node->prefetch_blockno = tbmpre.blockno; /* As above, skip prefetch if we expect not to need page */ skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && + !tbmpre.recheck && 
VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, + tbmpre.blockno, &node->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); } } } diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 13ba247824d..8756b48b9ec 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -172,7 +172,6 @@ struct TBMPrivateIterator int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; }; /* @@ -213,7 +212,6 @@ struct TBMSharedIterator PTEntryArray *ptbase; /* pagetable element array */ PTIterationArray *ptpages; /* sorted exact page index list */ PTIterationArray *ptchunks; /* sorted lossy page index list */ - TBMIterateResult output; }; /* Local function prototypes */ @@ -950,20 +948,25 @@ tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) /* * tbm_private_iterate - scan through next page of a TIDBitmap * - * Returns a TBMIterateResult representing one page, or NULL if there are - * no more pages to scan. Pages are guaranteed to be delivered in numerical - * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to - * remember the exact tuples to look at on this page --- the caller must - * examine all tuples on the page and check if they meet the intended - * condition. If result->recheck is true, only the indicated tuples need - * be examined, but the condition must be rechecked anyway. (For ease of - * testing, recheck is always set true when ntuples < 0.) + * Caller must pass in a TBMIterateResult to be filled. + * + * Pages are guaranteed to be delivered in numerical order. + * + * Returns false when there are no more pages to scan and true otherwise. When + * there are no more pages to scan, tbmres->blockno is set to + * InvalidBlockNumber. 
+ * + * If tbmres->ntuples < 0, then the bitmap is "lossy" and failed to remember + * the exact tuples to look at on this page --- the caller must examine all + * tuples on the page and check if they meet the intended condition. If + * tbmres->recheck is true, only the indicated tuples need be examined, but + * the condition must be rechecked anyway. (For ease of testing, recheck is + * always set true when ntuples < 0.) */ -TBMIterateResult * -tbm_private_iterate(TBMPrivateIterator *iterator) +bool +tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres) { TIDBitmap *tbm = iterator->tbm; - TBMIterateResult *output = &(iterator->output); Assert(tbm->iterating == TBM_ITERATING_PRIVATE); @@ -1001,11 +1004,11 @@ tbm_private_iterate(TBMPrivateIterator *iterator) chunk_blockno < tbm->spages[iterator->spageptr]->blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; iterator->schunkbit++; - return output; + return true; } } @@ -1019,17 +1022,18 @@ tbm_private_iterate(TBMPrivateIterator *iterator) else page = tbm->spages[iterator->spageptr]; - output->internal_page = page; + tbmres->internal_page = page; /* ntuples will be calculated later */ - output->ntuples = -2; - output->blockno = page->blockno; - output->recheck = page->recheck; + tbmres->ntuples = -2; + tbmres->blockno = page->blockno; + tbmres->recheck = page->recheck; iterator->spageptr++; - return output; + return true; } /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return false; } /* @@ -1039,10 +1043,9 @@ tbm_private_iterate(TBMPrivateIterator *iterator) * across multiple processes. We need to acquire the iterator LWLock, * before accessing the shared members. 
*/ -TBMIterateResult * -tbm_shared_iterate(TBMSharedIterator *iterator) +bool +tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres) { - TBMIterateResult *output = &iterator->output; TBMSharedIteratorState *istate = iterator->state; PagetableEntry *ptbase = NULL; int *idxpages = NULL; @@ -1093,13 +1096,13 @@ tbm_shared_iterate(TBMSharedIterator *iterator) chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; istate->schunkbit++; LWLockRelease(&istate->lock); - return output; + return true; } } @@ -1107,22 +1110,23 @@ tbm_shared_iterate(TBMSharedIterator *iterator) { PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; - output->internal_page = page; + tbmres->internal_page = page; /* ntuples will be calculated later */ - output->ntuples = -2; - output->blockno = page->blockno; - output->recheck = page->recheck; + tbmres->ntuples = -2; + tbmres->blockno = page->blockno; + tbmres->recheck = page->recheck; istate->spageptr++; LWLockRelease(&istate->lock); - return output; + return true; } LWLockRelease(&istate->lock); /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return false; } /* @@ -1596,15 +1600,17 @@ tbm_end_iterate(TBMIterator *iterator) } /* - * Get the next TBMIterateResult from the shared or private bitmap iterator. + * Populate the next TBMIterateResult using the shared or private bitmap + * iterator. Returns false when there is nothing more to scan. 
*/ -TBMIterateResult * -tbm_iterate(TBMIterator *iterator) +bool +tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres) { Assert(iterator); + Assert(tbmres); if (iterator->shared) - return tbm_shared_iterate(iterator->i.shared_iterator); + return tbm_shared_iterate(iterator->i.shared_iterator, tbmres); else - return tbm_private_iterate(iterator->i.private_iterator); + return tbm_private_iterate(iterator->i.private_iterator, tbmres); } diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 9a9461cdfd1..dfda25686be 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -353,7 +353,12 @@ typedef struct GinScanEntryData /* for a partial-match or full-scan query, we accumulate all TIDs here */ TIDBitmap *matchBitmap; TBMPrivateIterator *matchIterator; - TBMIterateResult *matchResult; + + /* + * If blockno is InvalidBlockNumber, all of the other fields in the + * matchResult are meaningless. + */ + TBMIterateResult matchResult; TBMOffsets matchTupleOffsets; /* used for Posting list and one page in Posting tree */ diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 17462fb554f..4a8784d54aa 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -110,8 +110,8 @@ extern bool tbm_is_empty(const TIDBitmap *tbm); extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm); extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm); -extern TBMIterateResult *tbm_private_iterate(TBMPrivateIterator *iterator); -extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); +extern bool tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres); +extern bool tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres); extern void tbm_end_private_iterate(TBMPrivateIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator 
*tbm_attach_shared_iterate(dsa_area *dsa, @@ -122,7 +122,7 @@ extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp); extern void tbm_end_iterate(TBMIterator *iterator); -extern TBMIterateResult *tbm_iterate(TBMIterator *iterator); +extern bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres); static inline bool tbm_exhausted(TBMIterator *iterator) -- 2.34.1