From b0b775d1f8e29be04c65c4030dbe0a38b5436dbe Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 1 May 2025 20:22:55 +0200 Subject: [PATCH v20250709 1/6] index prefetch infrastructure Extends the AM interface to allow optional use of ReadStream for heap access. The main changes are: * The AM may create the stream, and set it into scan->xs_rs, which indicates the heap should be accessed this way. For this the AM needs to also define the read_next callback, accessing the private "opaque" part of the scan state. Note: This patch does not implement this for any existing AMs, beyond updating the signature. That comes in later patches. * To create a read_stream, the ambeginscan() callback gets that as the first argument now. * The indexam.c then passes scan->xs_rs to index_fetch_begin(). If the pointer is NULL, the regular ReadBuffer() interface will be used. * A new GUC "enable_indexscan_prefetch" (default: on) is introduced, to make experimentation easier. Not sure we want to keep this. * The executor layer is almost untouched, except for index-only scans. The read_next callback needs to skip all-visible pages (we don't want to prefetch those), and the two sides (read_next and index_fetch_heap) need to have the same visibility result, even if the VM gets updated in between. The new scan->xs_visible flag is used to pass this to the executor. * The AM has to determine the block visibility before returning the block from the read_next callback (and not return it if it's all visible). And it needs to remember the value and use it when returning the TID/tuple from amgettuple. These two places have to agree on the visibility. * The xs_visible has to be set even by AMs that don't support the read_stream. Maybe this should be rethought, and indexam.c should do that when xs_rs=NULL. * The read_next callback must not return duplicate blocks, i.e. blocks that are exactly the same as the last returned block. 
That would break the optimization that we don't read/pin blocks unnecessarily. --- contrib/bloom/bloom.h | 2 +- contrib/bloom/blscan.c | 4 +- doc/src/sgml/indexam.sgml | 35 ++++++- src/backend/access/brin/brin.c | 8 +- src/backend/access/gin/ginscan.c | 4 +- src/backend/access/gist/gistscan.c | 4 +- src/backend/access/hash/hash.c | 4 +- src/backend/access/heap/heapam_handler.c | 94 ++++++++++++++++++- src/backend/access/index/genam.c | 1 + src/backend/access/index/indexam.c | 21 +++-- src/backend/access/nbtree/nbtree.c | 6 +- src/backend/access/spgist/spgscan.c | 12 +-- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 3 +- src/backend/executor/nodeIndexonlyscan.c | 10 +- src/backend/optimizer/path/costsize.c | 1 + src/backend/storage/buffer/bufmgr.c | 40 ++++++++ src/backend/utils/misc/guc_tables.c | 10 ++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/amapi.h | 3 +- src/include/access/brin_internal.h | 3 +- src/include/access/gin_private.h | 3 +- src/include/access/gistscan.h | 3 +- src/include/access/hash.h | 3 +- src/include/access/nbtree.h | 3 +- src/include/access/relscan.h | 4 + src/include/access/spgist.h | 3 +- src/include/access/tableam.h | 12 ++- src/include/optimizer/cost.h | 1 + src/include/storage/bufmgr.h | 2 + .../modules/dummy_index_am/dummy_index_am.c | 4 +- src/test/regress/expected/sysviews.out | 3 +- 32 files changed, 254 insertions(+), 55 deletions(-) diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index 648167045f4..00d8be39953 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -190,7 +190,7 @@ extern bool blinsert(Relation index, Datum *values, bool *isnull, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); -extern IndexScanDesc blbeginscan(Relation r, int nkeys, int norderbys); +extern IndexScanDesc blbeginscan(Relation heap, Relation index, int nkeys, int norderbys); extern int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern 
void blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index d072f47fe28..ba84bf6a0c5 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -22,12 +22,12 @@ * Begin scan of bloom index. */ IndexScanDesc -blbeginscan(Relation r, int nkeys, int norderbys) +blbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; BloomScanOpaque so; - scan = RelationGetIndexScan(r, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData)); initBloomState(&so->state, scan->indexRelation); diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 1aa4741a8ea..b15bb241f5b 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -657,7 +657,8 @@ amadjustmembers (Oid opfamilyoid, IndexScanDesc -ambeginscan (Relation indexRelation, +ambeginscan (Relation heapRelation, + Relation indexRelation, int nkeys, int norderbys); @@ -674,6 +675,38 @@ ambeginscan (Relation indexRelation, the interesting parts of index-scan startup are in amrescan. + + The index scan may opt into asynchronous I/O by initializing a ReadStream + on the provided heapRelation, and storing it in xs_rs + field of the scan descriptor. If the field is left NULL, the synchronous + buffer API will be used. The heapRelation is left NULL + for bitmap scans, which access it from a separate node. + + + + To create the ReadStream, the index has to implement a read_next + callback, returning a sequence of block numbers. The scan has to be split into multiple + partial sequences (e.g. one sequence per leaf page), so the index has to reset the stream + when advancing to the next leaf page. + + + + If the index supports index-only-scans, it needs to set the xs_visible + field when returning an item from amgettuple. 
This value has + to be determined in the read_next callback and remembered, + and the callback must not queue all-visible blocks. Otherwise the queued blocks + and read blocks might disagree. The VM may be updated at any point, so a block + might be queued, but then not read as it's all-visible. Or vice versa. Regular + (non-IOS) scans don't need to worry about this. + + + + The read_next callback must not queue runs of the same block. + If the block number is the same as the last returned block, it has to be + skipped. If the stream is reset (e.g. when advancing to the next leaf + page), the last block is forgotten. + + void diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 4204088fa0d..31222e5a96d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -536,16 +536,16 @@ brininsertcleanup(Relation index, IndexInfo *indexInfo) * holding lock on index, it's not necessary to recompute it during brinrescan. */ IndexScanDesc -brinbeginscan(Relation r, int nkeys, int norderbys) +brinbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; BrinOpaque *opaque; - scan = RelationGetIndexScan(r, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); opaque = palloc_object(BrinOpaque); - opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange); - opaque->bo_bdesc = brin_build_desc(r); + opaque->bo_rmAccess = brinRevmapInitialize(index, &opaque->bo_pagesPerRange); + opaque->bo_bdesc = brin_build_desc(index); scan->opaque = opaque; return scan; diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index c2d1771bd77..4e11ed9626d 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -22,7 +22,7 @@ IndexScanDesc -ginbeginscan(Relation rel, int nkeys, int norderbys) +ginbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; GinScanOpaque so; @@ -30,7 +30,7 @@ 
ginbeginscan(Relation rel, int nkeys, int norderbys) /* no order by operators allowed */ Assert(norderbys == 0); - scan = RelationGetIndexScan(rel, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); /* allocate private workspace */ so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03..d8ba7f7eff5 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -71,14 +71,14 @@ pairingheap_GISTSearchItem_cmp(const pairingheap_node *a, const pairingheap_node */ IndexScanDesc -gistbeginscan(Relation r, int nkeys, int norderbys) +gistbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; GISTSTATE *giststate; GISTScanOpaque so; MemoryContext oldCxt; - scan = RelationGetIndexScan(r, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); /* First, set up a GISTSTATE with a scan-lifespan memory context */ giststate = initGISTstate(scan->indexRelation); diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 53061c819fb..2133e454e9b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -371,7 +371,7 @@ hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * hashbeginscan() -- start a scan on a hash index */ IndexScanDesc -hashbeginscan(Relation rel, int nkeys, int norderbys) +hashbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; HashScanOpaque so; @@ -379,7 +379,7 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) /* no order by operators allowed */ Assert(norderbys == 0); - scan = RelationGetIndexScan(rel, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); HashScanPosInvalidate(so->currPos); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c 
index cb4bc35c93e..a16aa3e56ae 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -79,13 +79,16 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, ReadStream *rs) { IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + /* XXX Maybe the stream should be in IndexFetchHeapData instead? */ + hscan->xs_base.rs = rs; + return &hscan->xs_base; } @@ -129,16 +132,99 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, { /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = hscan->xs_cbuf; + bool release_prev_buf = true; + + /* + * XXX We should compare the previous/this block, and only do the read + * if the blocks are different (and reuse the buffer otherwise). But the + * index AMs would need to do exactly the same thing, to keep both sides + * of the queue in sync. + */ + + /* + * If the scan is using read stream, get the block from it. If not, + * use the regular buffer read. + */ + if (scan->rs) + { + /* + * If we're trying to read the same block as the last time, don't + * try reading it from the stream again, but just return the last + * buffer. We need to check if the previous buffer is still pinned + * and contains the correct block (it might have been unpinned, + * used for a different block, so we need to be careful). + * + * The places scheduling the blocks (read_next callbacks) need to + * do the same thing and not schedule the blocks if it matches the + * previous one. Otherwise the stream will get out of sync, causing + * confusion. + * + * This is what ReleaseAndReadBuffer does too, but it does not + * have a queue of requests scheduled from somewhere else, so it + * does not need to worry about that. 
+ * + * XXX Maybe we should remember the block in IndexFetchTableData, + * so that we can make the check even cheaper, without looking at + * the buffer descriptor? But that assumes the buffer was not + * unpinned (or repinned) elsewhere, before we got back here. But + * can that even happen? If yes, I guess we shouldn't be releasing + * the prev buffer anyway. + * + * XXX This has undesired impact on prefetch distance. The read + * stream schedules reads for a certain number of future blocks, + * but if we skip duplicate blocks, the prefetch distance may get + * unexpectedly large (e.g. for correlated indexes, with long runs + * of TIDs from the same heap page). We're however limited to items + * from a single leaf page. + * + * XXX What if we pinned the buffer twice (increase the refcount), + * so that if the caller unpins the buffer, we still keep the + * second pin. Wouldn't that mean we don't need to worry about the + * possibility someone loaded another page into the buffer? + * + * XXX We might also keep a longer history of recent blocks, not + * just the immediately preceding one. But that makes it harder, + * because the two places (read_next callback and here) need to + * have a slightly different view. + */ + if (BufferMatches(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid))) + release_prev_buf = false; + else + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + } + else + hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid)); - hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, - hscan->xs_base.rel, - ItemPointerGetBlockNumber(tid)); + /* We should always get a valid buffer for a valid TID. */ + Assert(BufferIsValid(hscan->xs_cbuf)); + + /* + * Did we read the expected block number (per the TID)? 
+ * + * For the regular buffer reads this should always match, but with the + * read stream it might disagree due to a bug / subtle difference in the + * read_next callback. + */ + Assert(BufferGetBlockNumber(hscan->xs_cbuf) == ItemPointerGetBlockNumber(tid)); /* * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + + /* + * The read stream does not release the buffer, the caller is expected + * to do that (unlike ReleaseAndReadBuffer). But that would mean the + * behavior with/without read stream is different, and the contract for + * index_fetch_tuple would change. So we release the old buffer here. + */ + if (scan->rs && (prev_buf != InvalidBuffer) && release_prev_buf) + ReleaseBuffer(prev_buf); } /* Obtain share-lock on the buffer so we can examine visibility */ diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0cb27af1310..10dc832d4f7 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -125,6 +125,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_rs = NULL; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da..73b531a9eff 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -104,7 +104,7 @@ do { \ CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \ } while(0) -static IndexScanDesc index_beginscan_internal(Relation indexRelation, +static IndexScanDesc index_beginscan_internal(Relation heapRelation, Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, ParallelIndexScanDesc pscan, bool temp_snap); static inline void validate_relation_kind(Relation r); @@ -263,7 +263,8 @@ index_beginscan(Relation heapRelation, Assert(snapshot != InvalidSnapshot); - 
scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); + scan = index_beginscan_internal(heapRelation, indexRelation, + nkeys, norderbys, snapshot, NULL, false); /* * Save additional parameters into the scandesc. Everything else was set @@ -274,7 +275,7 @@ index_beginscan(Relation heapRelation, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = table_index_fetch_begin(heapRelation, scan->xs_rs); return scan; } @@ -295,7 +296,7 @@ index_beginscan_bitmap(Relation indexRelation, Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); + scan = index_beginscan_internal(NULL, indexRelation, nkeys, 0, snapshot, NULL, false); /* * Save additional parameters into the scandesc. Everything else was set @@ -311,7 +312,7 @@ index_beginscan_bitmap(Relation indexRelation, * index_beginscan_internal --- common code for index_beginscan variants */ static IndexScanDesc -index_beginscan_internal(Relation indexRelation, +index_beginscan_internal(Relation heapRelation, Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, ParallelIndexScanDesc pscan, bool temp_snap) { @@ -331,8 +332,8 @@ index_beginscan_internal(Relation indexRelation, /* * Tell the AM to open a scan. */ - scan = indexRelation->rd_indam->ambeginscan(indexRelation, nkeys, - norderbys); + scan = indexRelation->rd_indam->ambeginscan(heapRelation, indexRelation, + nkeys, norderbys); /* Initialize information for parallel scan. 
*/ scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; @@ -593,8 +594,8 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, snapshot = RestoreSnapshot(pscan->ps_snapshot_data); RegisterSnapshot(snapshot); - scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, - pscan, true); + scan = index_beginscan_internal(heaprel, indexrel, nkeys, norderbys, + snapshot, pscan, true); /* * Save additional parameters into the scandesc. Everything else was set @@ -605,7 +606,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, scan->xs_rs); return scan; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index fdff960c130..619b356e848 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -333,7 +333,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * btbeginscan() -- start a scan on a btree index */ IndexScanDesc -btbeginscan(Relation rel, int nkeys, int norderbys) +btbeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; BTScanOpaque so; @@ -342,7 +342,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) Assert(norderbys == 0); /* get the scan */ - scan = RelationGetIndexScan(rel, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); /* allocate private workspace */ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); @@ -371,7 +371,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) */ so->currTuples = so->markTuples = NULL; - scan->xs_itupdesc = RelationGetDescr(rel); + scan->xs_itupdesc = RelationGetDescr(index); scan->opaque = so; diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 25893050c58..655f5cdc1eb 100644 --- a/src/backend/access/spgist/spgscan.c +++ 
b/src/backend/access/spgist/spgscan.c @@ -301,13 +301,13 @@ spgPrepareScanKeys(IndexScanDesc scan) } IndexScanDesc -spgbeginscan(Relation rel, int keysz, int orderbysz) +spgbeginscan(Relation heap, Relation index, int keysz, int orderbysz) { IndexScanDesc scan; SpGistScanOpaque so; int i; - scan = RelationGetIndexScan(rel, keysz, orderbysz); + scan = RelationGetIndexScan(index, keysz, orderbysz); so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); if (keysz > 0) @@ -330,7 +330,7 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) * most opclasses we can re-use the index reldesc instead of making one.) */ so->reconTupDesc = scan->xs_hitupdesc = - getSpGistTupleDesc(rel, &so->state.attType); + getSpGistTupleDesc(index, &so->state.attType); /* Allocate various arrays needed for order-by scans */ if (scan->numberOfOrderBys > 0) @@ -362,14 +362,14 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) } fmgr_info_copy(&so->innerConsistentFn, - index_getprocinfo(rel, 1, SPGIST_INNER_CONSISTENT_PROC), + index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC), CurrentMemoryContext); fmgr_info_copy(&so->leafConsistentFn, - index_getprocinfo(rel, 1, SPGIST_LEAF_CONSISTENT_PROC), + index_getprocinfo(index, 1, SPGIST_LEAF_CONSISTENT_PROC), CurrentMemoryContext); - so->indexCollation = rel->rd_indcollation[0]; + so->indexCollation = index->rd_indcollation[0]; scan->opaque = so; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14..be8e02a9c45 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -217,7 +217,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, NULL); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c 
b/src/backend/commands/constraint.c index 3497a8221f2..31279bd82b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan + = table_index_fetch_begin(trigdata->tg_relation, NULL); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507..ffebdfa7abd 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -157,10 +157,14 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * XXX This expects the index AM to set the xs_visible flag. Maybe we + * should not assume that. The subsequent patches do that for all the + * built in AMs, but until that time it's effectively broken - the AMs + * currently do not set xs_visible. Maybe we should only use this if + * the AM uses ReadStream, and call VM_ALL_VISIBLE otherwise? */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) + if (!scandesc->xs_visible) { /* * Rats, we have to visit the heap to check visibility. 
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1f04a2c182c..f030a34cc62 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -144,6 +144,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexscan_prefetch = true; bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index bd68d7e0ca9..2a69d6c3ec8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer, return ReadBuffer(relation, blockNum); } +/* + * BufferMatches + * Check if the buffer (still) contains the expected page. + * + * Check if the buffer contains the expected page. The buffer may be invalid, + * or valid and pinned. + */ +bool +BufferMatches(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + ForkNumber forkNum = MAIN_FORKNUM; + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + { + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + else + { + bufHdr = GetBufferDescriptor(buffer - 1); + /* we have pin, so it's ok to examine tag without spinlock */ + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + } + + return false; +} + /* * PinBuffer -- make buffer unavailable for replacement. 
* diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 511dc32d519..7aa782aa070 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -809,6 +809,16 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_prefetch", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables prefetching for index-scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of index-only-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 341f88adc87..c048bb03f30 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -412,6 +412,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexscan_prefetch = on #enable_material = on #enable_memoize = on #enable_mergejoin = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a3..1ef3daee885 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -181,7 +181,8 @@ typedef void (*amadjustmembers_function) (Oid opfamilyoid, List *functions); /* prepare for index scan */ -typedef IndexScanDesc (*ambeginscan_function) (Relation indexRelation, +typedef IndexScanDesc (*ambeginscan_function) (Relation heapRelation, + Relation indexRelation, int nkeys, int norderbys); diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index d093a0bf130..186b4b43f72 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -97,7 +97,8 @@ extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, bool indexUnchanged, struct IndexInfo *indexInfo); extern void brininsertcleanup(Relation index, struct IndexInfo 
*indexInfo); -extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys); +extern IndexScanDesc brinbeginscan(Relation heap, Relation index, + int nkeys, int norderbys); extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index aee1f70c22e..45f19594c79 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -393,7 +393,8 @@ typedef struct GinScanOpaqueData typedef GinScanOpaqueData *GinScanOpaque; -extern IndexScanDesc ginbeginscan(Relation rel, int nkeys, int norderbys); +extern IndexScanDesc ginbeginscan(Relation heap, Relation index, + int nkeys, int norderbys); extern void ginendscan(IndexScanDesc scan); extern void ginrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); diff --git a/src/include/access/gistscan.h b/src/include/access/gistscan.h index 518034c36d5..372f8be5817 100644 --- a/src/include/access/gistscan.h +++ b/src/include/access/gistscan.h @@ -16,7 +16,8 @@ #include "access/amapi.h" -extern IndexScanDesc gistbeginscan(Relation r, int nkeys, int norderbys); +extern IndexScanDesc gistbeginscan(Relation heap, Relation index, + int nkeys, int norderbys); extern void gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, ScanKey orderbys, int norderbys); extern void gistendscan(IndexScanDesc scan); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 073ad29b19b..6befa3ebf60 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -370,7 +370,8 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull, struct IndexInfo *indexInfo); extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); -extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int 
norderbys); +extern IndexScanDesc hashbeginscan(Relation heap, Relation index, + int nkeys, int norderbys); extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); extern void hashendscan(IndexScanDesc scan); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e709d2e0afe..e6e52210b15 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1188,7 +1188,8 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); -extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); +extern IndexScanDesc btbeginscan(Relation heap, Relation index, + int nkeys, int norderbys); extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0..56e6c6245e5 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -19,6 +19,7 @@ #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -121,6 +122,7 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; struct IndexScanInstrumentation; @@ -168,11 +170,13 @@ typedef struct IndexScanDescData struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */ HeapTuple xs_hitup; /* index data returned by AM, as HeapTuple */ struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ + bool xs_visible; /* heap page is all-visible */ ItemPointerData xs_heaptid; /* result */ bool xs_heap_continue; /* T if must keep walking, 
potential * further results */ IndexFetchTableData *xs_heapfetch; + ReadStream *xs_rs; /* read_stream (if supported by the AM) */ bool xs_recheck; /* T means scan keys must be rechecked */ diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index cbe9b347d8f..69588d18124 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -203,7 +203,8 @@ extern bool spginsert(Relation index, Datum *values, bool *isnull, struct IndexInfo *indexInfo); /* spgscan.c */ -extern IndexScanDesc spgbeginscan(Relation rel, int keysz, int orderbysz); +extern IndexScanDesc spgbeginscan(Relation heap, Relation index, + int keysz, int orderbysz); extern void spgendscan(IndexScanDesc scan); extern void spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 1c9e802a6b1..749a68ed861 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -413,8 +413,14 @@ typedef struct TableAmRoutine * structure with additional information. * * Tuples for an index scan can then be fetched via index_fetch_tuple. + * + * The 'rs' parameter is the read stream initialized by the AM, in + * which case the read stream has to be used to fetch tuples. If the + * AM does not support read stream, it's set to NULL and the regular + * synchronous API to read buffers shall be used. */ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, + ReadStream *rs); /* * Reset index fetch. Typically this will release cross index fetch @@ -1149,9 +1155,9 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). 
*/ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel) +table_index_fetch_begin(Relation rel, ReadStream *rs) { - return rel->rd_tableam->index_fetch_begin(rel); + return rel->rd_tableam->index_fetch_begin(rel, rs); } /* diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b523bcda8f3..00f4c3d0011 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost; extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; +extern PGDLLIMPORT bool enable_indexscan_prefetch; extern PGDLLIMPORT bool enable_indexonlyscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 41fdc1e7693..3b7d4e6a6a2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer); extern void CheckBufferIsPinnedOnce(Buffer buffer); extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); +extern bool BufferMatches(Buffer buffer, Relation relation, + BlockNumber blockNum); extern Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 94ef639b6fc..622a8ed0757 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -241,12 +241,12 @@ divalidate(Oid opclassoid) * Begin scan of index AM. 
*/ static IndexScanDesc -dibeginscan(Relation r, int nkeys, int norderbys) +dibeginscan(Relation heap, Relation index, int nkeys, int norderbys) { IndexScanDesc scan; /* Let's pretend we are doing something */ - scan = RelationGetIndexScan(r, nkeys, norderbys); + scan = RelationGetIndexScan(index, nkeys, norderbys); return scan; } diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 83228cfca29..3a7603b24e2 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -158,6 +158,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -172,7 +173,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(24 rows) +(25 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. -- 2.50.0