From 02ab6053d9ea64b46a69a55fe197526d0c95956e Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Sat, 15 Nov 2025 14:03:58 -0500
Subject: [PATCH v7 2/4] Add prefetching to index scans using batch
 interfaces.

This commit implements I/O prefetching for index scans, made possible by the
recent addition of batching interfaces to both the table AM and index AM
APIs.  The amgetbatch index AM interface provides batches of TIDs (rather
than one at a time) from a single index leaf page, and allows multiple
batches to be held in memory/pinned simultaneously.  This gives the table AM
the freedom to read ahead within an index scan, which is crucial for I/O
prefetching with certain workloads (workloads that would otherwise be unable
to keep a sufficiently high prefetch distance for heap block I/O).
Prefetching is implemented using a read stream under the control of the
table AM.

XXX When the batch ring buffer reaches capacity, the stream pauses until the
scan catches up and frees some batches.  We need a more principled approach
here.  Essentially, we need infrastructure that allows a read stream
callback to tell the read stream to temporarily yield, without fully
ending/resetting the stream.

Author: Tomas Vondra
Author: Peter Geoghegan
Reviewed-By: Andres Freund
Reviewed-By: Thomas Munro
Discussion: https://postgr.es/m/cf85f46f-b02f-05b2-5248-5000b894ebab@enterprisedb.com
---
 src/include/access/relscan.h                  |  41 ++-
 src/include/optimizer/cost.h                  |   1 +
 src/backend/access/heap/heapam_handler.c      | 330 +++++++++++++++++-
 src/backend/access/index/indexam.c            |  10 +-
 src/backend/access/index/indexbatch.c         |  17 +-
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/storage/aio/read_stream.c         |  14 +-
 src/backend/utils/misc/guc_parameters.dat     |   7 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/test/regress/expected/sysviews.out        |   3 +-
 10 files changed, 418 insertions(+), 7 deletions(-)
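The control flow added by this patch is easiest to see in isolation.  Below
is a minimal sketch of the callback pattern the patch relies on -- it is not
code from the patch itself.  SketchState and sketch_next_block are
hypothetical stand-ins for the batch ring buffer and heapam_getnext_stream;
read_stream_begin_relation and read_stream_next_buffer are the preexisting
read stream APIs used below:

    typedef struct SketchState
    {
        BlockNumber *blocks;        /* heap blocks to fetch, in scan order */
        int          nblocks;
        int          next;
    } SketchState;

    static BlockNumber
    sketch_next_block(ReadStream *stream, void *callback_private_data,
                      void *per_buffer_data)
    {
        SketchState *state = (SketchState *) callback_private_data;

        if (state->next >= state->nblocks)
            return InvalidBlockNumber;  /* stream stops looking ahead */
        return state->blocks[state->next++];
    }

    /* the table AM creates the stream, then pulls already-pinned buffers */
    rs = read_stream_begin_relation(READ_STREAM_DEFAULT, NULL,
                                    heapRel, MAIN_FORKNUM,
                                    sketch_next_block, &state, 0);
    buf = read_stream_next_buffer(rs, NULL);

The real callback walks TID batches (loading further batches as needed)
instead of a flat array, which is what lets the prefetch distance extend
across many index leaf pages.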
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index ca1207be6..a517abe08 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -20,6 +20,7 @@
 #include "nodes/tidbitmap.h"
 #include "port/atomics.h"
 #include "storage/buf.h"
+#include "storage/read_stream.h"
 #include "storage/relfilelocator.h"
 #include "storage/spin.h"
 #include "utils/relcache.h"
@@ -124,6 +125,7 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker;
 typedef struct IndexFetchTableData
 {
 	Relation	rel;
+	ReadStream *rs;
 } IndexFetchTableData;
 
 /*
@@ -221,8 +223,14 @@ typedef struct BatchIndexScanData *BatchIndexScan;
  * Maximum number of batches (leaf pages) we can keep in memory.  We need a
  * minimum of two, since we'll only consider releasing one batch when another
  * is read.
+ *
+ * The choice of 64 batches is arbitrary.  It's about 1MB of data with 8KB
+ * pages (512kB for pages, and then a bit of overhead).  We should not really
+ * need this many batches in most cases, though.  The read stream looks ahead
+ * just far enough to queue enough I/Os, adjusting the distance (in TIDs, but
+ * ultimately in the number of future batches) to meet that goal.
  */
-#define INDEX_SCAN_MAX_BATCHES		2
+#define INDEX_SCAN_MAX_BATCHES		64
 #define INDEX_SCAN_CACHE_BATCHES	2
 #define INDEX_SCAN_BATCH_COUNT(scan) \
 	((scan)->ringbuf->nextBatch - (scan)->ringbuf->headBatch)
@@ -270,15 +278,46 @@ typedef struct BatchIndexScanData *BatchIndexScan;
  * matches in.  However, table AMs are free to fetch table tuples in whatever
  * order is most convenient/efficient -- provided that such reordering cannot
  * affect the order that table_index_getnext_slot later returns tuples in.
+ *
+ * This data structure also provides table AMs with a way to read ahead of
+ * the current read position by _multiple_ batches/index pages.  The further
+ * out the table AM reads ahead like this, the further it can see into the
+ * future.  That way the table AM is able to reorder work as aggressively as
+ * desired.  For example, index scans sometimes need to read ahead by as many
+ * as a few dozen amgetbatch batches in order to maintain an optimal I/O
+ * prefetch distance (the distance for reading table blocks/fetching table
+ * tuples).
  */
 typedef struct BatchRingBuffer
 {
+	bool		reset;
+
+	/*
+	 * Did we lock in prefetching for this scan?  Or did we disable
+	 * prefetching/use of a read stream because it didn't pay for itself?
+	 */
+	bool		prefetchingLockedIn;
+	bool		disabled;
+
+	/*
+	 * During prefetching, currentPrefetchBlock is the table AM block number
+	 * that was returned by our read stream callback most recently.  Used to
+	 * suppress duplicate successive read stream block requests.
+	 *
+	 * Prefetching can still perform non-successive requests for the same
+	 * block number (in general we're prefetching in exactly the same order
+	 * that the scan will return table AM TIDs in).  We need to avoid
+	 * duplicate successive requests because table AMs expect to be able to
+	 * hang on to buffer pins across table_index_fetch_tuple calls.
+	 */
+	BlockNumber currentPrefetchBlock;
+
 	/* Current scan direction, for the currently loaded batches */
 	ScanDirection direction;
 
 	/* current positions in batches[] for scan */
 	BatchRingItemPos scanPos;	/* scan's read position */
 	BatchRingItemPos markPos;	/* mark/restore position */
+	BatchRingItemPos prefetchPos;	/* prefetching position */
 
 	BatchIndexScan markBatch;
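As a concrete illustration of currentPrefetchBlock's suppression rule (an
editorial example, not text from the patch): suppose the scan returns TIDs
(10,1), (10,2), (11,1), (10,3) in that order.  The callback requests block
10 for the first TID, suppresses the duplicate successive request for
(10,2), requests block 11, and then requests block 10 again for (10,3).
Only *successive* duplicates are suppressed, because the table AM only
holds its buffer pin from one table_index_fetch_tuple call to the next.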
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 07b8bfa63..31236ceac 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost;
 extern PGDLLIMPORT int max_parallel_workers_per_gather;
 extern PGDLLIMPORT bool enable_seqscan;
 extern PGDLLIMPORT bool enable_indexscan;
+extern PGDLLIMPORT bool enable_indexscan_prefetch;
 extern PGDLLIMPORT bool enable_indexonlyscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 03b4fada9..deef73069 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -37,6 +37,7 @@
 #include "commands/progress.h"
 #include "executor/executor.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
@@ -60,6 +61,9 @@ static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
 static bool BitmapHeapScanNextBlock(TableScanDesc scan, bool *recheck,
 									uint64 *lossy_pages, uint64 *exact_pages);
+static BlockNumber heapam_getnext_stream(ReadStream *stream,
+										 void *callback_private_data,
+										 void *per_buffer_data);
 
 /* ------------------------------------------------------------------------
@@ -85,6 +89,7 @@ heapam_index_fetch_begin(Relation rel)
 	IndexFetchHeapData *hscan = palloc_object(IndexFetchHeapData);
 
 	hscan->xs_base.rel = rel;
+	hscan->xs_base.rs = NULL;
 	hscan->xs_cbuf = InvalidBuffer;
 	hscan->xs_blk = InvalidBlockNumber;
 	hscan->vmbuf = InvalidBuffer;
@@ -97,6 +102,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan)
 {
 	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
 
+	if (scan->rs)
+		read_stream_reset(scan->rs);
+
 	/* deliberately don't drop VM buffer pin here */
 	if (BufferIsValid(hscan->xs_cbuf))
 	{
@@ -113,6 +121,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan)
 
 	heapam_index_fetch_reset(scan);
 
+	if (scan->rs)
+		read_stream_end(scan->rs);
+
 	if (hscan->vmbuf != InvalidBuffer)
 	{
 		ReleaseBuffer(hscan->vmbuf);
@@ -150,7 +161,13 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 	 * When using a read stream, the stream will already know which block
 	 * number comes next (though an assertion will verify a match below)
 	 */
-	hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk);
+	if (scan->rs)
+		hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+	else
+		hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk);
+
+	Assert(BufferIsValid(hscan->xs_cbuf));
+	Assert(BufferGetBlockNumber(hscan->xs_cbuf) == ItemPointerGetBlockNumber(tid));
 
 	/*
 	 * Prune page when it is pinned for the first time
@@ -248,6 +265,15 @@ heapam_batch_return_tid(IndexScanDesc scan, BatchIndexScan scanBatch,
 
 /*
  * heap_batch_resolve_visibility
  *		Obtain visibility information for every TID from caller's batch.
+ *
+ * heapam_batch_getnext_tid must reliably agree with heapam_getnext_stream
+ * about which heap blocks/TIDs will require a heap fetch (and which TIDs
+ * won't, due to pointing to an all-visible heap page).  Otherwise we risk
+ * allowing the read stream to return unexpected heap buffers/pages.
+ *
+ * Caching visibility information up front avoids that problem.  If a VM bit
+ * is concurrently set (or unset), it can't matter, since everybody will
+ * work off of this immutable local cache.
 */
 static void
 heap_batch_resolve_visibility(IndexScanDesc scan, BatchIndexScan batch)
@@ -377,6 +403,19 @@ heap_batch_getnext(IndexScanDesc scan, BatchIndexScan priorbatch,
 		DEBUG_LOG("batch_getnext headBatch %d nextBatch %d batch %p",
 				  ringbuf->headBatch, ringbuf->nextBatch, batch);
+
+		/* Delay initializing stream until reading from scan's second batch */
+		if (!scan->xs_heapfetch->rs && !ringbuf->disabled && priorbatch &&
+			enable_indexscan_prefetch)
+		{
+			Assert(INDEX_SCAN_POS_INVALID(&ringbuf->prefetchPos));
+			Assert(ringbuf->currentPrefetchBlock == InvalidBlockNumber);
+
+			scan->xs_heapfetch->rs =
+				read_stream_begin_relation(READ_STREAM_DEFAULT, NULL,
+										   scan->heapRelation, MAIN_FORKNUM,
+										   heapam_getnext_stream, scan, 0);
+		}
 	}
 
 	/* xs_hitup is not supported by amgetbatch scans */
@@ -411,9 +450,53 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* Initialize direction on first call */
 	if (ringbuf->direction == NoMovementScanDirection)
 		ringbuf->direction = direction;
+	else if (unlikely(ringbuf->disabled && scan->xs_heapfetch->rs))
+	{
+		/*
+		 * Handle cancelling the use of the read stream for prefetching
+		 */
+		batch_reset_pos(&ringbuf->prefetchPos);
+		ringbuf->currentPrefetchBlock = InvalidBlockNumber;
+		read_stream_reset(scan->xs_heapfetch->rs);
+		scan->xs_heapfetch->rs = NULL;
+	}
+	else if (unlikely(ringbuf->reset))
+	{
+		ringbuf->reset = false;
+
+		/*
+		 * We need to reset the stream position; it might be too far behind.
+		 * Ultimately we want to set it to scanPos, but we can't do that yet
+		 * -- scanPos still points at the old batch, so just reset it, and
+		 * we'll init it to scanPos later in the callback.
+		 */
+		batch_reset_pos(&ringbuf->prefetchPos);
+		ringbuf->currentPrefetchBlock = InvalidBlockNumber;
+
+		if (scan->xs_heapfetch->rs)
+			read_stream_reset(scan->xs_heapfetch->rs);
+	}
+
+	/*
+	 * XXX Shouldn't this also update ringbuf->direction?  If we get to the
+	 * next block, which handles the direction change, then we will remember
+	 * it (because heapam_batch_rewind will store it).  But if we return in
+	 * the next block, won't we forget about it?
+	 *
+	 * XXX It's a bit weird that we handle the direction change in two
+	 * places.  It would be good to explain why that's necessary.
+	 *
+	 * XXX How come this doesn't need to do heapam_batch_rewind too?  Could
+	 * there be some future batches already loaded?
+	 */
 	if (unlikely(ringbuf->direction != direction))
 	{
+		if (scan->xs_heapfetch->rs)
+			read_stream_reset(scan->xs_heapfetch->rs);
+		batch_reset_pos(&ringbuf->prefetchPos);
+		ringbuf->currentPrefetchBlock = InvalidBlockNumber;
+
 		/* We may change direction after reading the last batch. */
 		scan->finished = false;
 	}
@@ -497,6 +580,251 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	return heapam_batch_return_tid(scan, scanBatch, scanPos);
 }
 
+/*
+ * Controls when we cancel use of a read stream to do prefetching
+ */
+#define INDEX_SCAN_MIN_DISTANCE_NBATCHES	20
+#define INDEX_SCAN_MIN_TUPLE_DISTANCE		7
+
+/*
+ * heapam_getnext_stream
+ *		return the next block to pass to the read stream
+ *
+ * The initial batch is always loaded by heapam_batch_getnext_tid.  We don't
+ * get called until the first read_stream_next_buffer() call, when a heap
+ * block is requested from the scan's stream for the first time.
+ *
+ * The position of the read stream is stored in prefetchPos.  It is typical
+ * for prefetchPos to consistently stay ahead of the scanPos position that's
+ * used to track the next TID to be returned to the scan by
+ * heapam_batch_getnext_tid after the first time we get called.  However,
+ * that isn't a precondition.  There is a strict postcondition, though: when
+ * we return we'll always leave scanPos <= prefetchPos (except in cases where
+ * we return InvalidBlockNumber).
+ */
+static BlockNumber
+heapam_getnext_stream(ReadStream *stream, void *callback_private_data,
+					  void *per_buffer_data)
+{
+	IndexScanDesc scan = (IndexScanDesc) callback_private_data;
+	BatchRingBuffer *ringbuf = scan->ringbuf;
+	BatchRingItemPos *scanPos = &ringbuf->scanPos;
+	BatchRingItemPos *prefetchPos = &ringbuf->prefetchPos;
+	ScanDirection direction = ringbuf->direction;
+	BatchIndexScan prefetchBatch;
+	bool		fromReadPos = false;
+
+	Assert(!scan->finished && !ringbuf->disabled);
+
+	/*
+	 * scanPos must always be valid when we're called -- there has to be at
+	 * least one batch loaded for scanPos to point into.  prefetchPos might
+	 * not yet be valid, in which case it'll be initialized using scanPos.
+	 */
+	Assert(INDEX_SCAN_BATCH_COUNT(scan) > 0);
+	batch_assert_pos_valid(scan, scanPos);
+
+	/*
+	 * It is possible for the scan's direction to change, but that's handled
+	 * elsewhere.  We don't know how to deal with any variation in scan
+	 * direction here.  We assume that all loaded and newly requested batches
+	 * must use the same scan direction.
+	 */
+	Assert(direction != NoMovementScanDirection);
+
+	/*
+	 * If the stream position has not been initialized yet, initialize it
+	 * using the current read position.
+	 *
+	 * We do this now (rather than doing it when the read stream is created)
+	 * to avoid incorrectly returning the scan's IndexFetchHeapData.xs_blk as
+	 * it was at the time of read stream creation.
+	 * Note that the scan might have to hold onto its existing
+	 * not-managed-by-read-stream buffer pin after the read stream is
+	 * created; there'll often be a few more heap TIDs that point to the same
+	 * pinned heap page from before.
+	 */
+	if (INDEX_SCAN_POS_INVALID(prefetchPos))
+	{
+		Assert(ringbuf->currentPrefetchBlock == InvalidBlockNumber);
+
+		*prefetchPos = *scanPos;
+		fromReadPos = true;
+	}
+
+	prefetchBatch = INDEX_SCAN_BATCH(scan, prefetchPos->batch);
+	for (;;)
+	{
+		BatchMatchingItem *item;
+		ItemPointer tid;
+
+		if (fromReadPos)
+		{
+			/*
+			 * Don't increment item when prefetchPos was just initialized
+			 * using scanPos.  We return the scanPos item's heap block
+			 * directly on the first call here.
+			 */
+			fromReadPos = false;
+		}
+		else if (!heap_batchpos_advance(prefetchBatch, prefetchPos, direction))
+		{
+			/*
+			 * Ran out of items from prefetchBatch.  Try to advance it to the
+			 * next batch.
+			 */
+			if (INDEX_SCAN_BATCH_LOADED(scan, prefetchPos->batch + 1))
+			{
+				/*
+				 * The next batch was already loaded for us.
+				 *
+				 * Typically, prefetchPos is ahead of scanPos for the entire
+				 * duration of the scan (at least after we're first called).
+				 * However, prefetchPos can sometimes fall behind scanPos.
+				 * That's why we need to handle already-loaded batches here.
+				 *
+				 * This happens when some blocks are skipped and not returned
+				 * to the read stream.  An example is an index scan on a
+				 * correlated index, where many duplicate blocks are skipped,
+				 * or an IOS where all-visible blocks are skipped.
+				 */
+				prefetchBatch = INDEX_SCAN_BATCH(scan, prefetchPos->batch + 1);
+			}
+			else
+			{
+				/*
+				 * If we already used the maximum number of batch slots
+				 * available, it's pointless to try loading another one.
+				 * This can happen for various reasons, e.g. for index-only
+				 * scans on an all-visible table, or when skipping duplicate
+				 * blocks on perfectly correlated indexes, etc.
+				 *
+				 * We could enlarge the array to allow more batches, but
+				 * that's futile: we can always construct a case that uses
+				 * more memory.  Not only would it risk OOM, it'd also be
+				 * inefficient because this happens early in the scan (so
+				 * it'd interfere with LIMIT queries).
+				 */
+				if (INDEX_SCAN_BATCH_FULL(scan))
+				{
+					DEBUG_LOG("batch_getnext: ran out of space for batches");
+					scan->ringbuf->reset = true;
+					break;
+				}
+
+				prefetchBatch = heap_batch_getnext(scan, prefetchBatch,
+												   direction);
+				if (!prefetchBatch)
+				{
+					/*
+					 * Failed to load the next batch, so all the batches that
+					 * the scan will ever require (barring a change in scan
+					 * direction) are now loaded
+					 */
+					scan->finished = true;
+					break;
+				}

+				/*
+				 * Consider disabling prefetching when we can't keep a
+				 * sufficiently large "index tuple distance" between scanPos
+				 * and prefetchPos.
+				 *
+				 * Only consider doing this when we're past the scan's
+				 * initial batches, and when scanPos and prefetchPos share
+				 * the same batch.
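+				 *
+				 * (Concretely, with the constants above: once more than
+				 * INDEX_SCAN_MIN_DISTANCE_NBATCHES batches have been read,
+				 * if the stream still sits fewer than
+				 * INDEX_SCAN_MIN_TUPLE_DISTANCE items ahead of scanPos
+				 * within their shared batch, prefetching evidently can't
+				 * get usefully far ahead, so we shut the stream down.)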
+				 */
+				if (!ringbuf->prefetchingLockedIn)
+				{
+					int			itemdiff;
+
+					if (prefetchPos->batch <= INDEX_SCAN_MIN_DISTANCE_NBATCHES)
+					{
+						/*
+						 * Too early to check if prefetching should be
+						 * disabled
+						 */
+					}
+					else if (scanPos->batch == prefetchPos->batch)
+					{
+						if (ScanDirectionIsForward(direction))
+							itemdiff = prefetchPos->item - scanPos->item;
+						else
+						{
+							BatchIndexScan scanBatch =
+								INDEX_SCAN_BATCH(scan, scanPos->batch);
+
+							itemdiff = (scanPos->item - scanBatch->firstItem) -
+								(prefetchPos->item - scanBatch->firstItem);
+						}
+
+						if (itemdiff < INDEX_SCAN_MIN_TUPLE_DISTANCE)
+						{
+							ringbuf->disabled = true;
+							return InvalidBlockNumber;
+						}
+						else
+						{
+							ringbuf->prefetchingLockedIn = true;
+						}
+					}
+					else
+						ringbuf->prefetchingLockedIn = true;
+				}
+			}
+
+			/* Position prefetchPos to the start of the new prefetchBatch */
+			heap_batchpos_newbatch(prefetchBatch, prefetchPos, direction);
+			Assert(INDEX_SCAN_BATCH(scan, prefetchPos->batch) == prefetchBatch);
+		}
+
+		/*
+		 * We advanced the position.  Either return the block for the TID, or
+		 * skip it (and then try advancing again).
+		 */
+		Assert(prefetchBatch->dir == direction);
+		Assert(scanPos->batch < prefetchPos->batch ||
+			   (scanPos->batch == prefetchPos->batch &&
+				ScanDirectionIsForward(direction) ?
+				scanPos->item <= prefetchPos->item :
+				scanPos->item >= prefetchPos->item));
+
+		/*
+		 * The block may be "skipped" for two reasons.  First, the caller may
+		 * define a "prefetch" callback that tells us to skip items (IOS does
+		 * this to skip all-visible pages).  Second, currentPrefetchBlock is
+		 * used to skip duplicate block numbers (a sequence of TIDs for the
+		 * same block).
+		 */
+		batch_assert_pos_valid(scan, prefetchPos);
+		item = &prefetchBatch->items[prefetchPos->item];
+		tid = &item->heapTid;
+
+		DEBUG_LOG("heapam_getnext_stream: item %d, TID (%u,%u)",
+				  prefetchPos->item,
+				  ItemPointerGetBlockNumber(tid),
+				  ItemPointerGetOffsetNumber(tid));
+
+		/*
+		 * For index-only scans, check whether the page was determined to be
+		 * all-visible.  If it was, we won't need the block and can skip it
+		 * too.
+		 */
+		if (scan->xs_want_itup && item->allVisible)
+			continue;
+
+		/* same block as before, don't need to read it */
+		if (ringbuf->currentPrefetchBlock == ItemPointerGetBlockNumber(tid))
+		{
+			DEBUG_LOG("heapam_getnext_stream: skip block (currentPrefetchBlock)");
+			continue;
+		}
+
+		ringbuf->currentPrefetchBlock = ItemPointerGetBlockNumber(tid);
+
+		return ringbuf->currentPrefetchBlock;
+	}
+
+	/* no more items in this scan */
+	return InvalidBlockNumber;
+}
+
 /* ----------------
  *		index_fetch_heap - get the scan's next heap tuple
  *
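The skip behavior at the bottom of heapam_getnext_stream's loop can be
summarized in isolation.  This is an editorial sketch rather than code from
the patch; next_item_in_scan_order is a hypothetical stand-in for the
batch-walking logic above:

    static BlockNumber
    sketch_skip_logic(IndexScanDesc scan, BatchRingBuffer *ringbuf)
    {
        BatchMatchingItem *item;

        while ((item = next_item_in_scan_order(scan)) != NULL)
        {
            BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapTid);

            /* index-only scan: an all-visible page needs no heap fetch */
            if (scan->xs_want_itup && item->allVisible)
                continue;

            /* suppress duplicate successive requests for the same block */
            if (ringbuf->currentPrefetchBlock == blkno)
                continue;

            ringbuf->currentPrefetchBlock = blkno;
            return blkno;
        }
        return InvalidBlockNumber;  /* scan has no further heap blocks */
    }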
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 9eaecd943..d15a2ee81 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -467,7 +467,15 @@ index_restrpos(IndexScanDesc scan)
 	CHECK_SCAN_PROCEDURE(amgetbatch);
 	CHECK_SCAN_PROCEDURE(amposreset);
 
-	/* release resources (like buffer pins) from table accesses */
+	/*
+	 * release resources (like buffer pins) from table accesses
+	 *
+	 * XXX: Currently, the distance is always remembered across any
+	 * read_stream_reset calls (to work around the scan->ringbuf->reset
+	 * behavior of resetting the stream to deal with running out of batches).
+	 * We probably _should_ be forgetting the distance when we reset the
+	 * stream here (through our table_index_fetch_reset call), though.
+	 */
 	if (scan->xs_heapfetch)
 		table_index_fetch_reset(scan->xs_heapfetch);
 
diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c
index 86a1c6f56..4df92e7af 100644
--- a/src/backend/access/index/indexbatch.c
+++ b/src/backend/access/index/indexbatch.c
@@ -74,11 +74,16 @@ index_batch_init(IndexScanDesc scan)
 		(!scan->xs_want_itup &&
 		 IsMVCCSnapshot(scan->xs_snapshot) &&
 		 RelationNeedsWAL(scan->indexRelation));
 	scan->finished = false;
+	scan->ringbuf->reset = false;
+	scan->ringbuf->prefetchingLockedIn = false;
+	scan->ringbuf->disabled = false;
+	scan->ringbuf->currentPrefetchBlock = InvalidBlockNumber;
 	scan->ringbuf->direction = NoMovementScanDirection;
 
 	/* positions in the ring buffer of batches */
 	batch_reset_pos(&scan->ringbuf->scanPos);
 	batch_reset_pos(&scan->ringbuf->markPos);
+	batch_reset_pos(&scan->ringbuf->prefetchPos);
 	scan->ringbuf->markBatch = NULL;
 
 	scan->ringbuf->headBatch = 0;	/* initial head batch */
@@ -107,9 +112,12 @@ index_batch_reset(IndexScanDesc scan, bool complete)
 	batch_assert_batches_valid(scan);
 	batch_debug_print_batches("index_batch_reset", scan);
 	Assert(scan->xs_heapfetch);
+	if (scan->xs_heapfetch->rs)
+		read_stream_reset(scan->xs_heapfetch->rs);
 
 	/* reset the positions */
 	batch_reset_pos(&ringbuf->scanPos);
+	batch_reset_pos(&ringbuf->prefetchPos);
 
 	/*
 	 * With "complete" reset, make sure to also free the marked batch, either
@@ -153,6 +161,8 @@ index_batch_reset(IndexScanDesc scan, bool complete)
 	ringbuf->nextBatch = 0;		/* initial batch is empty */
 
 	scan->finished = false;
+	ringbuf->reset = false;
+	ringbuf->currentPrefetchBlock = InvalidBlockNumber;
 
 	batch_assert_batches_valid(scan);
 }
@@ -213,9 +223,13 @@ index_batch_restore_pos(IndexScanDesc scan)
 {
 	BatchRingBuffer *ringbuf = scan->ringbuf;
 	BatchRingItemPos *markPos = &ringbuf->markPos;
-	BatchRingItemPos *scanPos = &ringbuf->scanPos ;
 	BatchIndexScan markBatch = ringbuf->markBatch;
 
+	/*
+	 * XXX Disable this optimization when I/O prefetching is in use, at least
+	 * until the possible interactions with prefetchPos are fully understood.
+	 */
+#if 0
 	if (scanPos->batch == markPos->batch &&
 		scanPos->batch == ringbuf->headBatch)
 	{
@@ -226,6 +240,7 @@ index_batch_restore_pos(IndexScanDesc scan)
 		scanPos->item = markPos->item;
 		return;
 	}
+#endif
 
 	/*
 	 * Call amposreset to let index AM know to invalidate any private state
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 16bf1f61a..23e7c0a2f 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -144,6 +144,7 @@ int			max_parallel_workers_per_gather = 2;
 
 bool		enable_seqscan = true;
 bool		enable_indexscan = true;
+bool		enable_indexscan_prefetch = true;
 bool		enable_indexonlyscan = true;
 bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
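The read_stream.c changes below make a stream remember its look-ahead
distance across resets.  A hedged usage sketch of the before/after behavior
(rel, callback, and state are placeholders; only the read_stream_* calls
and ReleaseBuffer are real APIs):

    ReadStream *rs = read_stream_begin_relation(READ_STREAM_DEFAULT, NULL,
                                                rel, MAIN_FORKNUM,
                                                callback, state, 0);
    Buffer      buf;

    /* consuming buffers ramps the stream's look-ahead distance up */
    while ((buf = read_stream_next_buffer(rs, NULL)) != InvalidBuffer)
        ReleaseBuffer(buf);

    /*
     * Unchanged: resetting releases pins and stops look-ahead.  New: the
     * distance reached so far is saved in distance_old and restored when
     * the stream starts up again.  Previously the stream restarted with
     * distance = 1 (i.e. assuming cached data) after every reset.
     */
    read_stream_reset(rs);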
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 88717c2ff..7463651e0 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -99,6 +99,7 @@ struct ReadStream
 	int16		forwarded_buffers;
 	int16		pinned_buffers;
 	int16		distance;
+	int16		distance_old;
 	int16		initialized_buffers;
 	int			read_buffers_flags;
 	bool		sync_mode;		/* using io_method=sync */
@@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream)
 		if (blocknum == InvalidBlockNumber)
 		{
 			/* End of stream. */
+			stream->distance_old = stream->distance;
 			stream->distance = 0;
 			break;
 		}
@@ -862,6 +864,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		else
 		{
 			/* No more blocks, end of stream. */
+			stream->distance_old = stream->distance;
 			stream->distance = 0;
 			stream->oldest_buffer_index = stream->next_buffer_index;
 			stream->pinned_buffers = 0;
@@ -1046,6 +1049,9 @@ read_stream_reset(ReadStream *stream)
 	int16		index;
 	Buffer		buffer;
 
+	/* remember the old distance (if we reset before end of the stream) */
+	stream->distance_old = Max(stream->distance, stream->distance_old);
+
 	/* Stop looking ahead. */
 	stream->distance = 0;
 
@@ -1078,8 +1084,12 @@ read_stream_reset(ReadStream *stream)
 	Assert(stream->pinned_buffers == 0);
 	Assert(stream->ios_in_progress == 0);
 
-	/* Start off assuming data is cached. */
-	stream->distance = 1;
+	/*
+	 * Restore the old distance, if we have one.  Otherwise start off
+	 * assuming data is cached.
+	 */
+	stream->distance = Max(1, stream->distance_old);
+	stream->distance_old = 0;
 }
 
 /*
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 7c60b1255..a99aa41db 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -891,6 +891,13 @@
   boot_val => 'true',
 },
 
+{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+  short_desc => 'Enables prefetching for index scans and index-only scans.',
+  flags => 'GUC_EXPLAIN',
+  variable => 'enable_indexscan_prefetch',
+  boot_val => 'true',
+},
+
 { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
   short_desc => 'Enables the planner\'s use of materialization.',
   flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f..da50ae15f 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -412,6 +412,7 @@
 #enable_incremental_sort = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_indexscan_prefetch = on
 #enable_material = on
 #enable_memoize = on
 #enable_mergejoin = on
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 3dd63fd88..b5628736b 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort  | on
  enable_indexonlyscan     | on
  enable_indexscan         | on
+ enable_indexscan_prefetch | on
  enable_material          | on
  enable_memoize           | on
  enable_mergejoin         | on
@@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan           | on
  enable_sort              | on
  enable_tidscan           | on
-(25 rows)
+(26 rows)
 
 -- There are always wait event descriptions for various types. InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
-- 
2.51.0