From b22280f0833c3f9638aa3fbecae10bf0ba9da455 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 15 Nov 2025 14:03:58 -0500 Subject: [PATCH v6 2/4] Add prefetching to index scans using batch interfaces. This commit implements I/O prefetching for index scans, made possible by the recent addition of batching interfaces to both the table AM and index AM APIs. The amgetbatch index AM interface provides batches of TIDs (rather than one at a time) from a single index leaf page, and allows multiple batches to be held in memory/pinned simultaneously. This gives the table AM the freedom to readahead within an index scan, which is crucial for I/O prefetching with certain workloads (workloads that would otherwise be unable to keep a sufficiently high prefetch distance for heap block I/O). Prefetching is implemented using a read stream under the control of the table AM. XXX When the batch queue reaches capacity, the stream pauses until the scan catches up and frees some batches. We need a more principled approach here. Essentially, we need infrastructure that allows a read stream callback to tell the read stream to "back off" without it fully ending/resetting the read stream. Note: For now prefetching is temporarily disabled during index-only scans, pending the reintroduction of visibility map caching in batches. Previous versions of the patch series had that, but it was removed when we moved over to the new table AM interface. 
Author: Tomas Vondra Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Thomas Munro Discussion: https://postgr.es/m/cf85f46f-b02f-05b2-5248-5000b894ebab@enterprisedb.com --- src/include/access/relscan.h | 33 +- src/include/access/tableam.h | 15 + src/include/optimizer/cost.h | 1 + src/backend/access/heap/heapam_handler.c | 386 +++++++++++++++++- src/backend/access/index/indexam.c | 10 +- src/backend/access/index/indexbatch.c | 17 +- src/backend/optimizer/path/costsize.c | 1 + src/backend/storage/aio/read_stream.c | 14 +- src/backend/utils/misc/guc_parameters.dat | 7 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/test/regress/expected/sysviews.out | 3 +- 11 files changed, 481 insertions(+), 7 deletions(-) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d6a34c193..8dcfeae6d 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -20,6 +20,7 @@ #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -124,6 +125,7 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; /* @@ -220,8 +222,14 @@ typedef struct BatchIndexScanData *BatchIndexScan; * Maximum number of batches (leaf pages) we can keep in memory. We need a * minimum of two, since we'll only consider releasing one batch when another * is read. + * + * The choice of 64 batches is arbitrary. It's about 1MB of data with 8KB + * pages (512kB for pages, and then a bit of overhead). We should not really + * need this many batches in most cases, though. The read stream looks ahead + * just enough to queue enough IOs, adjusting the distance (TIDs, but + * ultimately the number of future batches) to meet that. 
*/ -#define INDEX_SCAN_MAX_BATCHES 2 +#define INDEX_SCAN_MAX_BATCHES 64 #define INDEX_SCAN_CACHE_BATCHES 2 #define INDEX_SCAN_BATCH_COUNT(scan) \ ((scan)->batchqueue->nextBatch - (scan)->batchqueue->headBatch) @@ -268,12 +276,35 @@ typedef struct BatchIndexScanData *BatchIndexScan; */ typedef struct BatchQueue { + bool reset; + + /* + * Did we disable prefetching/use of a read stream because it didn't pay + * for itself? + */ + bool prefetchingLockedIn; + bool disabled; + + /* + * During prefetching, currentPrefetchBlock is the table AM block number + * that was returned by our read stream callback most recently. Used to + * suppress duplicate successive read stream block requests. + * + * Prefetching can still perform non-successive requests for the same + * block number (in general we're prefetching in exactly the same order + * that the scan will return table AM TIDs in). We need to avoid + * duplicate successive requests because table AMs expect to be able to + * hang on to buffer pins across table_index_fetch_tuple calls. + */ + BlockNumber currentPrefetchBlock; + /* Current scan direction, for the currently loaded batches */ ScanDirection direction; /* current positions in batches[] for scan */ BatchQueueItemPos readPos; /* read position */ BatchQueueItemPos markPos; /* mark/restore position */ + BatchQueueItemPos streamPos; /* stream position (for prefetching) */ BatchIndexScan markBatch; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 384656ce1..07695add0 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -448,6 +448,21 @@ typedef struct TableAmRoutine ScanDirection direction, TupleTableSlot *slot); + /* + * Read stream callback, used to perform I/O prefetching of table AM pages + * during `index_getnext_slot` index scans. + * + * This callback is directly passed to read_stream_begin_relation, from + * batch_getnext routine. 
It will only be used during scans whose index + * AM uses the amgetbatch interface. (Scans with amgettuple-based index + * AMs cannot reasonably be used for I/O prefetching, since its opaque + * tuple-at-a-time interface makes it impossible to schedule index scan + * work sensibly.) + */ + BlockNumber (*index_getnext_stream) (ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 07b8bfa63..31236ceac 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost; extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; +extern PGDLLIMPORT bool enable_indexscan_prefetch; extern PGDLLIMPORT bool enable_indexonlyscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dd46ccb3b..ad4987d58 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -36,6 +36,7 @@ #include "commands/progress.h" #include "executor/executor.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" @@ -59,6 +60,9 @@ static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); static bool BitmapHeapScanNextBlock(TableScanDesc scan, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); +static BlockNumber heapam_getnext_stream(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); /* ------------------------------------------------------------------------ @@ -84,6 +88,7 @@ heapam_index_fetch_begin(Relation rel) 
IndexFetchHeapData *hscan = palloc_object(IndexFetchHeapData); hscan->xs_base.rel = rel; + hscan->xs_base.rs = NULL; hscan->xs_cbuf = InvalidBuffer; hscan->xs_blk = InvalidBlockNumber; hscan->vmbuf = InvalidBuffer; @@ -96,6 +101,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + if (scan->rs) + read_stream_reset(scan->rs); + /* deliberately don't drop VM buffer pin here */ if (BufferIsValid(hscan->xs_cbuf)) { @@ -112,6 +120,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (scan->rs) + read_stream_end(scan->rs); + if (hscan->vmbuf != InvalidBuffer) { ReleaseBuffer(hscan->vmbuf); @@ -149,7 +160,10 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * When using a read stream, the stream will already know which block * number comes next (though an assertion will verify a match below) */ - hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); + if (scan->rs) + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + else + hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); /* * Prune page when it is pinned for the first time @@ -275,6 +289,17 @@ heap_batch_getnext(IndexScanDesc scan, BatchIndexScan priorbatch, DEBUG_LOG("batch_getnext headBatch %d nextBatch %d batch %p", batchqueue->headBatch, batchqueue->nextBatch, batch); + + /* Delay initializing stream until reading from scan's second batch */ + if (!scan->xs_heapfetch->rs && !batchqueue->disabled && priorbatch && + !scan->xs_want_itup && /* XXX prefetching disabled for IoS, for + * now */ + enable_indexscan_prefetch) + scan->xs_heapfetch->rs = + read_stream_begin_relation(READ_STREAM_DEFAULT, NULL, + scan->heapRelation, MAIN_FORKNUM, + scan->heapRelation->rd_tableam->index_getnext_stream, + scan, 0); } batch_assert_batches_valid(scan); @@ -307,6 +332,16 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* Initialize direction on first call */ if 
(batchqueue->direction == NoMovementScanDirection) batchqueue->direction = direction; + else if (unlikely(batchqueue->disabled && scan->xs_heapfetch->rs)) + { + /* + * Handle cancelling the use of the read stream for prefetching + */ + batch_reset_pos(&batchqueue->streamPos); + + read_stream_reset(scan->xs_heapfetch->rs); + scan->xs_heapfetch->rs = NULL; + } /* * Try advancing the batch position. If that doesn't succeed, it means we @@ -337,6 +372,46 @@ nextbatch: { heapam_batch_rewind(scan, batchqueue, direction); readPos->batch = batchqueue->nextBatch - 1; + + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); + batch_reset_pos(&batchqueue->streamPos); + } + + if (INDEX_SCAN_BATCH_LOADED(scan, readPos->batch + 1)) + { + /* advance to the next batch */ + readPos->batch++; + + readBatch = INDEX_SCAN_BATCH(scan, readPos->batch); + + if (ScanDirectionIsForward(direction)) + readPos->item = readBatch->firstItem; + else + readPos->item = readBatch->lastItem; + + batch_assert_pos_valid(scan, readPos); + + if (readPos->batch != batchqueue->headBatch) + { + BatchIndexScan headBatch = INDEX_SCAN_BATCH(scan, + batchqueue->headBatch); + + /* Free the head batch (except when it's markBatch) */ + batch_free(scan, headBatch); + + /* + * In any case, remove the batch from the regular queue, even if + * we kept it for mark/restore + */ + batchqueue->headBatch++; + + /* we can't skip any batches */ + Assert(batchqueue->headBatch == readPos->batch); + } + + pgstat_count_index_tuples(scan->indexRelation, 1); + return heapam_batch_return_tid(scan, readBatch, readPos); } if ((readBatch = heap_batch_getnext(scan, readBatch, direction)) != NULL) @@ -395,6 +470,314 @@ nextbatch: return NULL; } +/* + * heap_batch_advance_streampos + * Advance streamPos to the next item during prefetching. + * + * Move to the next item within the batch pointed to by caller's pos. 
+ * Advances the position to the next item, either in the same batch or the + * following one (if already available). + * + * We can advance only if we already have some batches loaded, and there's + * either enough items in the current batch, or some more items in the + * subsequent batches. + * + * If this is the first advance (right after loading the initial/head batch), + * position is still undefined. Otherwise we expect the position to be valid. + * + * Returns true if the position was advanced, false otherwise. The position + * is guaranteed to be valid only after a successful advance. + */ +pg_attribute_always_inline +static bool +heap_batch_advance_streampos(IndexScanDesc scan, BatchQueueItemPos *streamPos, + ScanDirection direction) +{ + BatchIndexScan streamBatch; + + /* make sure we have batching initialized and consistent */ + batch_assert_batches_valid(scan); + + /* should know direction by now */ + Assert(direction == scan->batchqueue->direction); + Assert(direction != NoMovementScanDirection); + + /* We can't advance if there are no batches available. */ + if (INDEX_SCAN_BATCH_COUNT(scan) == 0) + return false; + + /* + * The position is already defined, so we should have some batches loaded + * and the position has to be valid with respect to those. + */ + Assert(!INDEX_SCAN_POS_INVALID(streamPos)); + batch_assert_pos_valid(scan, streamPos); + + /* + * Advance to the next item in the same batch, if there are more items. If + * we're at the last item, we'll try advancing to the next batch later. 
+ */ + streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch); + + if (ScanDirectionIsForward(direction)) + { + if (++streamPos->item <= streamBatch->lastItem) + { + batch_assert_pos_valid(scan, streamPos); + + return true; + } + } + else /* ScanDirectionIsBackward */ + { + if (--streamPos->item >= streamBatch->firstItem) + { + batch_assert_pos_valid(scan, streamPos); + + return true; + } + } + + /* + * We couldn't advance within the same batch, try advancing to the next + * batch, if it's already loaded. + */ + if (INDEX_SCAN_BATCH_LOADED(scan, streamPos->batch + 1)) + { + /* advance to the next batch */ + streamPos->batch++; + + streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch); + + if (ScanDirectionIsForward(direction)) + streamPos->item = streamBatch->firstItem; + else + streamPos->item = streamBatch->lastItem; + + batch_assert_pos_valid(scan, streamPos); + + return true; + } + + /* can't advance */ + return false; +} + +/* + * Controls when we cancel use of a read stream to do prefetching + */ +#define INDEX_SCAN_MIN_DISTANCE_NBATCHES 20 +#define INDEX_SCAN_MIN_TUPLE_DISTANCE 7 + +/* + * heapam_getnext_stream + * return the next block to pass to the read stream + * + * This assumes the "current" scan direction, requested by the caller. + * + * If the direction changes before consuming all blocks, we'll reset the stream + * and start from scratch. The scan direction change is handled elsewhere. + * Here we rely on having the correct value in batchqueue->direction. + * + * The position of the read_stream is stored in streamPos, which may be ahead of + * the current readPos (which is what got consumed by the scan). + * + * The streamPos can however also get behind readPos, when some blocks are + * skipped and not returned to the read_stream. An example is an index scan on + * a correlated index, where many duplicate blocks are skipped, or an IOS where + * all-visible blocks are skipped. 
+ * + * The initial batch is always loaded by heapam_batch_getnext_tid. We don't + * get here until the first read_stream_next_buffer() call, when pulling the + * first heap tuple from the stream. After that, most batches should be loaded + * by this callback, driven by the read_stream look-ahead distance. However, + * with disabled prefetching (that is, with effective_io_concurrency=0), all + * batches will be loaded in heapam_batch_getnext_tid. + * + * It's possible we got here only fairly late in the scan, e.g. if many tuples + * got skipped in the index-only scan, etc. In this case just use the read + * position as a streamPos starting point. + */ +static BlockNumber +heapam_getnext_stream(ReadStream *stream, void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + BatchQueue *batchqueue = scan->batchqueue; + BatchQueueItemPos *streamPos = &batchqueue->streamPos; + ScanDirection direction = batchqueue->direction; + + /* By now we should know the direction of the scan. */ + Assert(direction != NoMovementScanDirection); + + /* + * The read position (readPos) has to be valid. + * + * We initialize/advance it before even attempting to read the heap tuple, + * and it gets invalidated when we reach the end of the scan (but then we + * don't invoke the callback again). + * + * XXX This applies to the readPos. We'll use streamPos to determine which + * blocks to pass to the stream, and readPos may be used to initialize it. + */ + batch_assert_pos_valid(scan, &batchqueue->readPos); + + /* + * Try to advance the streamPos to the next item, and if that doesn't + * succeed (if there are no more items in loaded batches), try loading the + * next one. + * + * FIXME This can loop more than twice. If many blocks get skipped due to + * currentPrefetchBlock or all-visibility (per the "prefetch" callback), + * we get to load additional batches. 
In the worst case we'll hit the + * INDEX_SCAN_MAX_BATCHES limit and have to "pause"/reset the stream. + */ + while (true) + { + bool advanced = false; + BatchIndexScan priorbatch = NULL; + + /* + * If the stream position has not been initialized yet, set it to the + * current read position. This is the item the caller is trying to + * read, so it's what we should return to the stream. + */ + if (INDEX_SCAN_POS_INVALID(streamPos)) + { + *streamPos = batchqueue->readPos; + advanced = true; + } + else if (heap_batch_advance_streampos(scan, streamPos, direction)) + { + advanced = true; + } + + /* + * FIXME Maybe check the streamPos is not behind readPos? + * + * FIXME Actually, could streamPos get stale/lagging behind readPos, + * and if yes how much. Could it get so far behind to not be valid, + * pointing at a freed batch? In that case we can't even advance it, + * and we should just initialize it to readPos. We might do that + * anyway, I guess, just to save on "pointless" advances (it must + * agree with readPos, we can't allow "retroactively" changing the + * block sequence). + */ + + /* + * If we advanced the position, either return the block for the TID, + * or skip it (and then try advancing again). + * + * The block may be "skipped" for two reasons. First, the caller may + * define a "prefetch" callback that tells us to skip items (IOS does + * this to skip all-visible pages). Second, currentPrefetchBlock is + * used to skip duplicate block numbers (a sequence of TIDS for the + * same block). 
+ */ + if (advanced) + { + BatchIndexScan streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch); + ItemPointer tid = &streamBatch->items[streamPos->item].heapTid; + + DEBUG_LOG("heapam_getnext_stream: item %d, TID (%u,%u)", + streamPos->item, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + /* same block as before, don't need to read it */ + if (batchqueue->currentPrefetchBlock == ItemPointerGetBlockNumber(tid)) + { + DEBUG_LOG("heapam_getnext_stream: skip block (currentPrefetchBlock)"); + continue; + } + + batchqueue->currentPrefetchBlock = ItemPointerGetBlockNumber(tid); + + return batchqueue->currentPrefetchBlock; + } + + if (scan->finished) + break; + + /* + * If we already used the maximum number of batch slots available, + * it's pointless to try loading another one. This can happen for + * various reasons, e.g. for index-only scans on all-visible table, or + * skipping duplicate blocks on perfectly correlated indexes, etc. + * + * We could enlarge the array to allow more batches, but that's + * futile, we can always construct a case using more memory. Not only + * it would risk OOM, it'd also be inefficient because this happens + * early in the scan (so it'd interfere with LIMIT queries). + */ + if (INDEX_SCAN_BATCH_FULL(scan)) + { + DEBUG_LOG("batch_getnext: ran out of space for batches"); + scan->batchqueue->reset = true; + break; + } + + /* + * Couldn't advance the position, no more items in the loaded batches. + * Try loading the next batch - if that succeeds, try advancing again + * (this time the advance should work, but we may skip all the items). + * + * If we fail to load the next batch, we're done. + */ + if (batchqueue->headBatch < batchqueue->nextBatch) + priorbatch = INDEX_SCAN_BATCH(scan, batchqueue->nextBatch - 1); + if (!heap_batch_getnext(scan, priorbatch, direction)) + break; + + /* + * Consider disabling prefetching when we can't keep a sufficiently + * large "index tuple distance" between readPos and streamPos. 
+ * + * Only consider doing this when we're not on the scan's initial + * batch, when readPos and streamPos share the same batch. + */ + if (!scan->finished && !batchqueue->prefetchingLockedIn) + { + int itemdiff; + + if (streamPos->batch <= INDEX_SCAN_MIN_DISTANCE_NBATCHES) + { + /* Too early to check if prefetching should be disabled */ + } + else if (batchqueue->readPos.batch == streamPos->batch) + { + BatchQueueItemPos *readPos = &batchqueue->readPos; + + if (ScanDirectionIsForward(direction)) + itemdiff = streamPos->item - readPos->item; + else + { + BatchIndexScan readBatch = + INDEX_SCAN_BATCH(scan, readPos->batch); + + itemdiff = (readPos->item - readBatch->firstItem) - + (streamPos->item - readBatch->firstItem); + } + + if (itemdiff < INDEX_SCAN_MIN_TUPLE_DISTANCE) + { + batchqueue->disabled = true; + return InvalidBlockNumber; + } + else + { + batchqueue->prefetchingLockedIn = true; + } + } + else + batchqueue->prefetchingLockedIn = true; + } + } + + /* no more items in this scan */ + return InvalidBlockNumber; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -3063,6 +3446,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_reset = heapam_index_fetch_reset, .index_fetch_end = heapam_index_fetch_end, .index_getnext_slot = heapam_index_getnext_slot, + .index_getnext_stream = heapam_getnext_stream, .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index a59c76750..aaf2b39b4 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -467,7 +467,15 @@ index_restrpos(IndexScanDesc scan) CHECK_SCAN_PROCEDURE(amgetbatch); CHECK_SCAN_PROCEDURE(amposreset); - /* release resources (like buffer pins) from table accesses */ + /* + * release resources (like buffer pins) from table accesses + * + * XXX: Currently, the distance is always remembered across any + * 
read_stream_reset calls (to work around the scan->batchqueue->reset + * behavior of resetting the stream to deal with running out of batches). + * We probably _should_ be forgetting the distance when we reset the + * stream here (through our table_index_fetch_reset call), though. + */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c index 7fad00084..d57ed0e7a 100644 --- a/src/backend/access/index/indexbatch.c +++ b/src/backend/access/index/indexbatch.c @@ -74,11 +74,16 @@ index_batch_init(IndexScanDesc scan) (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && RelationNeedsWAL(scan->indexRelation)); scan->finished = false; + scan->batchqueue->reset = false; + scan->batchqueue->prefetchingLockedIn = false; + scan->batchqueue->disabled = false; + scan->batchqueue->currentPrefetchBlock = InvalidBlockNumber; scan->batchqueue->direction = NoMovementScanDirection; /* positions in the queue of batches */ batch_reset_pos(&scan->batchqueue->readPos); batch_reset_pos(&scan->batchqueue->markPos); + batch_reset_pos(&scan->batchqueue->streamPos); scan->batchqueue->markBatch = NULL; scan->batchqueue->headBatch = 0; /* initial head batch */ @@ -107,9 +112,12 @@ index_batch_reset(IndexScanDesc scan, bool complete) batch_assert_batches_valid(scan); batch_debug_print_batches("index_batch_reset", scan); Assert(scan->xs_heapfetch); + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); /* reset the positions */ batch_reset_pos(&batchqueue->readPos); + batch_reset_pos(&batchqueue->streamPos); /* * With "complete" reset, make sure to also free the marked batch, either @@ -155,6 +163,8 @@ index_batch_reset(IndexScanDesc scan, bool complete) batchqueue->nextBatch = 0; /* initial batch is empty */ scan->finished = false; + batchqueue->reset = false; + batchqueue->currentPrefetchBlock = InvalidBlockNumber; batch_assert_batches_valid(scan); } @@ -218,9 
+228,13 @@ index_batch_restore_pos(IndexScanDesc scan) { BatchQueue *batchqueue = scan->batchqueue; BatchQueueItemPos *markPos = &batchqueue->markPos; - BatchQueueItemPos *readPos = &batchqueue->readPos; BatchIndexScan markBatch = batchqueue->markBatch; + /* + * XXX Disable this optimization when I/O prefetching is in use, at least + * until the possible interactions with streamPos are fully understood. + */ +#if 0 if (readPos->batch == markPos->batch && readPos->batch == batchqueue->headBatch) { @@ -231,6 +245,7 @@ index_batch_restore_pos(IndexScanDesc scan) readPos->item = markPos->item; return; } +#endif /* * Call amposreset to let index AM know to invalidate any private state diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 16bf1f61a..23e7c0a2f 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -144,6 +144,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexscan_prefetch = true; bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 88717c2ff..7463651e0 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -99,6 +99,7 @@ struct ReadStream int16 forwarded_buffers; int16 pinned_buffers; int16 distance; + int16 distance_old; int16 initialized_buffers; int read_buffers_flags; bool sync_mode; /* using io_method=sync */ @@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream) if (blocknum == InvalidBlockNumber) { /* End of stream. */ + stream->distance_old = stream->distance; stream->distance = 0; break; } @@ -862,6 +864,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) else { /* No more blocks, end of stream. 
*/ + stream->distance_old = stream->distance; stream->distance = 0; stream->oldest_buffer_index = stream->next_buffer_index; stream->pinned_buffers = 0; @@ -1046,6 +1049,9 @@ read_stream_reset(ReadStream *stream) int16 index; Buffer buffer; + /* remember the old distance (if we reset before end of the stream) */ + stream->distance_old = Max(stream->distance, stream->distance_old); + /* Stop looking ahead. */ stream->distance = 0; @@ -1078,8 +1084,12 @@ read_stream_reset(ReadStream *stream) Assert(stream->pinned_buffers == 0); Assert(stream->ios_in_progress == 0); - /* Start off assuming data is cached. */ - stream->distance = 1; + /* + * Restore the old distance, if we have one. Otherwise start assuming data + * is cached. + */ + stream->distance = Max(1, stream->distance_old); + stream->distance_old = 0; } /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 7c60b1255..a99aa41db 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -891,6 +891,13 @@ boot_val => 'true', }, +{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables prefetching for index scans and index-only-scans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexscan_prefetch', + boot_val => 'true', +}, + { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of materialization.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index dc9e2255f..da50ae15f 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -412,6 +412,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexscan_prefetch = on #enable_material = on #enable_memoize = on 
#enable_mergejoin = on diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0411db832..a2a8c3afa 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(25 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. -- 2.51.0