From a3335da2a7a28dbb258380fa23d9ddd7c887f1d9 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Fri, 17 Nov 2023 23:54:19 +0100
Subject: [PATCH v20231209 1/2] prefetch 2023-11-24

Patch version shared on 2023/11/24.
---
 src/backend/access/heap/heapam_handler.c |  12 +-
 src/backend/access/index/genam.c         |  31 +-
 src/backend/access/index/indexam.c       | 645 ++++++++++++++++++++++-
 src/backend/commands/explain.c           |  18 +
 src/backend/executor/execIndexing.c      |   6 +-
 src/backend/executor/execReplication.c   |   9 +-
 src/backend/executor/instrument.c        |   4 +
 src/backend/executor/nodeIndexonlyscan.c |  97 +++-
 src/backend/executor/nodeIndexscan.c     |  80 ++-
 src/backend/utils/adt/selfuncs.c         |   3 +-
 src/include/access/genam.h               | 110 +++-
 src/include/access/relscan.h             |  10 +
 src/include/executor/instrument.h        |   2 +
 13 files changed, 997 insertions(+), 30 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 7c28dafb728..89474078951 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -44,6 +44,7 @@
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
@@ -747,6 +748,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		PROGRESS_CLUSTER_INDEX_RELID
 	};
 	int64		ci_val[2];
+	int			prefetch_max;
+
+	/*
+	 * Get the prefetch target for the old tablespace (which is what we'll
+	 * read using the index). We'll use it as a reset value too, although
+	 * there should be no rescans for CLUSTER etc.
+	 */
+	prefetch_max = get_tablespace_io_concurrency(OldHeap->rd_rel->reltablespace);
 
 	/* Set phase and OIDOldIndex to columns */
 	ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
@@ -755,7 +764,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		tableScan = NULL;
 		heapScan = NULL;
-		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0,
+									prefetch_max);
 		index_rescan(indexScan, NULL, 0, NULL, 0);
 	}
 	else
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 4ca12006843..d45a209ee3a 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -126,6 +126,9 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_hitup = NULL;
 	scan->xs_hitupdesc = NULL;
 
+	/* Information used for asynchronous prefetching during index scans. */
+	scan->xs_prefetch = NULL;
+
 	return scan;
 }
 
@@ -440,8 +443,20 @@ systable_beginscan(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * We don't do any prefetching on system catalogs, for two main reasons.
+	 *
+	 * Firstly, we usually do PK lookups, which makes prefetching pointless,
+	 * or we often don't know how many rows to expect (and the numbers tend
+	 * to be fairly low). So it's not clear it'd help. Furthermore, places
+	 * that are sensitive tend to use syscache anyway.
+	 *
+	 * Secondly, we can't call get_tablespace_io_concurrency() because that
+	 * does a sysscan internally, so it might lead to a cycle. We could use
+	 * effective_io_concurrency, but it doesn't seem worth it.
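+	 * (For example, a lookup of a single pg_class row by OID fetches one
+	 * heap tuple, so there is nothing to prefetch ahead of it.)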
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, irel,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, 0);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 	sysscan->scan = NULL;
 }
@@ -696,8 +711,20 @@ systable_beginscan_ordered(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * We don't do any prefetching on system catalogs, for two main reasons.
+	 *
+	 * Firstly, we usually do PK lookups, which makes prefetching pointless,
+	 * or we often don't know how many rows to expect (and the numbers tend
+	 * to be fairly low). So it's not clear it'd help. Furthermore, places
+	 * that are sensitive tend to use syscache anyway.
+	 *
+	 * Secondly, we can't call get_tablespace_io_concurrency() because that
+	 * does a sysscan internally, so it might lead to a cycle. We could use
+	 * effective_io_concurrency, but it doesn't seem worth it.
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, 0);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 
 	sysscan->scan = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index f23e0199f08..e493548b68a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -49,16 +49,19 @@
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xlog.h"
 #include "catalog/index.h"
 #include "catalog/pg_amproc.h"
 #include "catalog/pg_type.h"
 #include "commands/defrem.h"
+#include "common/hashfn.h"
 #include "nodes/makefuncs.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/lsyscache.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
@@ -106,7 +109,12 @@ do { \
 static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  int nkeys, int norderbys,
 											  Snapshot snapshot,
-											  ParallelIndexScanDesc pscan, bool temp_snap);
+											  ParallelIndexScanDesc pscan, bool temp_snap,
+											  int prefetch_max);
+
+static void index_prefetch_tids(IndexScanDesc scan, ScanDirection direction);
+static ItemPointer index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible);
+static void index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible);
 
 
 /* ----------------------------------------------------------------
@@ -215,18 +223,42 @@ index_insert_cleanup(Relation indexRelation,
  * index_beginscan - start a scan of an index with amgettuple
  *
  * Caller must be holding suitable locks on the heap and the index.
+ *
+ * prefetch_max determines if prefetching is requested for this index scan,
+ * and how far ahead we want to prefetch.
+ *
+ * Setting prefetch_max to 0 disables prefetching for the index scan. We do
+ * this for two reasons - for scans on system catalogs, and for cases where
+ * prefetching is expected to be pointless (like IOS).
+ *
+ * For system catalogs, we usually either scan by a PK value, or we expect
+ * only a few rows (or rather we don't know how many rows to expect). Also,
+ * we need to prevent an infinite loop in the get_tablespace_io_concurrency()
+ * call - it does an index scan internally. So we simply disable prefetching
+ * for system catalogs. We could deal with this by picking a conservative
+ * static target (e.g. effective_io_concurrency, capped to something), but
+ * places that are performance sensitive likely use syscache anyway, and
+ * catalogs tend to be very small and hot.
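+ * Most catalogs span only a few pages and quickly end up fully cached
+ * anyway.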
So we don't bother. + * + * For IOS, we expect to not need most heap pages (that's the whole point of + * IOS, actually), and prefetching them might lead to a lot of wasted I/O. + * + * XXX Not sure the infinite loop can still happen, now that the target lookup + * moved to callers of index_beginscan. */ IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, - int nkeys, int norderbys) + int nkeys, int norderbys, + int prefetch_max) { IndexScanDesc scan; Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); + scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, + NULL, false, prefetch_max); /* * Save additional parameters into the scandesc. Everything else was set @@ -256,7 +288,8 @@ index_beginscan_bitmap(Relation indexRelation, Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); + /* No prefetch in bitmap scans, prefetch is done by the heap scan. */ + scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false, 0); /* * Save additional parameters into the scandesc. Everything else was set @@ -273,7 +306,8 @@ index_beginscan_bitmap(Relation indexRelation, static IndexScanDesc index_beginscan_internal(Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap) + ParallelIndexScanDesc pscan, bool temp_snap, + int prefetch_max) { IndexScanDesc scan; @@ -296,6 +330,31 @@ index_beginscan_internal(Relation indexRelation, /* Initialize information for parallel scan. */ scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + scan->indexonly = false; + + /* + * With prefetching requested, initialize the prefetcher state. + * + * FIXME This should really be in the IndexScanState, not IndexScanDesc + * (certainly the queues etc). But index_getnext_tid only gets the scan + * descriptor, so how else would we pass it? Seems like a sign of wrong + * layer doing the prefetching. + */ + if ((prefetch_max > 0) && + (io_direct_flags & IO_DIRECT_DATA) == 0) /* no prefetching for direct I/O */ + { + IndexPrefetch prefetcher = palloc0(sizeof(IndexPrefetchData)); + + prefetcher->queueIndex = 0; + prefetcher->queueStart = 0; + prefetcher->queueEnd = 0; + + prefetcher->prefetchTarget = 0; + prefetcher->prefetchMaxTarget = prefetch_max; + prefetcher->vmBuffer = InvalidBuffer; + + scan->xs_prefetch = prefetcher; + } return scan; } @@ -332,6 +391,20 @@ index_rescan(IndexScanDesc scan, scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); + + /* If we're prefetching for this index, maybe reset some of the state. */ + if (scan->xs_prefetch != NULL) + { + IndexPrefetch prefetcher = scan->xs_prefetch; + + prefetcher->queueStart = 0; + prefetcher->queueEnd = 0; + prefetcher->queueIndex = 0; + prefetcher->prefetchDone = false; + + /* restart the incremental ramp-up */ + prefetcher->prefetchTarget = 0; + } } /* ---------------- @@ -360,6 +433,23 @@ index_endscan(IndexScanDesc scan) if (scan->xs_temp_snap) UnregisterSnapshot(scan->xs_snapshot); + /* + * If prefetching was enabled for this scan, log prefetch stats. + * + * FIXME This should really go to EXPLAIN ANALYZE instead. 
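+	 * A sample line, with illustrative counts: "index prefetch stats:
+	 * requests 10000 prefetches 4000 (40.000000) skip cached 3000
+	 * sequential 3000".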
+ */ + if (scan->xs_prefetch) + { + IndexPrefetch prefetch = scan->xs_prefetch; + + elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT, + prefetch->countAll, + prefetch->countPrefetch, + prefetch->countPrefetch * 100.0 / prefetch->countAll, + prefetch->countSkipCached, + prefetch->countSkipSequential); + } + /* Release the scan data structure itself */ IndexScanEnd(scan); } @@ -505,7 +595,8 @@ index_parallelrescan(IndexScanDesc scan) */ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, - int norderbys, ParallelIndexScanDesc pscan) + int norderbys, ParallelIndexScanDesc pscan, + int prefetch_max) { Snapshot snapshot; IndexScanDesc scan; @@ -514,7 +605,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, snapshot = RestoreSnapshot(pscan->ps_snapshot_data); RegisterSnapshot(snapshot); scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, - pscan, true); + pscan, true, prefetch_max); /* * Save additional parameters into the scandesc. Everything else was set @@ -536,8 +627,8 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, * or NULL if no more matching tuples exist. * ---------------- */ -ItemPointer -index_getnext_tid(IndexScanDesc scan, ScanDirection direction) +static ItemPointer +index_getnext_tid_internal(IndexScanDesc scan, ScanDirection direction) { bool found; @@ -640,12 +731,16 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { for (;;) { + /* Do prefetching (if requested/enabled). */ + index_prefetch_tids(scan, direction); + if (!scan->xs_heap_continue) { - ItemPointer tid; + ItemPointer tid; + bool all_visible; /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + tid = index_prefetch_get_tid(scan, direction, &all_visible); /* If we're out of index entries, we're done */ if (tid == NULL) @@ -1003,3 +1098,531 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * index_prefetch_is_sequential + * Track the block number and check if the I/O pattern is sequential, + * or if the same block was just prefetched. + * + * Prefetching is cheap, but for some access patterns the benefits are small + * compared to the extra overhead. In particular, for sequential access the + * read-ahead performed by the OS is very effective/efficient. Doing more + * prefetching is just increasing the costs. + * + * This tries to identify simple sequential patterns, so that we can skip + * the prefetching request. This is implemented by having a small queue + * of block numbers, and checking it before prefetching another block. + * + * We look at the preceding PREFETCH_SEQ_PATTERN_BLOCKS blocks, and see if + * they are sequential. We also check if the block is the same as the last + * request (which is not sequential). + * + * Note that the main prefetch queue is not really useful for this, as it + * stores TIDs while we care about block numbers. Consider a sorted table, + * with a perfectly sequential pattern when accessed through an index. Each + * heap page may have dozens of TIDs, but we need to check block numbers. + * We could keep enough TIDs to cover enough blocks, but then we also need + * to walk those when checking the pattern (in hot path). 
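+ * For example, with ~50 TIDs per heap page we might have to keep (and walk)
+ * a couple hundred TIDs just to reason about the last four blocks.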
+ *
+ * So we maintain a small separate queue of block numbers, and use that
+ * instead.
+ *
+ * Returns true if the block is in a sequential pattern (and so should not be
+ * prefetched), or false (not sequential, should be prefetched).
+ *
+ * XXX The name is a bit misleading, as it also adds the block number to the
+ * block queue and checks if the block is the same as the last one (which
+ * does not require a sequential pattern).
+ */
+static bool
+index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
+{
+	int			idx;
+
+	/*
+	 * If the block queue is empty, just store the block and we're done (it's
+	 * neither a sequential pattern nor a recently prefetched block).
+	 */
+	if (prefetch->blockIndex == 0)
+	{
+		prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+		prefetch->blockIndex++;
+		return false;
+	}
+
+	/*
+	 * Check if it's the same as the immediately preceding block. We don't
+	 * want to prefetch the same block over and over (which would happen for
+	 * well correlated indexes).
+	 *
+	 * In principle we could rely on index_prefetch_add_cache doing this using
+	 * the full cache, but this check is much cheaper and we need to look at
+	 * the preceding block anyway, so we just do it.
+	 *
+	 * XXX Notice we haven't added the block to the block queue yet, and there
+	 * is a preceding block (i.e. blockIndex-1 is valid).
+	 */
+	if (prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex - 1)] == block)
+		return true;
+
+	/*
+	 * Add the block number to the queue.
+	 *
+	 * We do this before checking the pattern, because we want to know about
+	 * the block even if we end up skipping the prefetch. Otherwise we'd not
+	 * be able to detect longer sequential patterns - we'd skip one block
+	 * but then fail to skip the next couple blocks even in a perfect
+	 * sequential pattern. This oscillation might even prevent the OS
+	 * read-ahead from kicking in.
+	 */
+	prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+	prefetch->blockIndex++;
+
+	/*
+	 * Check if the last couple blocks are in a sequential pattern. We look
+	 * for a sequential pattern of PREFETCH_SEQ_PATTERN_BLOCKS (4 by default),
+	 * so we look for patterns of 5 pages (40kB) including the new block.
+	 *
+	 * XXX Perhaps this should be tied to effective_io_concurrency somehow?
+	 *
+	 * XXX Could it be harmful that we read the queue backwards? Maybe memory
+	 * prefetching works better for the forward direction?
+	 */
+	for (int i = 1; i < PREFETCH_SEQ_PATTERN_BLOCKS; i++)
+	{
+		/*
+		 * Are there enough requests to confirm a sequential pattern? We only
+		 * consider something to be sequential after finding a sequence of
+		 * PREFETCH_SEQ_PATTERN_BLOCKS blocks.
+		 *
+		 * FIXME Better to move this outside the loop.
+		 */
+		if (prefetch->blockIndex < i)
+			return false;
+
+		/*
+		 * Calculate index of the earlier block (we need to do -1 as we
+		 * already incremented the index when adding the new block to the
+		 * queue).
+		 */
+		idx = PREFETCH_BLOCK_INDEX(prefetch->blockIndex - i - 1);
+
+		/*
+		 * For a sequential pattern, the block "k" steps back needs to have
+		 * a block number smaller by "k" than the current block.
+		 */
+		if (prefetch->blockItems[idx] != (block - i))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * index_prefetch_add_cache
+ *		Add a block to the cache, check if it was recently prefetched.
+ *
+ * We don't want to prefetch blocks that we already prefetched recently. It's
+ * cheap but not free, and the overhead may have measurable impact.
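+ * (On most platforms a prefetch means a posix_fadvise() syscall, plus a
+ * buffer mapping lookup.)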
+ *
+ * This check needs to be very cheap, even with fairly large caches (hundreds
+ * of entries, see PREFETCH_CACHE_SIZE).
+ *
+ * A simple queue would allow expiring the requests, but checking whether it
+ * contains a particular block would be expensive (linear search). Another
+ * option would be a simple hash table, which has fast lookup but does not
+ * allow expiring entries cheaply.
+ *
+ * The cache does not need to be perfect; we can accept false
+ * positives/negatives, as long as the rate is reasonably low. We also need
+ * to expire entries, so that only "recent" requests are remembered.
+ *
+ * We use a hybrid cache that is organized as many small LRU caches. Each
+ * block is mapped to a particular LRU by hashing (so it's a bit like a
+ * hash table). The LRU caches are tiny (e.g. 8 entries), and the expiration
+ * happens at the level of a single LRU (by tracking only the 8 most recent
+ * requests).
+ *
+ * This allows quick searches and expiration, but with false negatives (when
+ * a particular LRU has too many collisions, we may evict entries that are
+ * more recent than some in other LRUs).
+ *
+ * For example, imagine 128 LRU caches, each with 8 entries - that's 1024
+ * prefetch requests in total (these are the default parameters).
+ *
+ * The recency is determined using a prefetch counter, incremented every
+ * time we end up prefetching a block. The counter is uint64, so it should
+ * not wrap (125 zebibytes, would take ~4 million years at 1GB/s).
+ *
+ * To check if a block was prefetched recently, we calculate hash(block),
+ * and then linearly search the tiny LRU for an entry with the same block
+ * and a request number less than PREFETCH_CACHE_SIZE ago.
+ *
+ * At the same time, we either update the entry (for the queried block) if
+ * found, or replace the oldest/empty entry.
+ *
+ * If the block was not recently prefetched (i.e. we want to prefetch it),
+ * we increment the counter.
+ *
+ * Returns true if the block was recently prefetched (and thus we don't
+ * need to prefetch it again), or false (should do a prefetch).
+ *
+ * XXX It's a bit confusing these return values are inverse compared to
+ * what index_prefetch_is_sequential does.
+ */
+static bool
+index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
+{
+	PrefetchCacheEntry *entry;
+
+	/* map the block number to the LRU */
+	int			lru = hash_uint32(block) % PREFETCH_LRU_COUNT;
+
+	/* age/index of the oldest entry in the LRU, to maybe use */
+	uint64		oldestRequest = PG_UINT64_MAX;
+	int			oldestIndex = -1;
+
+	/*
+	 * First add the block to the (tiny) top-level LRU cache and see if it's
+	 * part of a sequential pattern. In this case we just ignore the block and
+	 * don't prefetch it - we expect read-ahead to do a better job.
+	 *
+	 * XXX Maybe we should still add the block to the hybrid cache, in case we
+	 * happen to access it later? That might help if we first scan a lot of
+	 * the table sequentially, and then randomly. Not sure that's very likely
+	 * with index access, though.
+	 */
+	if (index_prefetch_is_sequential(prefetch, block))
+	{
+		prefetch->countSkipSequential++;
+		return true;
+	}
+
+	/*
+	 * See if we recently prefetched this block - we simply scan the LRU
+	 * linearly. While doing that, we also track the oldest entry, so that we
+	 * know where to put the block if we don't find a matching entry.
+	 */
+	for (int i = 0; i < PREFETCH_LRU_SIZE; i++)
+	{
+		entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + i];
+
+		/* Is this the oldest prefetch request in this LRU?
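+		 * (An empty slot has request == 0, so it always qualifies.)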
*/ + if (entry->request < oldestRequest) + { + oldestRequest = entry->request; + oldestIndex = i; + } + + /* + * If the entry is unused (identified by request being set to 0), + * we're done. Notice the field is uint64, so empty entry is + * guaranteed to be the oldest one. + */ + if (entry->request == 0) + continue; + + /* Is this entry for the same block as the current request? */ + if (entry->block == block) + { + bool prefetched; + + /* + * Is the old request sufficiently recent? If yes, we treat the + * block as already prefetched. + * + * XXX We do add the cache size to the request in order not to + * have issues with uint64 underflows. + */ + prefetched = ((entry->request + PREFETCH_CACHE_SIZE) >= prefetch->prefetchReqNumber); + + /* Update the request number. */ + entry->request = ++prefetch->prefetchReqNumber; + + prefetch->countSkipCached += (prefetched) ? 1 : 0; + + return prefetched; + } + } + + /* + * We didn't find the block in the LRU, so store it either in an empty + * entry, or in the "oldest" prefetch request in this LRU. + */ + Assert((oldestIndex >= 0) && (oldestIndex < PREFETCH_LRU_SIZE)); + + /* FIXME do a nice macro */ + entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + oldestIndex]; + + entry->block = block; + entry->request = ++prefetch->prefetchReqNumber; + + /* not in the prefetch cache */ + return false; +} + +/* + * index_prefetch + * Prefetch the TID, unless it's sequential or recently prefetched. + * + * XXX Some ideas how to auto-tune the prefetching, so that unnecessary + * prefetching does not cause significant regressions (e.g. for nestloop + * with inner index scan). We could track number of rescans and number of + * items (TIDs) actually returned from the scan. Then we could calculate + * rows / rescan and use that to clamp prefetch target. + * + * That'd help with cases when a scan matches only very few rows, far less + * than the prefetchTarget, because the unnecessary prefetches are wasted + * I/O. Imagine a LIMIT on top of index scan, or something like that. + * + * Another option is to use the planner estimates - we know how many rows we're + * expecting to fetch (on average, assuming the estimates are reasonably + * accurate), so why not to use that? + * + * Of course, we could/should combine these two approaches. + * + * XXX The prefetching may interfere with the patch allowing us to evaluate + * conditions on the index tuple, in which case we may not need the heap + * tuple. Maybe if there's such filter, we should prefetch only pages that + * are not all-visible (and the same idea would also work for IOS), but + * it also makes the indexing a bit "aware" of the visibility stuff (which + * seems a somewhat wrong). Also, maybe we should consider the filter selectivity + * (if the index-only filter is expected to eliminate only few rows, then + * the vm check is pointless). Maybe this could/should be auto-tuning too, + * i.e. we could track how many heap tuples were needed after all, and then + * we would consider this when deciding whether to prefetch all-visible + * pages or not (matters only for regular index scans, not IOS). + * + * XXX Maybe we could/should also prefetch the next index block, e.g. stored + * in BTScanPosData.nextPage. + * + * XXX Could we tune the cache size based on execution statistics? We have + * a cache of limited size (PREFETCH_CACHE_SIZE = 1024 by default), but + * how do we know it's the right size? Ideally, we'd have a cache large + * enough to track actually cached blocks. 
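+ * With the default PREFETCH_CACHE_SIZE of 1024 entries we only remember
+ * the last ~8MB worth of requests.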
If the OS caches 10240 pages,
+ * then we may do 90% of prefetch requests unnecessarily. Or maybe there's
+ * a lot of contention, blocks are evicted quickly, and 90% of the blocks
+ * in the cache are not actually cached anymore? But we do have a concept
+ * of sequential request ID (PrefetchCacheEntry->request), which gives us
+ * information about "age" of the last prefetch. Now it's used only when
+ * evicting entries (to keep the more recent one), but maybe we could also
+ * use it when deciding if the page is cached. Right now any block that's
+ * in the cache is considered cached and not prefetched, but maybe we could
+ * have "max age", and tune it based on feedback from reading the blocks
+ * later. For example, if we find the block in cache and decide not to
+ * prefetch it, but then later find we have to do I/O, it means our cache
+ * is too large. And we could "reduce" the maximum age (measured from the
+ * current prefetchReqNumber value), so that only more recent blocks would
+ * be considered cached. Not sure about the opposite direction, where we
+ * decide to prefetch a block - AFAIK we don't have a way to determine if
+ * I/O was needed or not in this case (so we can't increase the max age).
+ * But maybe we could do that somehow speculatively, i.e. increase the
+ * value once in a while, and see what happens.
+ */
+static void
+index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible)
+{
+	IndexPrefetch prefetch = scan->xs_prefetch;
+	BlockNumber block;
+
+	/* by default not all visible (or we didn't check) */
+	*all_visible = false;
+
+	/*
+	 * No heap relation means bitmap index scan, which does prefetching at
+	 * the bitmap heap scan, so no prefetch here (we can't do it anyway,
+	 * without the heap).
+	 *
+	 * XXX But in this case we should have prefetchMaxTarget=0, because in
+	 * index_beginscan_bitmap() we disable prefetching. So maybe we should
+	 * just check that.
+	 */
+	if (!prefetch)
+		return;
+
+	/*
+	 * If we got here, prefetching is enabled and it's a node that supports
+	 * prefetching (i.e. it can't be a bitmap index scan).
+	 */
+	Assert(scan->heapRelation);
+
+	block = ItemPointerGetBlockNumber(tid);
+
+	/*
+	 * When prefetching for IOS, we want to only prefetch pages that are not
+	 * marked as all-visible (because not fetching all-visible pages is the
+	 * point of IOS).
+	 *
+	 * XXX This is not great, because it releases the VM buffer for each TID
+	 * we consider to prefetch. We should reuse that somehow, similar to the
+	 * actual IOS code. Ideally, we should use the same ioss_VMBuffer (if
+	 * we can propagate it here). Or at least do it for a bulk of prefetches,
+	 * although that's not very useful - after the ramp-up we will prefetch
+	 * the pages one by one anyway.
+	 *
+	 * XXX Ideally we'd also propagate this to the executor, so that the
+	 * nodeIndexonlyscan.c doesn't need to repeat the same VM check (which
+	 * is measurable). But the index_getnext_tid() is not really well
+	 * suited for that, so the API needs a change.
+	 */
+	if (skip_all_visible)
+	{
+		*all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+									  block,
+									  &prefetch->vmBuffer);
+
+		if (*all_visible)
+			return;
+	}
+
+	/*
+	 * Do not prefetch the same block over and over again.
+	 *
+	 * This happens e.g. for clustered or naturally correlated indexes (fkey
+	 * to a sequence ID). It's not expensive (the block is in page cache
+	 * already, so no I/O), but it's not free either.
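+	 * For example, a clustered index may return dozens of consecutive TIDs
+	 * pointing into the same heap page, and issuing a prefetch for each of
+	 * them would be pure overhead.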
+	 */
+	if (!index_prefetch_add_cache(prefetch, block))
+	{
+		prefetch->countPrefetch++;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
+		pgBufferUsage.blks_prefetches++;
+	}
+
+	prefetch->countAll++;
+}
+
+/* ----------------
+ *		index_getnext_tid - get the next TID from a scan
+ *
+ * The result is the next TID satisfying the scan keys,
+ * or NULL if no more matching tuples exist.
+ *
+ * FIXME not sure this handles xs_heapfetch correctly.
+ * ----------------
+ */
+ItemPointer
+index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+	bool		all_visible;	/* ignored */
+
+	/* Do prefetching (if requested/enabled). */
+	index_prefetch_tids(scan, direction);
+
+	/* Read the TID from the queue (or directly from the index). */
+	return index_prefetch_get_tid(scan, direction, &all_visible);
+}
+
+ItemPointer
+index_getnext_tid_vm(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+{
+	/* Do prefetching (if requested/enabled). */
+	index_prefetch_tids(scan, direction);
+
+	/* Read the TID from the queue (or directly from the index). */
+	return index_prefetch_get_tid(scan, direction, all_visible);
+}
+
+static void
+index_prefetch_tids(IndexScanDesc scan, ScanDirection direction)
+{
+	/* for convenience */
+	IndexPrefetch prefetch = scan->xs_prefetch;
+
+	/*
+	 * If the prefetching is still active (i.e. enabled and we still
+	 * haven't finished reading TIDs from the scan), read enough TIDs into
+	 * the queue until we hit the current target.
+	 */
+	if (PREFETCH_ACTIVE(prefetch))
+	{
+		/*
+		 * Ramp up the prefetch distance incrementally.
+		 *
+		 * Intentionally done first, before reading the TIDs into the
+		 * queue, so that there's always at least one item. Otherwise we
+		 * might get into a situation where we start with target=0 and no
+		 * TIDs loaded.
+		 */
+		prefetch->prefetchTarget = Min(prefetch->prefetchTarget + 1,
+									   prefetch->prefetchMaxTarget);
+
+		/*
+		 * Now read TIDs from the index until the queue is full (with
+		 * respect to the current prefetch target).
+		 */
+		while (!PREFETCH_FULL(prefetch))
+		{
+			ItemPointer tid;
+			bool		all_visible;
+
+			/* Time to fetch the next TID from the index */
+			tid = index_getnext_tid_internal(scan, direction);
+
+			/*
+			 * If we're out of index entries, we're done (and we mark the
+			 * prefetcher as inactive).
+			 */
+			if (tid == NULL)
+			{
+				prefetch->prefetchDone = true;
+				break;
+			}
+
+			Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+
+			/*
+			 * Issue the actual prefetch requests for the new TID.
+			 *
+			 * XXX All-visible pages are skipped only for index-only scans
+			 * (i.e. when scan->indexonly is set).
+			 */
+			index_prefetch(scan, tid, scan->indexonly, &all_visible);
+
+			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].tid = *tid;
+			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].all_visible = all_visible;
+			prefetch->queueEnd++;
+		}
+	}
+}
+
+static ItemPointer
+index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+{
+	/* for convenience */
+	IndexPrefetch prefetch = scan->xs_prefetch;
+
+	/*
+	 * With prefetching enabled (even if we already finished reading
+	 * all TIDs from the index scan), we need to return a TID from the
+	 * queue. Otherwise, we just get the next TID from the scan
+	 * directly.
+	 */
+	if (PREFETCH_ENABLED(prefetch))
+	{
+		/* Did we reach the end of the scan and the queue is empty?
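+		 * (Both conditions must hold - the index may be exhausted while
+		 * the queue still holds TIDs to return.)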
*/ + if (PREFETCH_DONE(prefetch)) + return NULL; + + scan->xs_heaptid = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].tid; + *all_visible = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].all_visible; + prefetch->queueIndex++; + } + else /* not prefetching, just do the regular work */ + { + ItemPointer tid; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid_internal(scan, direction); + *all_visible = false; + + /* If we're out of index entries, we're done */ + if (tid == NULL) + return NULL; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } + + /* Return the TID of the tuple we found. */ + return &scan->xs_heaptid; +} diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f1d71bc54e8..6810996edfd 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3568,6 +3568,7 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) !INSTR_TIME_IS_ZERO(usage->local_blk_write_time)); bool has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) || !INSTR_TIME_IS_ZERO(usage->temp_blk_write_time)); + bool has_prefetches = (usage->blks_prefetches > 0); bool show_planning = (planning && (has_shared || has_local || has_temp || has_shared_timing || @@ -3679,6 +3680,23 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) appendStringInfoChar(es->str, '\n'); } + /* As above, show only positive counter values. */ + if (has_prefetches) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "Prefetches:"); + + if (usage->blks_prefetches > 0) + appendStringInfo(es->str, " blocks=%lld", + (long long) usage->blks_prefetches); + + if (usage->blks_prefetch_rounds > 0) + appendStringInfo(es->str, " rounds=%lld", + (long long) usage->blks_prefetch_rounds); + + appendStringInfoChar(es->str, '\n'); + } + if (show_planning) es->indent--; } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 2fa2118f3c2..15fa3211667 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -770,11 +770,15 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, /* * May have to restart scan from this point if a potential conflict is * found. + * + * XXX Should this do index prefetch? Probably not worth it for unique + * constraints, I guess? Otherwise we should calculate prefetch_target + * just like in nodeIndexscan etc. */ retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, 0); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 81f27042bc4..91676ccff95 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -204,8 +204,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, /* Build scan key. */ skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); - /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0); + /* Start an index scan. + * + * XXX Should this do index prefetching? We're looking for a single tuple, + * probably using a PK / UNIQUE index, so does not seem worth it. 
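+	 * (The replica identity is backed by a unique index, so we expect to
+	 * fetch at most one matching tuple anyway.)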
If we + * reconsider this, calclate prefetch_target like in nodeIndexscan. + */ + scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, 0); retry: found = false; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index c383f34c066..0011d9f679c 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->blks_prefetch_rounds += add->blks_prefetch_rounds; + dst->blks_prefetches += add->blks_prefetches; INSTR_TIME_ADD(dst->shared_blk_read_time, add->shared_blk_read_time); INSTR_TIME_ADD(dst->shared_blk_write_time, add->shared_blk_write_time); INSTR_TIME_ADD(dst->local_blk_read_time, add->local_blk_read_time); @@ -259,6 +261,8 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->blks_prefetches += add->blks_prefetches - sub->blks_prefetches; + dst->blks_prefetch_rounds += add->blks_prefetch_rounds - sub->blks_prefetch_rounds; INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, add->shared_blk_read_time, sub->shared_blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f1db35665c8..b6660c10a63 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -43,7 +43,7 @@ #include "storage/predicate.h" #include "utils/memutils.h" #include "utils/rel.h" - +#include "utils/spccache.h" static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, @@ -65,6 +65,8 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + Relation heapRel = node->ss.ss_currentRelation; + bool all_visible; /* * extract necessary information from index scan node @@ -83,16 +85,47 @@ IndexOnlyNext(IndexOnlyScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index only scan is not parallel, or if we're * serially executing an index only scan that was planned to be * parallel. + * + * XXX Maybe we should enable prefetching, but prefetch only pages that + * are not all-visible (but checking that from the index code seems like + * a violation of layering etc). + * + * XXX This might lead to IOS being slower than plain index scan, if the + * table has a lot of pages that need recheck. 
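+		 * (We would pay a visibility-map lookup per TID on top of the
+		 * prefetch itself, which a plain index scan does not do.)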
*/ scandesc = index_beginscan(node->ss.ss_currentRelation, node->ioss_RelationDesc, estate->es_snapshot, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, + prefetch_max); + + /* + * Remember this is index-only scan, because of prefetching. Not the most + * elegant way to pass this info. + */ + scandesc->indexonly = true; node->ioss_ScanDesc = scandesc; @@ -116,7 +149,7 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while ((tid = index_getnext_tid_vm(scandesc, direction, &all_visible)) != NULL) { bool tuple_from_heap = false; @@ -155,8 +188,11 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * XXX Skip if we already know the page is all visible from prefetcher. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!all_visible && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -380,6 +416,18 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) node->ioss_VMBuffer = InvalidBuffer; } + /* Release VM buffer pin from prefetcher, if any. */ + if (indexScanDesc && indexScanDesc->xs_prefetch) + { + IndexPrefetch indexPrefetch = indexScanDesc->xs_prefetch; + + if (indexPrefetch->vmBuffer != InvalidBuffer) + { + ReleaseBuffer(indexPrefetch->vmBuffer); + indexPrefetch->vmBuffer = InvalidBuffer; + } + } + /* * close the index relation (no-op if we didn't open it) */ @@ -646,6 +694,24 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, { EState *estate = node->ss.ps.state; ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen); index_parallelscan_initialize(node->ss.ss_currentRelation, @@ -658,7 +724,8 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_RelationDesc, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -696,6 +763,23 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? 
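+	 * For example, dividing prefetch_max by (number of workers + 1) would
+	 * keep the combined I/O depth close to the tablespace setting.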
+ */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ioss_ScanDesc = @@ -703,7 +787,8 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_RelationDesc, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); node->ioss_ScanDesc->xs_want_itup = true; /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 14b9c00217a..a5f5394ef49 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -43,6 +43,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/spccache.h" /* * When an ordering operator is used, tuples fetched from the index that @@ -85,6 +86,7 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + Relation heapRel = node->ss.ss_currentRelation; /* * extract necessary information from index scan node @@ -103,6 +105,21 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index scan. This + * is essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. @@ -111,7 +128,8 @@ IndexNext(IndexScanState *node) node->iss_RelationDesc, estate->es_snapshot, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + prefetch_max); node->iss_ScanDesc = scandesc; @@ -177,6 +195,7 @@ IndexNextWithReorder(IndexScanState *node) Datum *lastfetched_vals; bool *lastfetched_nulls; int cmp; + Relation heapRel = node->ss.ss_currentRelation; estate = node->ss.ps.state; @@ -198,6 +217,21 @@ IndexNextWithReorder(IndexScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. 
@@ -206,7 +240,8 @@ IndexNextWithReorder(IndexScanState *node) node->iss_RelationDesc, estate->es_snapshot, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + prefetch_max); node->iss_ScanDesc = scandesc; @@ -1662,6 +1697,24 @@ ExecIndexScanInitializeDSM(IndexScanState *node, { EState *estate = node->ss.ps.state; ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen); index_parallelscan_initialize(node->ss.ss_currentRelation, @@ -1674,7 +1727,8 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_RelationDesc, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1710,6 +1764,23 @@ ExecIndexScanInitializeWorker(IndexScanState *node, ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->iss_ScanDesc = @@ -1717,7 +1788,8 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_RelationDesc, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index e11d022827a..8b662d371dd 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6289,9 +6289,10 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + /* XXX Maybe should do prefetching using the default prefetch parameters? 
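+	 * It might help when many dead endpoint tuples force us to visit
+	 * several heap pages, but typically we stop at the first one.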
*/ index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, - 1, 0); + 1, 0, 0); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 80dc8d54066..f6882f644d2 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -17,6 +17,7 @@ #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" +#include "storage/bufmgr.h" #include "storage/lockdefs.h" #include "utils/relcache.h" #include "utils/snapshot.h" @@ -154,7 +155,8 @@ extern void index_insert_cleanup(Relation indexRelation, extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, - int nkeys, int norderbys); + int nkeys, int norderbys, + int prefetch_max); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, int nkeys); @@ -171,9 +173,13 @@ extern void index_parallelscan_initialize(Relation heapRelation, extern void index_parallelrescan(IndexScanDesc scan); extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, int norderbys, - ParallelIndexScanDesc pscan); + ParallelIndexScanDesc pscan, + int prefetch_max); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern ItemPointer index_getnext_tid_vm(IndexScanDesc scan, + ScanDirection direction, + bool *all_visible); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, @@ -232,4 +238,104 @@ extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction); extern void systable_endscan_ordered(SysScanDesc sysscan); +/* + * Cache of recently prefetched blocks, organized as a hash table of + * small LRU caches. Doesn't need to be perfectly accurate, but we + * aim to make false positives/negatives reasonably low. + */ +typedef struct PrefetchCacheEntry { + BlockNumber block; + uint64 request; +} PrefetchCacheEntry; + +/* + * Size of the cache of recently prefetched blocks - shouldn't be too + * small or too large. 1024 seems about right, it covers ~8MB of data. + * It's somewhat arbitrary, there's no particular formula saying it + * should not be higher/lower. + * + * The cache is structured as an array of small LRU caches, so the total + * size needs to be a multiple of LRU size. The LRU should be tiny to + * keep linear search cheap enough. + * + * XXX Maybe we could consider effective_cache_size or something? + */ +#define PREFETCH_LRU_SIZE 8 +#define PREFETCH_LRU_COUNT 128 +#define PREFETCH_CACHE_SIZE (PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT) + +/* + * Used to detect sequential patterns (and disable prefetching). + */ +#define PREFETCH_QUEUE_HISTORY 8 +#define PREFETCH_SEQ_PATTERN_BLOCKS 4 + +typedef struct PrefetchEntry +{ + ItemPointerData tid; + bool all_visible; +} PrefetchEntry; + +typedef struct IndexPrefetchData +{ + /* + * XXX We need to disable this in some cases (e.g. when using index-only + * scans, we don't want to prefetch pages). Or maybe we should prefetch + * only pages that are not all-visible, that'd be even better. + */ + int prefetchTarget; /* how far we should be prefetching */ + int prefetchMaxTarget; /* maximum prefetching distance */ + int prefetchReset; /* reset to this distance on rescan */ + bool prefetchDone; /* did we get all TIDs from the index? 
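+									 * (the queue may still hold TIDs to return)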
*/ + + /* runtime statistics */ + uint64 countAll; /* all prefetch requests */ + uint64 countPrefetch; /* actual prefetches */ + uint64 countSkipSequential; + uint64 countSkipCached; + + /* used when prefetching index-only scans */ + Buffer vmBuffer; + + /* + * Queue of TIDs to prefetch. + * + * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler + * than dynamically adjusting for custom values. + */ + PrefetchEntry queueItems[MAX_IO_CONCURRENCY]; + uint64 queueIndex; /* next TID to prefetch */ + uint64 queueStart; /* first valid TID in queue */ + uint64 queueEnd; /* first invalid (empty) TID in queue */ + + /* + * A couple of last prefetched blocks, used to check for certain access + * pattern and skip prefetching - e.g. for sequential access). + * + * XXX Separate from the main queue, because we only want to compare the + * block numbers, not the whole TID. In sequential access it's likely we + * read many items from each page, and we don't want to check many items + * (as that is much more expensive). + */ + BlockNumber blockItems[PREFETCH_QUEUE_HISTORY]; + uint64 blockIndex; /* index in the block (points to the first + * empty entry)*/ + + /* + * Cache of recently prefetched blocks, organized as a hash table of + * small LRU caches. + */ + uint64 prefetchReqNumber; + PrefetchCacheEntry prefetchCache[PREFETCH_CACHE_SIZE]; + +} IndexPrefetchData; + +#define PREFETCH_QUEUE_INDEX(a) ((a) % (MAX_IO_CONCURRENCY)) +#define PREFETCH_QUEUE_EMPTY(p) ((p)->queueEnd == (p)->queueIndex) +#define PREFETCH_ENABLED(p) ((p) && ((p)->prefetchMaxTarget > 0)) +#define PREFETCH_FULL(p) ((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget) +#define PREFETCH_DONE(p) ((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p))) +#define PREFETCH_ACTIVE(p) (PREFETCH_ENABLED(p) && !(p)->prefetchDone) +#define PREFETCH_BLOCK_INDEX(v) ((v) % PREFETCH_QUEUE_HISTORY) + #endif /* GENAM_H */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d03360eac04..d5903492c6e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -106,6 +106,12 @@ typedef struct IndexFetchTableData Relation rel; } IndexFetchTableData; +/* + * Forward declarations, defined in genam.h. + */ +typedef struct IndexPrefetchData IndexPrefetchData; +typedef struct IndexPrefetchData *IndexPrefetch; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. Some fields are only relevant in @@ -129,6 +135,7 @@ typedef struct IndexScanDescData bool ignore_killed_tuples; /* do not return killed entries */ bool xactStartedInRecovery; /* prevents killing/seeing killed * tuples */ + bool indexonly; /* is this index-only scan? 
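+									 * (set by nodeIndexonlyscan.c so the
+									 * prefetcher can skip all-visible pages)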
*/ /* index access method's private state */ void *opaque; /* access-method-specific info */ @@ -162,6 +169,9 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + /* prefetching state (or NULL if disabled for this scan) */ + IndexPrefetchData *xs_prefetch; + /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index d5d69941c52..f53fb4a1e51 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -33,6 +33,8 @@ typedef struct BufferUsage int64 local_blks_written; /* # of local disk blocks written */ int64 temp_blks_read; /* # of temp blocks read */ int64 temp_blks_written; /* # of temp blocks written */ + int64 blks_prefetch_rounds; /* # of prefetch rounds */ + int64 blks_prefetches; /* # of buffers prefetched */ instr_time shared_blk_read_time; /* time spent reading shared blocks */ instr_time shared_blk_write_time; /* time spent writing shared blocks */ instr_time local_blk_read_time; /* time spent reading local blocks */ -- 2.41.0