From a3335da2a7a28dbb258380fa23d9ddd7c887f1d9 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Fri, 17 Nov 2023 23:54:19 +0100
Subject: [PATCH v20231209 1/2] prefetch 2023-11-24

Patch version shared on 2023/11/24.
---
 src/backend/access/heap/heapam_handler.c |  12 +-
 src/backend/access/index/genam.c         |  31 +-
 src/backend/access/index/indexam.c       | 645 ++++++++++++++++++++++-
 src/backend/commands/explain.c           |  18 +
 src/backend/executor/execIndexing.c      |   6 +-
 src/backend/executor/execReplication.c   |   9 +-
 src/backend/executor/instrument.c        |   4 +
 src/backend/executor/nodeIndexonlyscan.c |  97 +++-
 src/backend/executor/nodeIndexscan.c     |  80 ++-
 src/backend/utils/adt/selfuncs.c         |   3 +-
 src/include/access/genam.h               | 110 +++-
 src/include/access/relscan.h             |  10 +
 src/include/executor/instrument.h        |   2 +
 13 files changed, 997 insertions(+), 30 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 7c28dafb728..89474078951 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -44,6 +44,7 @@
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
@@ -747,6 +748,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		PROGRESS_CLUSTER_INDEX_RELID
 	};
 	int64		ci_val[2];
+	int			prefetch_max;
+
+	/*
+	 * Get the prefetch target for the old tablespace (which is what we'll
+	 * read using the index). We'll use it as a reset value too, although
+	 * there should be no rescans for CLUSTER etc.
+	 */
+	prefetch_max = get_tablespace_io_concurrency(OldHeap->rd_rel->reltablespace);
 
 	/* Set phase and OIDOldIndex to columns */
 	ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
@@ -755,7 +764,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		tableScan = NULL;
 		heapScan = NULL;
-		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0,
+									prefetch_max);
 		index_rescan(indexScan, NULL, 0, NULL, 0);
 	}
 	else
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 4ca12006843..d45a209ee3a 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -126,6 +126,9 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_hitup = NULL;
 	scan->xs_hitupdesc = NULL;
 
+	/* Information used for asynchronous prefetching during index scans. */
+	scan->xs_prefetch = NULL;
+
 	return scan;
 }
 
@@ -440,8 +443,20 @@ systable_beginscan(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * We don't do any prefetching on system catalogs, for two main reasons.
+	 *
+	 * Firstly, we usually do PK lookups, which makes prefetching pointless,
+	 * or we often don't know how many rows to expect (and the numbers tend
+	 * to be fairly low). So it's not clear it'd help. Furthermore, places
+	 * that are sensitive tend to use syscache anyway.
+	 *
+	 * Secondly, we can't call get_tablespace_io_concurrency() because that
+	 * does a sysscan internally, so it might lead to a cycle. We could use
+	 * effective_io_concurrency, but it doesn't seem worth it.
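+	 * (For example, a lookup of a single pg_class row by OID fetches one
+	 * heap tuple, so there is nothing to prefetch ahead of it.)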
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, irel,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, 0);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 	sysscan->scan = NULL;
 }
@@ -696,8 +711,20 @@ systable_beginscan_ordered(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * We don't do any prefetching on system catalogs, for two main reasons.
+	 *
+	 * Firstly, we usually do PK lookups, which makes prefetching pointless,
+	 * or we often don't know how many rows to expect (and the numbers tend
+	 * to be fairly low). So it's not clear it'd help. Furthermore, places
+	 * that are sensitive tend to use syscache anyway.
+	 *
+	 * Secondly, we can't call get_tablespace_io_concurrency() because that
+	 * does a sysscan internally, so it might lead to a cycle. We could use
+	 * effective_io_concurrency, but it doesn't seem worth it.
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, 0);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 
 	sysscan->scan = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index f23e0199f08..e493548b68a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -49,16 +49,19 @@
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xlog.h"
 #include "catalog/index.h"
 #include "catalog/pg_amproc.h"
 #include "catalog/pg_type.h"
 #include "commands/defrem.h"
+#include "common/hashfn.h"
 #include "nodes/makefuncs.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/lsyscache.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
@@ -106,7 +109,12 @@ do { \
 static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  int nkeys, int norderbys,
 											  Snapshot snapshot,
-											  ParallelIndexScanDesc pscan, bool temp_snap);
+											  ParallelIndexScanDesc pscan, bool temp_snap,
+											  int prefetch_max);
+
+static void index_prefetch_tids(IndexScanDesc scan, ScanDirection direction);
+static ItemPointer index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible);
+static void index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible);
 
 
 /* ----------------------------------------------------------------
@@ -215,18 +223,42 @@ index_insert_cleanup(Relation indexRelation,
  * index_beginscan - start a scan of an index with amgettuple
  *
  * Caller must be holding suitable locks on the heap and the index.
+ *
+ * prefetch_max determines if prefetching is requested for this index scan,
+ * and how far ahead we want to prefetch.
+ *
+ * Setting prefetch_max to 0 disables prefetching for the index scan. We do
+ * this for two reasons - for scans on system catalogs, and for cases where
+ * prefetching is expected to be pointless (like IOS).
+ *
+ * For system catalogs, we usually either scan by a PK value, or we expect
+ * only a few rows (or rather we don't know how many rows to expect). Also,
+ * we need to prevent an infinite loop in the get_tablespace_io_concurrency()
+ * call - it does an index scan internally. So we simply disable prefetching
+ * for system catalogs. We could deal with this by picking a conservative
+ * static target (e.g. effective_io_concurrency, capped to something), but
+ * places that are performance sensitive likely use syscache anyway, and
+ * catalogs tend to be very small and hot.
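+ * Most catalogs span only a few pages and quickly end up fully cached
+ * anyway.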
So we don't bother. + * + * For IOS, we expect to not need most heap pages (that's the whole point of + * IOS, actually), and prefetching them might lead to a lot of wasted I/O. + * + * XXX Not sure the infinite loop can still happen, now that the target lookup + * moved to callers of index_beginscan. */ IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, - int nkeys, int norderbys) + int nkeys, int norderbys, + int prefetch_max) { IndexScanDesc scan; Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); + scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, + NULL, false, prefetch_max); /* * Save additional parameters into the scandesc. Everything else was set @@ -256,7 +288,8 @@ index_beginscan_bitmap(Relation indexRelation, Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); + /* No prefetch in bitmap scans, prefetch is done by the heap scan. */ + scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false, 0); /* * Save additional parameters into the scandesc. Everything else was set @@ -273,7 +306,8 @@ index_beginscan_bitmap(Relation indexRelation, static IndexScanDesc index_beginscan_internal(Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap) + ParallelIndexScanDesc pscan, bool temp_snap, + int prefetch_max) { IndexScanDesc scan; @@ -296,6 +330,31 @@ index_beginscan_internal(Relation indexRelation, /* Initialize information for parallel scan. */ scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + scan->indexonly = false; + + /* + * With prefetching requested, initialize the prefetcher state. + * + * FIXME This should really be in the IndexScanState, not IndexScanDesc + * (certainly the queues etc). But index_getnext_tid only gets the scan + * descriptor, so how else would we pass it? Seems like a sign of wrong + * layer doing the prefetching. + */ + if ((prefetch_max > 0) && + (io_direct_flags & IO_DIRECT_DATA) == 0) /* no prefetching for direct I/O */ + { + IndexPrefetch prefetcher = palloc0(sizeof(IndexPrefetchData)); + + prefetcher->queueIndex = 0; + prefetcher->queueStart = 0; + prefetcher->queueEnd = 0; + + prefetcher->prefetchTarget = 0; + prefetcher->prefetchMaxTarget = prefetch_max; + prefetcher->vmBuffer = InvalidBuffer; + + scan->xs_prefetch = prefetcher; + } return scan; } @@ -332,6 +391,20 @@ index_rescan(IndexScanDesc scan, scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); + + /* If we're prefetching for this index, maybe reset some of the state. */ + if (scan->xs_prefetch != NULL) + { + IndexPrefetch prefetcher = scan->xs_prefetch; + + prefetcher->queueStart = 0; + prefetcher->queueEnd = 0; + prefetcher->queueIndex = 0; + prefetcher->prefetchDone = false; + + /* restart the incremental ramp-up */ + prefetcher->prefetchTarget = 0; + } } /* ---------------- @@ -360,6 +433,23 @@ index_endscan(IndexScanDesc scan) if (scan->xs_temp_snap) UnregisterSnapshot(scan->xs_snapshot); + /* + * If prefetching was enabled for this scan, log prefetch stats. + * + * FIXME This should really go to EXPLAIN ANALYZE instead. 
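+	 * A sample line, with illustrative counts: "index prefetch stats:
+	 * requests 10000 prefetches 4000 (40.000000) skip cached 3000
+	 * sequential 3000".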
+ */ + if (scan->xs_prefetch) + { + IndexPrefetch prefetch = scan->xs_prefetch; + + elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT, + prefetch->countAll, + prefetch->countPrefetch, + prefetch->countPrefetch * 100.0 / prefetch->countAll, + prefetch->countSkipCached, + prefetch->countSkipSequential); + } + /* Release the scan data structure itself */ IndexScanEnd(scan); } @@ -505,7 +595,8 @@ index_parallelrescan(IndexScanDesc scan) */ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, - int norderbys, ParallelIndexScanDesc pscan) + int norderbys, ParallelIndexScanDesc pscan, + int prefetch_max) { Snapshot snapshot; IndexScanDesc scan; @@ -514,7 +605,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, snapshot = RestoreSnapshot(pscan->ps_snapshot_data); RegisterSnapshot(snapshot); scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, - pscan, true); + pscan, true, prefetch_max); /* * Save additional parameters into the scandesc. Everything else was set @@ -536,8 +627,8 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, * or NULL if no more matching tuples exist. * ---------------- */ -ItemPointer -index_getnext_tid(IndexScanDesc scan, ScanDirection direction) +static ItemPointer +index_getnext_tid_internal(IndexScanDesc scan, ScanDirection direction) { bool found; @@ -640,12 +731,16 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { for (;;) { + /* Do prefetching (if requested/enabled). */ + index_prefetch_tids(scan, direction); + if (!scan->xs_heap_continue) { - ItemPointer tid; + ItemPointer tid; + bool all_visible; /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + tid = index_prefetch_get_tid(scan, direction, &all_visible); /* If we're out of index entries, we're done */ if (tid == NULL) @@ -1003,3 +1098,531 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * index_prefetch_is_sequential + * Track the block number and check if the I/O pattern is sequential, + * or if the same block was just prefetched. + * + * Prefetching is cheap, but for some access patterns the benefits are small + * compared to the extra overhead. In particular, for sequential access the + * read-ahead performed by the OS is very effective/efficient. Doing more + * prefetching is just increasing the costs. + * + * This tries to identify simple sequential patterns, so that we can skip + * the prefetching request. This is implemented by having a small queue + * of block numbers, and checking it before prefetching another block. + * + * We look at the preceding PREFETCH_SEQ_PATTERN_BLOCKS blocks, and see if + * they are sequential. We also check if the block is the same as the last + * request (which is not sequential). + * + * Note that the main prefetch queue is not really useful for this, as it + * stores TIDs while we care about block numbers. Consider a sorted table, + * with a perfectly sequential pattern when accessed through an index. Each + * heap page may have dozens of TIDs, but we need to check block numbers. + * We could keep enough TIDs to cover enough blocks, but then we also need + * to walk those when checking the pattern (in hot path). 
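+ * For example, with ~50 TIDs per heap page we might have to keep (and walk)
+ * a couple hundred TIDs just to reason about the last four blocks.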
+ *
+ * So we maintain a small separate queue of block numbers, and use that
+ * instead.
+ *
+ * Returns true if the block is in a sequential pattern (and so should not be
+ * prefetched), or false (not sequential, should be prefetched).
+ *
+ * XXX The name is a bit misleading, as it also adds the block number to the
+ * block queue and checks if the block is the same as the last one (which
+ * does not require a sequential pattern).
+ */
+static bool
+index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
+{
+	int			idx;
+
+	/*
+	 * If the block queue is empty, just store the block and we're done (it's
+	 * neither a sequential pattern nor a recently prefetched block).
+	 */
+	if (prefetch->blockIndex == 0)
+	{
+		prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+		prefetch->blockIndex++;
+		return false;
+	}
+
+	/*
+	 * Check if it's the same as the immediately preceding block. We don't
+	 * want to prefetch the same block over and over (which would happen for
+	 * well correlated indexes).
+	 *
+	 * In principle we could rely on index_prefetch_add_cache doing this using
+	 * the full cache, but this check is much cheaper and we need to look at
+	 * the preceding block anyway, so we just do it.
+	 *
+	 * XXX Notice we haven't added the block to the block queue yet, and there
+	 * is a preceding block (i.e. blockIndex-1 is valid).
+	 */
+	if (prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex - 1)] == block)
+		return true;
+
+	/*
+	 * Add the block number to the queue.
+	 *
+	 * We do this before checking the pattern, because we want to know about
+	 * the block even if we end up skipping the prefetch. Otherwise we'd not
+	 * be able to detect longer sequential patterns - we'd skip one block
+	 * but then fail to skip the next couple blocks even in a perfect
+	 * sequential pattern. This oscillation might even prevent the OS
+	 * read-ahead from kicking in.
+	 */
+	prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+	prefetch->blockIndex++;
+
+	/*
+	 * Check if the last couple blocks are in a sequential pattern. We look
+	 * for a sequential pattern of PREFETCH_SEQ_PATTERN_BLOCKS (4 by default),
+	 * so we look for patterns of 5 pages (40kB) including the new block.
+	 *
+	 * XXX Perhaps this should be tied to effective_io_concurrency somehow?
+	 *
+	 * XXX Could it be harmful that we read the queue backwards? Maybe memory
+	 * prefetching works better for the forward direction?
+	 */
+	for (int i = 1; i < PREFETCH_SEQ_PATTERN_BLOCKS; i++)
+	{
+		/*
+		 * Are there enough requests to confirm a sequential pattern? We only
+		 * consider something to be sequential after finding a sequence of
+		 * PREFETCH_SEQ_PATTERN_BLOCKS blocks.
+		 *
+		 * FIXME Better to move this outside the loop.
+		 */
+		if (prefetch->blockIndex < i)
+			return false;
+
+		/*
+		 * Calculate index of the earlier block (we need to do -1 as we
+		 * already incremented the index when adding the new block to the
+		 * queue).
+		 */
+		idx = PREFETCH_BLOCK_INDEX(prefetch->blockIndex - i - 1);
+
+		/*
+		 * For a sequential pattern, the block "k" steps back needs to have
+		 * a block number smaller by "k" than the current block.
+		 */
+		if (prefetch->blockItems[idx] != (block - i))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * index_prefetch_add_cache
+ *		Add a block to the cache, check if it was recently prefetched.
+ *
+ * We don't want to prefetch blocks that we already prefetched recently. It's
+ * cheap but not free, and the overhead may have measurable impact.
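+ * (On most platforms a prefetch means a posix_fadvise() syscall, plus a
+ * buffer mapping lookup.)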
+ *
+ * This check needs to be very cheap, even with fairly large caches (hundreds
+ * of entries, see PREFETCH_CACHE_SIZE).
+ *
+ * A simple queue would allow expiring the requests, but checking whether it
+ * contains a particular block would be expensive (linear search). Another
+ * option would be a simple hash table, which has fast lookup but does not
+ * allow expiring entries cheaply.
+ *
+ * The cache does not need to be perfect; we can accept false
+ * positives/negatives, as long as the rate is reasonably low. We also need
+ * to expire entries, so that only "recent" requests are remembered.
+ *
+ * We use a hybrid cache that is organized as many small LRU caches. Each
+ * block is mapped to a particular LRU by hashing (so it's a bit like a
+ * hash table). The LRU caches are tiny (e.g. 8 entries), and the expiration
+ * happens at the level of a single LRU (by tracking only the 8 most recent
+ * requests).
+ *
+ * This allows quick searches and expiration, but with false negatives (when
+ * a particular LRU has too many collisions, we may evict entries that are
+ * more recent than some in other LRUs).
+ *
+ * For example, imagine 128 LRU caches, each with 8 entries - that's 1024
+ * prefetch requests in total (these are the default parameters).
+ *
+ * The recency is determined using a prefetch counter, incremented every
+ * time we end up prefetching a block. The counter is uint64, so it should
+ * not wrap (125 zebibytes, would take ~4 million years at 1GB/s).
+ *
+ * To check if a block was prefetched recently, we calculate hash(block),
+ * and then linearly search the tiny LRU for an entry with the same block
+ * and a request number less than PREFETCH_CACHE_SIZE ago.
+ *
+ * At the same time, we either update the entry (for the queried block) if
+ * found, or replace the oldest/empty entry.
+ *
+ * If the block was not recently prefetched (i.e. we want to prefetch it),
+ * we increment the counter.
+ *
+ * Returns true if the block was recently prefetched (and thus we don't
+ * need to prefetch it again), or false (should do a prefetch).
+ *
+ * XXX It's a bit confusing these return values are inverse compared to
+ * what index_prefetch_is_sequential does.
+ */
+static bool
+index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
+{
+	PrefetchCacheEntry *entry;
+
+	/* map the block number to the LRU */
+	int			lru = hash_uint32(block) % PREFETCH_LRU_COUNT;
+
+	/* age/index of the oldest entry in the LRU, to maybe use */
+	uint64		oldestRequest = PG_UINT64_MAX;
+	int			oldestIndex = -1;
+
+	/*
+	 * First add the block to the (tiny) top-level LRU cache and see if it's
+	 * part of a sequential pattern. In this case we just ignore the block and
+	 * don't prefetch it - we expect read-ahead to do a better job.
+	 *
+	 * XXX Maybe we should still add the block to the hybrid cache, in case we
+	 * happen to access it later? That might help if we first scan a lot of
+	 * the table sequentially, and then randomly. Not sure that's very likely
+	 * with index access, though.
+	 */
+	if (index_prefetch_is_sequential(prefetch, block))
+	{
+		prefetch->countSkipSequential++;
+		return true;
+	}
+
+	/*
+	 * See if we recently prefetched this block - we simply scan the LRU
+	 * linearly. While doing that, we also track the oldest entry, so that we
+	 * know where to put the block if we don't find a matching entry.
+	 */
+	for (int i = 0; i < PREFETCH_LRU_SIZE; i++)
+	{
+		entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + i];
+
+		/* Is this the oldest prefetch request in this LRU?
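+		 * (An empty slot has request == 0, so it always qualifies.)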
*/ + if (entry->request < oldestRequest) + { + oldestRequest = entry->request; + oldestIndex = i; + } + + /* + * If the entry is unused (identified by request being set to 0), + * we're done. Notice the field is uint64, so empty entry is + * guaranteed to be the oldest one. + */ + if (entry->request == 0) + continue; + + /* Is this entry for the same block as the current request? */ + if (entry->block == block) + { + bool prefetched; + + /* + * Is the old request sufficiently recent? If yes, we treat the + * block as already prefetched. + * + * XXX We do add the cache size to the request in order not to + * have issues with uint64 underflows. + */ + prefetched = ((entry->request + PREFETCH_CACHE_SIZE) >= prefetch->prefetchReqNumber); + + /* Update the request number. */ + entry->request = ++prefetch->prefetchReqNumber; + + prefetch->countSkipCached += (prefetched) ? 1 : 0; + + return prefetched; + } + } + + /* + * We didn't find the block in the LRU, so store it either in an empty + * entry, or in the "oldest" prefetch request in this LRU. + */ + Assert((oldestIndex >= 0) && (oldestIndex < PREFETCH_LRU_SIZE)); + + /* FIXME do a nice macro */ + entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + oldestIndex]; + + entry->block = block; + entry->request = ++prefetch->prefetchReqNumber; + + /* not in the prefetch cache */ + return false; +} + +/* + * index_prefetch + * Prefetch the TID, unless it's sequential or recently prefetched. + * + * XXX Some ideas how to auto-tune the prefetching, so that unnecessary + * prefetching does not cause significant regressions (e.g. for nestloop + * with inner index scan). We could track number of rescans and number of + * items (TIDs) actually returned from the scan. Then we could calculate + * rows / rescan and use that to clamp prefetch target. + * + * That'd help with cases when a scan matches only very few rows, far less + * than the prefetchTarget, because the unnecessary prefetches are wasted + * I/O. Imagine a LIMIT on top of index scan, or something like that. + * + * Another option is to use the planner estimates - we know how many rows we're + * expecting to fetch (on average, assuming the estimates are reasonably + * accurate), so why not to use that? + * + * Of course, we could/should combine these two approaches. + * + * XXX The prefetching may interfere with the patch allowing us to evaluate + * conditions on the index tuple, in which case we may not need the heap + * tuple. Maybe if there's such filter, we should prefetch only pages that + * are not all-visible (and the same idea would also work for IOS), but + * it also makes the indexing a bit "aware" of the visibility stuff (which + * seems a somewhat wrong). Also, maybe we should consider the filter selectivity + * (if the index-only filter is expected to eliminate only few rows, then + * the vm check is pointless). Maybe this could/should be auto-tuning too, + * i.e. we could track how many heap tuples were needed after all, and then + * we would consider this when deciding whether to prefetch all-visible + * pages or not (matters only for regular index scans, not IOS). + * + * XXX Maybe we could/should also prefetch the next index block, e.g. stored + * in BTScanPosData.nextPage. + * + * XXX Could we tune the cache size based on execution statistics? We have + * a cache of limited size (PREFETCH_CACHE_SIZE = 1024 by default), but + * how do we know it's the right size? Ideally, we'd have a cache large + * enough to track actually cached blocks. 
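+ * With the default PREFETCH_CACHE_SIZE of 1024 entries we only remember
+ * the last ~8MB worth of requests.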
If the OS caches 10240 pages,
+ * then we may do 90% of prefetch requests unnecessarily. Or maybe there's
+ * a lot of contention, blocks are evicted quickly, and 90% of the blocks
+ * in the cache are not actually cached anymore? But we do have a concept
+ * of sequential request ID (PrefetchCacheEntry->request), which gives us
+ * information about "age" of the last prefetch. Now it's used only when
+ * evicting entries (to keep the more recent one), but maybe we could also
+ * use it when deciding if the page is cached. Right now any block that's
+ * in the cache is considered cached and not prefetched, but maybe we could
+ * have "max age", and tune it based on feedback from reading the blocks
+ * later. For example, if we find the block in cache and decide not to
+ * prefetch it, but then later find we have to do I/O, it means our cache
+ * is too large. And we could "reduce" the maximum age (measured from the
+ * current prefetchReqNumber value), so that only more recent blocks would
+ * be considered cached. Not sure about the opposite direction, where we
+ * decide to prefetch a block - AFAIK we don't have a way to determine if
+ * I/O was needed or not in this case (so we can't increase the max age).
+ * But maybe we could do that somehow speculatively, i.e. increase the
+ * value once in a while, and see what happens.
+ */
+static void
+index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible)
+{
+	IndexPrefetch prefetch = scan->xs_prefetch;
+	BlockNumber block;
+
+	/* by default not all visible (or we didn't check) */
+	*all_visible = false;
+
+	/*
+	 * No heap relation means bitmap index scan, which does prefetching at
+	 * the bitmap heap scan, so no prefetch here (we can't do it anyway,
+	 * without the heap).
+	 *
+	 * XXX But in this case we should have prefetchMaxTarget=0, because in
+	 * index_beginscan_bitmap() we disable prefetching. So maybe we should
+	 * just check that.
+	 */
+	if (!prefetch)
+		return;
+
+	/*
+	 * If we got here, prefetching is enabled and it's a node that supports
+	 * prefetching (i.e. it can't be a bitmap index scan).
+	 */
+	Assert(scan->heapRelation);
+
+	block = ItemPointerGetBlockNumber(tid);
+
+	/*
+	 * When prefetching for IOS, we want to only prefetch pages that are not
+	 * marked as all-visible (because not fetching all-visible pages is the
+	 * point of IOS).
+	 *
+	 * XXX This is not great, because it releases the VM buffer for each TID
+	 * we consider to prefetch. We should reuse that somehow, similar to the
+	 * actual IOS code. Ideally, we should use the same ioss_VMBuffer (if
+	 * we can propagate it here). Or at least do it for a bulk of prefetches,
+	 * although that's not very useful - after the ramp-up we will prefetch
+	 * the pages one by one anyway.
+	 *
+	 * XXX Ideally we'd also propagate this to the executor, so that the
+	 * nodeIndexonlyscan.c doesn't need to repeat the same VM check (which
+	 * is measurable). But the index_getnext_tid() is not really well
+	 * suited for that, so the API needs a change.
+	 */
+	if (skip_all_visible)
+	{
+		*all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+									  block,
+									  &prefetch->vmBuffer);
+
+		if (*all_visible)
+			return;
+	}
+
+	/*
+	 * Do not prefetch the same block over and over again.
+	 *
+	 * This happens e.g. for clustered or naturally correlated indexes (fkey
+	 * to a sequence ID). It's not expensive (the block is in page cache
+	 * already, so no I/O), but it's not free either.
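+	 * For example, a clustered index may return dozens of consecutive TIDs
+	 * pointing into the same heap page, and issuing a prefetch for each of
+	 * them would be pure overhead.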
+	 */
+	if (!index_prefetch_add_cache(prefetch, block))
+	{
+		prefetch->countPrefetch++;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
+		pgBufferUsage.blks_prefetches++;
+	}
+
+	prefetch->countAll++;
+}
+
+/* ----------------
+ *		index_getnext_tid - get the next TID from a scan
+ *
+ * The result is the next TID satisfying the scan keys,
+ * or NULL if no more matching tuples exist.
+ *
+ * FIXME not sure this handles xs_heapfetch correctly.
+ * ----------------
+ */
+ItemPointer
+index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+	bool		all_visible;	/* ignored */
+
+	/* Do prefetching (if requested/enabled). */
+	index_prefetch_tids(scan, direction);
+
+	/* Read the TID from the queue (or directly from the index). */
+	return index_prefetch_get_tid(scan, direction, &all_visible);
+}
+
+ItemPointer
+index_getnext_tid_vm(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+{
+	/* Do prefetching (if requested/enabled). */
+	index_prefetch_tids(scan, direction);
+
+	/* Read the TID from the queue (or directly from the index). */
+	return index_prefetch_get_tid(scan, direction, all_visible);
+}
+
+static void
+index_prefetch_tids(IndexScanDesc scan, ScanDirection direction)
+{
+	/* for convenience */
+	IndexPrefetch prefetch = scan->xs_prefetch;
+
+	/*
+	 * If the prefetching is still active (i.e. enabled and we still
+	 * haven't finished reading TIDs from the scan), read enough TIDs into
+	 * the queue until we hit the current target.
+	 */
+	if (PREFETCH_ACTIVE(prefetch))
+	{
+		/*
+		 * Ramp up the prefetch distance incrementally.
+		 *
+		 * Intentionally done first, before reading the TIDs into the
+		 * queue, so that there's always at least one item. Otherwise we
+		 * might get into a situation where we start with target=0 and no
+		 * TIDs loaded.
+		 */
+		prefetch->prefetchTarget = Min(prefetch->prefetchTarget + 1,
+									   prefetch->prefetchMaxTarget);
+
+		/*
+		 * Now read TIDs from the index until the queue is full (with
+		 * respect to the current prefetch target).
+		 */
+		while (!PREFETCH_FULL(prefetch))
+		{
+			ItemPointer tid;
+			bool		all_visible;
+
+			/* Time to fetch the next TID from the index */
+			tid = index_getnext_tid_internal(scan, direction);
+
+			/*
+			 * If we're out of index entries, we're done (and we mark the
+			 * prefetcher as inactive).
+			 */
+			if (tid == NULL)
+			{
+				prefetch->prefetchDone = true;
+				break;
+			}
+
+			Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+
+			/*
+			 * Issue the actual prefetch requests for the new TID.
+			 *
+			 * XXX All-visible pages are skipped only for index-only scans
+			 * (i.e. when scan->indexonly is set).
+			 */
+			index_prefetch(scan, tid, scan->indexonly, &all_visible);
+
+			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].tid = *tid;
+			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].all_visible = all_visible;
+			prefetch->queueEnd++;
+		}
+	}
+}
+
+static ItemPointer
+index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+{
+	/* for convenience */
+	IndexPrefetch prefetch = scan->xs_prefetch;
+
+	/*
+	 * With prefetching enabled (even if we already finished reading
+	 * all TIDs from the index scan), we need to return a TID from the
+	 * queue. Otherwise, we just get the next TID from the scan
+	 * directly.
+	 */
+	if (PREFETCH_ENABLED(prefetch))
+	{
+		/* Did we reach the end of the scan and the queue is empty?
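+		 * (Both conditions must hold - the index may be exhausted while
+		 * the queue still holds TIDs to return.)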
*/ + if (PREFETCH_DONE(prefetch)) + return NULL; + + scan->xs_heaptid = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].tid; + *all_visible = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].all_visible; + prefetch->queueIndex++; + } + else /* not prefetching, just do the regular work */ + { + ItemPointer tid; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid_internal(scan, direction); + *all_visible = false; + + /* If we're out of index entries, we're done */ + if (tid == NULL) + return NULL; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } + + /* Return the TID of the tuple we found. */ + return &scan->xs_heaptid; +} diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f1d71bc54e8..6810996edfd 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3568,6 +3568,7 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) !INSTR_TIME_IS_ZERO(usage->local_blk_write_time)); bool has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) || !INSTR_TIME_IS_ZERO(usage->temp_blk_write_time)); + bool has_prefetches = (usage->blks_prefetches > 0); bool show_planning = (planning && (has_shared || has_local || has_temp || has_shared_timing || @@ -3679,6 +3680,23 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) appendStringInfoChar(es->str, '\n'); } + /* As above, show only positive counter values. */ + if (has_prefetches) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "Prefetches:"); + + if (usage->blks_prefetches > 0) + appendStringInfo(es->str, " blocks=%lld", + (long long) usage->blks_prefetches); + + if (usage->blks_prefetch_rounds > 0) + appendStringInfo(es->str, " rounds=%lld", + (long long) usage->blks_prefetch_rounds); + + appendStringInfoChar(es->str, '\n'); + } + if (show_planning) es->indent--; } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 2fa2118f3c2..15fa3211667 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -770,11 +770,15 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, /* * May have to restart scan from this point if a potential conflict is * found. + * + * XXX Should this do index prefetch? Probably not worth it for unique + * constraints, I guess? Otherwise we should calculate prefetch_target + * just like in nodeIndexscan etc. */ retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, 0); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 81f27042bc4..91676ccff95 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -204,8 +204,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, /* Build scan key. */ skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); - /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0); + /* Start an index scan. + * + * XXX Should this do index prefetching? We're looking for a single tuple, + * probably using a PK / UNIQUE index, so does not seem worth it. 
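+	 * (The replica identity is backed by a unique index, so we expect to
+	 * fetch at most one matching tuple anyway.)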
If we + * reconsider this, calclate prefetch_target like in nodeIndexscan. + */ + scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, 0); retry: found = false; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index c383f34c066..0011d9f679c 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->blks_prefetch_rounds += add->blks_prefetch_rounds; + dst->blks_prefetches += add->blks_prefetches; INSTR_TIME_ADD(dst->shared_blk_read_time, add->shared_blk_read_time); INSTR_TIME_ADD(dst->shared_blk_write_time, add->shared_blk_write_time); INSTR_TIME_ADD(dst->local_blk_read_time, add->local_blk_read_time); @@ -259,6 +261,8 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->blks_prefetches += add->blks_prefetches - sub->blks_prefetches; + dst->blks_prefetch_rounds += add->blks_prefetch_rounds - sub->blks_prefetch_rounds; INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, add->shared_blk_read_time, sub->shared_blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f1db35665c8..b6660c10a63 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -43,7 +43,7 @@ #include "storage/predicate.h" #include "utils/memutils.h" #include "utils/rel.h" - +#include "utils/spccache.h" static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, @@ -65,6 +65,8 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + Relation heapRel = node->ss.ss_currentRelation; + bool all_visible; /* * extract necessary information from index scan node @@ -83,16 +85,47 @@ IndexOnlyNext(IndexOnlyScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index only scan is not parallel, or if we're * serially executing an index only scan that was planned to be * parallel. + * + * XXX Maybe we should enable prefetching, but prefetch only pages that + * are not all-visible (but checking that from the index code seems like + * a violation of layering etc). + * + * XXX This might lead to IOS being slower than plain index scan, if the + * table has a lot of pages that need recheck. 
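+		 * (We would pay a visibility-map lookup per TID on top of the
+		 * prefetch itself, which a plain index scan does not do.)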
*/ scandesc = index_beginscan(node->ss.ss_currentRelation, node->ioss_RelationDesc, estate->es_snapshot, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, + prefetch_max); + + /* + * Remember this is index-only scan, because of prefetching. Not the most + * elegant way to pass this info. + */ + scandesc->indexonly = true; node->ioss_ScanDesc = scandesc; @@ -116,7 +149,7 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while ((tid = index_getnext_tid_vm(scandesc, direction, &all_visible)) != NULL) { bool tuple_from_heap = false; @@ -155,8 +188,11 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * XXX Skip if we already know the page is all visible from prefetcher. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!all_visible && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -380,6 +416,18 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) node->ioss_VMBuffer = InvalidBuffer; } + /* Release VM buffer pin from prefetcher, if any. */ + if (indexScanDesc && indexScanDesc->xs_prefetch) + { + IndexPrefetch indexPrefetch = indexScanDesc->xs_prefetch; + + if (indexPrefetch->vmBuffer != InvalidBuffer) + { + ReleaseBuffer(indexPrefetch->vmBuffer); + indexPrefetch->vmBuffer = InvalidBuffer; + } + } + /* * close the index relation (no-op if we didn't open it) */ @@ -646,6 +694,24 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, { EState *estate = node->ss.ps.state; ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen); index_parallelscan_initialize(node->ss.ss_currentRelation, @@ -658,7 +724,8 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_RelationDesc, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -696,6 +763,23 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? 
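+	 * For example, dividing prefetch_max by (number of workers + 1) would
+	 * keep the combined I/O depth close to the tablespace setting.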
+ */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ioss_ScanDesc = @@ -703,7 +787,8 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_RelationDesc, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); node->ioss_ScanDesc->xs_want_itup = true; /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 14b9c00217a..a5f5394ef49 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -43,6 +43,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/spccache.h" /* * When an ordering operator is used, tuples fetched from the index that @@ -85,6 +86,7 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + Relation heapRel = node->ss.ss_currentRelation; /* * extract necessary information from index scan node @@ -103,6 +105,21 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index scan. This + * is essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. @@ -111,7 +128,8 @@ IndexNext(IndexScanState *node) node->iss_RelationDesc, estate->es_snapshot, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + prefetch_max); node->iss_ScanDesc = scandesc; @@ -177,6 +195,7 @@ IndexNextWithReorder(IndexScanState *node) Datum *lastfetched_vals; bool *lastfetched_nulls; int cmp; + Relation heapRel = node->ss.ss_currentRelation; estate = node->ss.ps.state; @@ -198,6 +217,21 @@ IndexNextWithReorder(IndexScanState *node) if (scandesc == NULL) { + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. 
@@ -206,7 +240,8 @@ IndexNextWithReorder(IndexScanState *node) node->iss_RelationDesc, estate->es_snapshot, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + prefetch_max); node->iss_ScanDesc = scandesc; @@ -1662,6 +1697,24 @@ ExecIndexScanInitializeDSM(IndexScanState *node, { EState *estate = node->ss.ps.state; ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen); index_parallelscan_initialize(node->ss.ss_currentRelation, @@ -1674,7 +1727,8 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_RelationDesc, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1710,6 +1764,23 @@ ExecIndexScanInitializeWorker(IndexScanState *node, ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; + Relation heapRel = node->ss.ss_currentRelation; + int prefetch_max; + + /* + * Determine number of heap pages to prefetch for this index. This is + * essentially just effective_io_concurrency for the table (or the + * tablespace it's in). + * + * XXX Should this also look at plan.plan_rows and maybe cap the target + * to that? Pointless to prefetch more than we expect to use. Or maybe + * just reset to that value during prefetching, after reading the next + * index page (or rather after rescan)? + * + * XXX Maybe reduce the value with parallel workers? + */ + prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + node->ss.ps.plan->plan_rows); piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->iss_ScanDesc = @@ -1717,7 +1788,8 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_RelationDesc, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + prefetch_max); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index e11d022827a..8b662d371dd 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6289,9 +6289,10 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + /* XXX Maybe should do prefetching using the default prefetch parameters? 
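+	 * It might help when many dead endpoint tuples force us to visit
+	 * several heap pages, but typically we stop at the first one.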
*/ index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, - 1, 0); + 1, 0, 0); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 80dc8d54066..f6882f644d2 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -17,6 +17,7 @@ #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" +#include "storage/bufmgr.h" #include "storage/lockdefs.h" #include "utils/relcache.h" #include "utils/snapshot.h" @@ -154,7 +155,8 @@ extern void index_insert_cleanup(Relation indexRelation, extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, - int nkeys, int norderbys); + int nkeys, int norderbys, + int prefetch_max); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, int nkeys); @@ -171,9 +173,13 @@ extern void index_parallelscan_initialize(Relation heapRelation, extern void index_parallelrescan(IndexScanDesc scan); extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, int norderbys, - ParallelIndexScanDesc pscan); + ParallelIndexScanDesc pscan, + int prefetch_max); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern ItemPointer index_getnext_tid_vm(IndexScanDesc scan, + ScanDirection direction, + bool *all_visible); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, @@ -232,4 +238,104 @@ extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction); extern void systable_endscan_ordered(SysScanDesc sysscan); +/* + * Cache of recently prefetched blocks, organized as a hash table of + * small LRU caches. Doesn't need to be perfectly accurate, but we + * aim to make false positives/negatives reasonably low. + */ +typedef struct PrefetchCacheEntry { + BlockNumber block; + uint64 request; +} PrefetchCacheEntry; + +/* + * Size of the cache of recently prefetched blocks - shouldn't be too + * small or too large. 1024 seems about right, it covers ~8MB of data. + * It's somewhat arbitrary, there's no particular formula saying it + * should not be higher/lower. + * + * The cache is structured as an array of small LRU caches, so the total + * size needs to be a multiple of LRU size. The LRU should be tiny to + * keep linear search cheap enough. + * + * XXX Maybe we could consider effective_cache_size or something? + */ +#define PREFETCH_LRU_SIZE 8 +#define PREFETCH_LRU_COUNT 128 +#define PREFETCH_CACHE_SIZE (PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT) + +/* + * Used to detect sequential patterns (and disable prefetching). + */ +#define PREFETCH_QUEUE_HISTORY 8 +#define PREFETCH_SEQ_PATTERN_BLOCKS 4 + +typedef struct PrefetchEntry +{ + ItemPointerData tid; + bool all_visible; +} PrefetchEntry; + +typedef struct IndexPrefetchData +{ + /* + * XXX We need to disable this in some cases (e.g. when using index-only + * scans, we don't want to prefetch pages). Or maybe we should prefetch + * only pages that are not all-visible, that'd be even better. + */ + int prefetchTarget; /* how far we should be prefetching */ + int prefetchMaxTarget; /* maximum prefetching distance */ + int prefetchReset; /* reset to this distance on rescan */ + bool prefetchDone; /* did we get all TIDs from the index? 
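+									 * (the queue may still hold TIDs to return)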
*/ + + /* runtime statistics */ + uint64 countAll; /* all prefetch requests */ + uint64 countPrefetch; /* actual prefetches */ + uint64 countSkipSequential; + uint64 countSkipCached; + + /* used when prefetching index-only scans */ + Buffer vmBuffer; + + /* + * Queue of TIDs to prefetch. + * + * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler + * than dynamically adjusting for custom values. + */ + PrefetchEntry queueItems[MAX_IO_CONCURRENCY]; + uint64 queueIndex; /* next TID to prefetch */ + uint64 queueStart; /* first valid TID in queue */ + uint64 queueEnd; /* first invalid (empty) TID in queue */ + + /* + * A couple of last prefetched blocks, used to check for certain access + * pattern and skip prefetching - e.g. for sequential access). + * + * XXX Separate from the main queue, because we only want to compare the + * block numbers, not the whole TID. In sequential access it's likely we + * read many items from each page, and we don't want to check many items + * (as that is much more expensive). + */ + BlockNumber blockItems[PREFETCH_QUEUE_HISTORY]; + uint64 blockIndex; /* index in the block (points to the first + * empty entry)*/ + + /* + * Cache of recently prefetched blocks, organized as a hash table of + * small LRU caches. + */ + uint64 prefetchReqNumber; + PrefetchCacheEntry prefetchCache[PREFETCH_CACHE_SIZE]; + +} IndexPrefetchData; + +#define PREFETCH_QUEUE_INDEX(a) ((a) % (MAX_IO_CONCURRENCY)) +#define PREFETCH_QUEUE_EMPTY(p) ((p)->queueEnd == (p)->queueIndex) +#define PREFETCH_ENABLED(p) ((p) && ((p)->prefetchMaxTarget > 0)) +#define PREFETCH_FULL(p) ((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget) +#define PREFETCH_DONE(p) ((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p))) +#define PREFETCH_ACTIVE(p) (PREFETCH_ENABLED(p) && !(p)->prefetchDone) +#define PREFETCH_BLOCK_INDEX(v) ((v) % PREFETCH_QUEUE_HISTORY) + #endif /* GENAM_H */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d03360eac04..d5903492c6e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -106,6 +106,12 @@ typedef struct IndexFetchTableData Relation rel; } IndexFetchTableData; +/* + * Forward declarations, defined in genam.h. + */ +typedef struct IndexPrefetchData IndexPrefetchData; +typedef struct IndexPrefetchData *IndexPrefetch; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. Some fields are only relevant in @@ -129,6 +135,7 @@ typedef struct IndexScanDescData bool ignore_killed_tuples; /* do not return killed entries */ bool xactStartedInRecovery; /* prevents killing/seeing killed * tuples */ + bool indexonly; /* is this index-only scan? 
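+									 * (set by nodeIndexonlyscan.c so the
+									 * prefetcher can skip all-visible pages)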
*/ /* index access method's private state */ void *opaque; /* access-method-specific info */ @@ -162,6 +169,9 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + /* prefetching state (or NULL if disabled for this scan) */ + IndexPrefetchData *xs_prefetch; + /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index d5d69941c52..f53fb4a1e51 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -33,6 +33,8 @@ typedef struct BufferUsage int64 local_blks_written; /* # of local disk blocks written */ int64 temp_blks_read; /* # of temp blocks read */ int64 temp_blks_written; /* # of temp blocks written */ + int64 blks_prefetch_rounds; /* # of prefetch rounds */ + int64 blks_prefetches; /* # of buffers prefetched */ instr_time shared_blk_read_time; /* time spent reading shared blocks */ instr_time shared_blk_write_time; /* time spent writing shared blocks */ instr_time local_blk_read_time; /* time spent reading local blocks */ -- 2.41.0