From 3ba126559c69a31f6f0c48c85de2c892cebac4f3 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Fri, 24 Nov 2023 12:31:43 +0100
Subject: [PATCH v20231124 7/7] 20231016-reworked

Rework the way index prefetching avoids re-prefetching blocks it
requested recently. Instead of probing the kernel page cache before
each prefetch (the smgrcached / mdcached / FileCached path, which
relied on preadv2 and broke prefetching from recovery), remember
recently prefetched blocks directly in indexam.c, in a small cache
organized as an array of tiny LRUs.

---
 src/backend/access/index/indexam.c  | 195 ++++++++++++++++++++++++----
 src/backend/storage/buffer/bufmgr.c |  12 --
 src/backend/storage/file/fd.c       |  40 ------
 src/backend/storage/smgr/md.c       |  27 ----
 src/backend/storage/smgr/smgr.c     |  19 ---
 src/include/access/genam.h          |  25 +++-
 src/include/storage/fd.h            |   1 -
 src/include/storage/md.h            |   2 -
 src/include/storage/smgr.h          |   2 -
 9 files changed, 195 insertions(+), 128 deletions(-)
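
A self-contained sketch of the recency cache implemented below, for anyone
who wants to play with the idea outside the tree. The names, sizes, and the
hash function are simplified stand-ins (the patch itself uses hash_uint32()
and the PREFETCH_* constants from genam.h); the lookup/eviction logic
mirrors index_prefetch_add_cache():

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LRU_COUNT   128                         /* number of tiny LRUs */
    #define LRU_SIZE    8                           /* entries per LRU */
    #define CACHE_SIZE  (LRU_COUNT * LRU_SIZE)      /* recency horizon */

    typedef struct CacheEntry
    {
        uint32_t    block;      /* block number */
        uint64_t    request;    /* request number, 0 = unused slot */
    } CacheEntry;

    static CacheEntry cache[CACHE_SIZE];
    static uint64_t reqno = 0;

    /* trivial stand-in for PostgreSQL's hash_uint32() */
    static uint32_t
    hash_u32(uint32_t x)
    {
        x ^= x >> 16;
        x *= 0x85ebca6b;
        x ^= x >> 13;
        return x;
    }

    /* true if 'block' was requested within the last CACHE_SIZE requests */
    static bool
    recently_prefetched(uint32_t block)
    {
        int         lru = hash_u32(block) % LRU_COUNT;
        uint64_t    oldest_req = UINT64_MAX;
        int         oldest_idx = -1;

        for (int i = 0; i < LRU_SIZE; i++)
        {
            CacheEntry *e = &cache[lru * LRU_SIZE + i];

            /* remember the oldest (or first empty, request == 0) slot */
            if (e->request < oldest_req)
            {
                oldest_req = e->request;
                oldest_idx = i;
            }

            if (e->request != 0 && e->block == block)
            {
                /* hit: recent iff within the last CACHE_SIZE requests */
                bool        recent = (e->request + CACHE_SIZE) >= reqno;

                e->request = ++reqno;
                return recent;
            }
        }

        /* miss: overwrite the oldest/empty slot in this LRU */
        cache[lru * LRU_SIZE + oldest_idx].block = block;
        cache[lru * LRU_SIZE + oldest_idx].request = ++reqno;
        return false;
    }

    int
    main(void)
    {
        printf("%d\n", recently_prefetched(42));    /* 0: first sighting */
        printf("%d\n", recently_prefetched(42));    /* 1: seen just now */
        return 0;
    }

The point of the bucketed layout is that both the membership test and the
expiration are bounded by LRU_SIZE, with no freelist or global ordering to
maintain; "expiring" an entry is just losing the eviction race within its
own bucket.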
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 45c97aff5ef..82e3266bcc0 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -1203,6 +1203,148 @@ index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
 	return true;
 }
 
+/*
+ * index_prefetch_add_cache
+ *		Add a block to the cache, check if it was recently prefetched.
+ *
+ * We don't want to prefetch blocks that we already prefetched recently. It's
+ * cheap but not free, and the overhead may have a measurable impact.
+ *
+ * This check needs to be very cheap, even with fairly large caches (hundreds
+ * of entries, see PREFETCH_CACHE_SIZE).
+ *
+ * A simple queue would allow expiring the requests, but checking whether it
+ * contains a particular block would be expensive (linear search). Another
+ * option would be a simple hash table, which has fast lookups but does not
+ * allow expiring entries cheaply.
+ *
+ * The cache does not need to be perfect; we can accept false
+ * positives/negatives, as long as the rate is reasonably low. We also need
+ * to expire entries, so that only "recent" requests are remembered.
+ *
+ * We use a hybrid cache that is organized as many small LRU caches. Each
+ * block is mapped to a particular LRU by hashing (so it's a bit like a
+ * hash table). The LRU caches are tiny (e.g. 8 entries), and the expiration
+ * happens at the level of a single LRU (by tracking only the 8 most recent
+ * requests).
+ *
+ * This allows quick searches and expiration, but with false negatives (when
+ * a particular LRU has too many collisions, we may evict entries that are
+ * more recent than entries in some other LRU).
+ *
+ * For example, imagine 128 LRU caches, each with 8 entries - that's 1024
+ * prefetch requests in total (these are the default parameters).
+ *
+ * The recency is determined using a prefetch counter, incremented every
+ * time we end up prefetching a block. The counter is uint64, so it should
+ * not wrap (with 8kB blocks that's ~128 zebibytes, which would take roughly
+ * 5 million years to prefetch at 1GB/s).
+ *
+ * To check if a block was prefetched recently, we calculate hash(block),
+ * and then linearly search the tiny LRU for an entry with the same block
+ * number and a request number at most PREFETCH_CACHE_SIZE in the past.
+ *
+ * At the same time, we either update the entry (for the queried block) if
+ * found, or replace the oldest/empty entry.
+ *
+ * If the block was not recently prefetched (i.e. we want to prefetch it),
+ * we increment the counter.
+ *
+ * Returns true if the block was recently prefetched (and thus we don't
+ * need to prefetch it again), or false (should do a prefetch).
+ *
+ * XXX It's a bit confusing that these return values are inverted compared
+ * to what index_prefetch_is_sequential does.
+ */
+static bool
+index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
+{
+	PrefetchCacheEntry *entry;
+
+	/* map the block number to the LRU */
+	int			lru = hash_uint32(block) % PREFETCH_LRU_COUNT;
+
+	/* age/index of the oldest entry in the LRU, to maybe use */
+	uint64		oldestRequest = PG_UINT64_MAX;
+	int			oldestIndex = -1;
+
+	/*
+	 * First add the block to the (tiny) top-level LRU cache and see if it's
+	 * part of a sequential pattern. In this case we just ignore the block
+	 * and don't prefetch it - we expect read-ahead to do a better job.
+	 *
+	 * XXX Maybe we should still add the block to the hybrid cache, in case
+	 * we happen to access it later? That might help if we first scan a lot
+	 * of the table sequentially, and then randomly. Not sure that's very
+	 * likely with index access, though.
+	 */
+	if (index_prefetch_is_sequential(prefetch, block))
+	{
+		prefetch->countSkipSequential++;
+		return true;
+	}
+
+	/*
+	 * See if we recently prefetched this block - we simply scan the LRU
+	 * linearly. While doing that, we also track the oldest entry, so that
+	 * we know where to put the block if we don't find a matching entry.
+	 */
+	for (int i = 0; i < PREFETCH_LRU_SIZE; i++)
+	{
+		entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + i];
+
+		/* Is this the oldest prefetch request in this LRU? */
+		if (entry->request < oldestRequest)
+		{
+			oldestRequest = entry->request;
+			oldestIndex = i;
+		}
+
+		/*
+		 * If the entry is unused (identified by request being set to 0),
+		 * it can't match the block - skip to the next entry. Notice the
+		 * field is uint64, so an empty entry is guaranteed to be the
+		 * oldest one.
+		 */
+		if (entry->request == 0)
+			continue;
+
+		/* Is this entry for the same block as the current request? */
+		if (entry->block == block)
+		{
+			bool		prefetched;
+
+			/*
+			 * Is the old request sufficiently recent? If yes, we treat the
+			 * block as already prefetched.
+			 *
+			 * XXX We add the cache size to the entry's request number
+			 * (instead of subtracting it from the current one) in order
+			 * not to have issues with uint64 underflows.
+			 */
+			prefetched = ((entry->request + PREFETCH_CACHE_SIZE) >= prefetch->prefetchReqNumber);
+
+			/* Update the request number. */
+			entry->request = ++prefetch->prefetchReqNumber;
+
+			prefetch->countSkipCached += (prefetched) ? 1 : 0;
+
+			return prefetched;
+		}
+	}
+
+	/*
+	 * We didn't find the block in the LRU, so store it either in an empty
+	 * entry, or in the "oldest" prefetch request in this LRU.
+	 */
+	Assert((oldestIndex >= 0) && (oldestIndex < PREFETCH_LRU_SIZE));
+
+	/* FIXME do a nice macro */
+	entry = &prefetch->prefetchCache[lru * PREFETCH_LRU_SIZE + oldestIndex];
+
+	entry->block = block;
+	entry->request = ++prefetch->prefetchReqNumber;
+
+	/* not in the prefetch cache */
+	return false;
+}
+
 /*
  * index_prefetch
  *		Prefetch the TID, unless it's sequential or recently prefetched.
@@ -1237,13 +1379,35 @@ index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
  *
  * XXX Maybe we could/should also prefetch the next index block, e.g. stored
  * in BTScanPosData.nextPage.
+ *
+ * XXX Could we tune the cache size based on execution statistics? We have
+ * a cache of limited size (PREFETCH_CACHE_SIZE = 1024 by default), but
+ * how do we know it's the right size? Ideally, we'd have a cache large
+ * enough to track actually cached blocks. If the OS caches 10240 pages,
+ * then we may do 90% of prefetch requests unnecessarily. Or maybe there's
+ * a lot of contention, blocks are evicted quickly, and 90% of the blocks
+ * in the cache are not actually cached anymore?
+ * But we do have a concept of a sequential request ID
+ * (PrefetchCacheEntry->request), which gives us information about the "age"
+ * of the last prefetch. Now it's used only when evicting entries (to keep
+ * the more recent one), but maybe we could also use it when deciding if the
+ * page is cached. Right now any block that's in the cache is considered
+ * cached and not prefetched, but maybe we could have a "max age", and tune
+ * it based on feedback from reading the blocks later. For example, if we
+ * find the block in cache and decide not to prefetch it, but then later
+ * find we have to do I/O, it means our cache is too large. And we could
+ * "reduce" the maximum age (measured from the current prefetchReqNumber
+ * value), so that only more recent blocks would be considered cached. Not
+ * sure about the opposite direction, where we decide to prefetch a block -
+ * AFAIK we don't have a way to determine if I/O was needed or not in this
+ * case (so we can't increase the max age). But maybe we could do that
+ * somehow speculatively, i.e. increase the value once in a while, and see
+ * what happens.
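+ *
+ * For illustration, that feedback might look something like this (the
+ * maxAge field and the "io_was_needed" signal are hypothetical; neither
+ * exists in this patch):
+ *
+ *		cached = (entry->request + prefetch->maxAge) >= prefetch->prefetchReqNumber;
+ *
+ *		if (cached && io_was_needed)
+ *			prefetch->maxAge = Max(prefetch->maxAge / 2, PREFETCH_LRU_SIZE);
+ *
+ * i.e. shrink the window whenever the cache says "cached" but we still had
+ * to do I/O, and occasionally grow it back speculatively.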
  */
 static void
 index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible,
 			   bool *all_visible)
 {
 	IndexPrefetch prefetch = scan->xs_prefetch;
 	BlockNumber block;
-	PrefetchBufferResult result;
 
 	/* by default not all visible (or we didn't check) */
 	*all_visible = false;
@@ -1295,28 +1459,6 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool
 		return;
 	}
 
-	/*
-	 * Now also check if the blocks to prefetch are in a sequential pattern.
-	 * We do it here because we need to do this check before PrefetchBuffer
-	 * initiates the prefetch, and we it can't do this easily (as it doesn't
-	 * know in what context it's called in). So we do it here.
-	 *
-	 * We use a tiny LRU cache and see if the blocks follow a sequential
-	 * pattern - if it's the same as the previous block, or if the last
-	 * couple blocks are a continguous sequence, we don't prefetch it.
-	 */
-	if (index_prefetch_is_sequential(prefetch, block))
-	{
-		prefetch->countSkipSequential++;
-		return;
-	}
-
-	/* XXX shouldn't this be before the VM / sequenqial check? */
-	prefetch->countAll++;
-
-	/* OK, try prefetching the block. */
-	result = PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
-
 	/*
 	 * Do not prefetch the same block over and over again,
 	 *
@@ -1324,11 +1466,15 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool
 	 * to a sequence ID). It's not expensive (the block is in page cache
 	 * already, so no I/O), but it's not free either.
 	 */
-	if (result.initiated_io)
+	if (!index_prefetch_add_cache(prefetch, block))
 	{
 		prefetch->countPrefetch++;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
 		pgBufferUsage.blks_prefetches++;
 	}
+
+	prefetch->countAll++;
 }
 
 /* ----------------
@@ -1462,5 +1608,6 @@ index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_vi
 		Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
 	}
 
+	/* Return the TID of the tuple we found. */
 	return &scan->xs_heaptid;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 74da9c1376b..f7c67d504cd 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -567,20 +567,8 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
 		/*
 		 * Try to initiate an asynchronous read. This returns false in
 		 * recovery if the relation file doesn't exist.
-		 *
-		 * But first check if the block is already present in page cache.
-		 *
-		 * FIXME This breaks prefetch from recovery. Apparently that expects
-		 * the prefetch to initiate the I/O, otherwise it fails with. But
-		 * XLogPrefetcherNextBlock checks initiated_io, and may fail with:
-		 *
-		 * FATAL: could not prefetch relation 1663/16384/16401 block 83758
-		 *
-		 * So maybe just fake the initiated_io=true in this case? Or not do
-		 * this when in recovery.
 		 */
 		if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
-			!smgrcached(smgr_reln, forkNum, blockNum) &&
 			smgrprefetch(smgr_reln, forkNum, blockNum))
 		{
 			result.initiated_io = true;
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 2c51a3376f3..f691ba09321 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -78,7 +78,6 @@
 #include <sys/resource.h>		/* for getrlimit */
 #include <sys/stat.h>
 #include <sys/types.h>
-#include <sys/uio.h>
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -2084,45 +2083,6 @@ retry:
 #endif
 }
 
-/*
- * FileCached - check if a given range of the file is in page cache.
- *
- * XXX relies on preadv2, probably needs to be checked by configure
- */
-bool
-FileCached(File file, off_t offset, off_t amount, uint32 wait_event_info)
-{
-#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
-	int			returnCode;
-	size_t		readlen;
-	char		buffer[BLCKSZ];
-	struct iovec iov[1];
-
-	Assert(FileIsValid(file));
-
-	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
-			   file, VfdCache[file].fileName,
-			   (int64) offset, (int64) amount));
-
-	returnCode = FileAccess(file);
-	if (returnCode < 0)
-		return false;
-
-	/* XXX not sure if this ensures proper buffer alignment */
-	iov[0].iov_base = &buffer;
-	iov[0].iov_len = amount;
-
-	pgstat_report_wait_start(wait_event_info);
-	readlen = preadv2(VfdCache[file].fd, iov, 1, offset, RWF_NOWAIT);
-	pgstat_report_wait_end();
-
-	return (readlen == amount);
-#else
-	Assert(FileIsValid(file));
-	return false;
-#endif
-}
-
 void
 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 {
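
Context for reviewers: the FileCached() probe deleted above relied on the
Linux-only preadv2(..., RWF_NOWAIT) call, which only reads data that is
already in the page cache and otherwise fails with EAGAIN. A minimal
standalone illustration of the trick (hypothetical helper name, no error
handling, caller must pass len <= BLCKSZ):

    #define _GNU_SOURCE
    #include <stdbool.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    #define BLCKSZ 8192

    static bool
    block_in_page_cache(int fd, off_t offset, size_t len)
    {
        char        buffer[BLCKSZ];
        struct iovec iov = {.iov_base = buffer, .iov_len = len};
        ssize_t     nread = preadv2(fd, &iov, 1, offset, RWF_NOWAIT);

        /* a full, non-blocking read means the whole range was cached */
        return (nread == (ssize_t) len);
    }

Besides the portability problem (the XXX above already notes preadv2 would
need a configure check), the FIXME in bufmgr.c explains that recovery
expects the prefetch to actually initiate the I/O, which is why this patch
drops the probe instead of refining it.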
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 16a7c424683..fdecbad1709 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -736,33 +736,6 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	return true;
 }
 
-/*
- * mdcached() -- Check if the whole block is already available in page cache.
- */
-bool
-mdcached(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
-{
-#ifdef USE_PREFETCH
-	off_t		seekpos;
-	MdfdVec    *v;
-
-	Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
-
-	v = _mdfd_getseg(reln, forknum, blocknum, false,
-					 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
-	if (v == NULL)
-		return false;
-
-	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
-	(void) FileCached(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
-#endif							/* USE_PREFETCH */
-
-	return true;
-}
-
 /*
  *	mdread() -- Read the specified block from a relation.
  */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 209518aae01..5d0f3d515c3 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -55,8 +55,6 @@ typedef struct f_smgr
 								 BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum);
-	bool		(*smgr_cached) (SMgrRelation reln, ForkNumber forknum,
-								BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
 							  BlockNumber blocknum, void *buffer);
 	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
@@ -82,7 +80,6 @@ static const f_smgr smgrsw[] = {
 		.smgr_extend = mdextend,
 		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
-		.smgr_cached = mdcached,
 		.smgr_read = mdread,
 		.smgr_write = mdwrite,
 		.smgr_writeback = mdwriteback,
@@ -553,22 +550,6 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
 }
 
-/*
- * smgrcached() -- Check if the specified block is already in page cache.
- */
-bool
-smgrcached(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
-{
-	/*
-	 * In recovery we consider the blocks not cached, so that PrefetchSharedBuffer
-	 * initiates the I/O. XLogPrefetcherNextBlock relies on that.
-	 */
-	if (InRecovery)
-		return false;
-
-	return smgrsw[reln->smgr_which].smgr_cached(reln, forknum, blocknum);
-}
-
 /*
  * smgrread() -- read a particular block from a relation into the supplied
  *				 buffer.
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index db1dc9c44b6..b5dbe971770 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -247,7 +247,23 @@ typedef struct PrefetchCacheEntry {
 } PrefetchCacheEntry;
 
 /*
- * Used to detect sequential patterns (to not prefetch in this case).
+ * Size of the cache of recently prefetched blocks - shouldn't be too
+ * small or too large. 1024 seems about right: it covers ~8MB of data.
+ * It's somewhat arbitrary; there's no particular formula saying it
+ * should not be higher/lower.
+ *
+ * The cache is structured as an array of small LRU caches, so the total
+ * size needs to be a multiple of the LRU size. The LRU should be tiny to
+ * keep linear searches cheap enough.
+ *
+ * XXX Maybe we could consider effective_cache_size or something?
+ */
+#define PREFETCH_LRU_SIZE		8
+#define PREFETCH_LRU_COUNT		128
+#define PREFETCH_CACHE_SIZE		(PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT)
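+
+/*
+ * For example (illustrative arithmetic only): with the defaults above, the
+ * cache remembers the last 8 * 128 = 1024 prefetched block numbers, i.e.
+ * roughly 1024 * 8kB = 8MB of data with the default block size, and each
+ * lookup/eviction scans at most PREFETCH_LRU_SIZE entries.
+ */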
+
+/*
+ * Used to detect sequential patterns (and disable prefetching).
 */
 #define PREFETCH_QUEUE_HISTORY 8
 #define PREFETCH_SEQ_PATTERN_BLOCKS 4
@@ -303,6 +319,13 @@ typedef struct IndexPrefetchData
 	uint64		blockIndex;		/* index in the block (points to the first
 								 * empty entry) */
 
+	/*
+	 * Cache of recently prefetched blocks, organized as a hash table of
+	 * small LRU caches.
+	 */
+	uint64		prefetchReqNumber;
+	PrefetchCacheEntry prefetchCache[PREFETCH_CACHE_SIZE];
+
 } IndexPrefetchData;
 
 #define PREFETCH_QUEUE_INDEX(a)	((a) % (MAX_IO_CONCURRENCY))
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index c96a24dddd3..d9d5d9da5fb 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -105,7 +105,6 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info);
-extern bool FileCached(File file, off_t offset, off_t amount, uint32 wait_event_info);
 extern int	FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
 extern int	FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
 extern int	FileSync(File file, uint32 wait_event_info);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 8dc1382471e..941879ee6a8 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -32,8 +32,6 @@ extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum);
-extern bool mdcached(SMgrRelation reln, ForkNumber forknum,
-					 BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				   void *buffer);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 7fbed2a4291..a9a179aabac 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -96,8 +96,6 @@ extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum);
-extern bool smgrcached(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum, void *buffer);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
-- 
2.42.0