From e5232b8a8e90f60f45aadc813c5f024c9ecd8dab Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 27 Jul 2023 15:36:01 +0200 Subject: [PATCH v1] Cache btree scan end page across rescans in the same node If the index is repeatedly scanned for values (e.g. in a nested loop) and if the values that are being looked up are highly correlated, then we can likely reuse the previous index scan's last page as a startpoint for the new scan, instead of going through a relatively expensive index descent. --- src/backend/access/nbtree/nbtpage.c | 18 +++++ src/backend/access/nbtree/nbtree.c | 8 +++ src/backend/access/nbtree/nbtsearch.c | 100 ++++++++++++++++++++++++-- src/include/access/nbtree.h | 21 ++++++ 4 files changed, 141 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d78971bfe8..897b6772fc 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -856,6 +856,24 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) return buf; } +Buffer +_bt_getrecentbuf(Relation rel, BlockNumber blkno, Buffer buf, int access) +{ + Assert(BlockNumberIsValid(blkno)); + Assert(BufferIsValid(buf)); + + if (!ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf)) + { + /* Read an existing block of the relation */ + buf = ReadBuffer(rel, blkno); + } + + _bt_lockbuf(rel, buf, access); + _bt_checkpage(rel, buf); + + return buf; +} + /* * _bt_allocbuf() -- Allocate a new block/page. * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4553aaee53..e4ca2f8ecb 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -376,6 +376,11 @@ btbeginscan(Relation rel, int nkeys, int norderbys) */ so->currTuples = so->markTuples = NULL; + so->rescans = so->rescanSamePage = 0; + so->recentEndPage = InvalidBlockNumber; + so->pageCacheValid = false; + so->pageCache = (char *) palloc(BLCKSZ); + scan->xs_itupdesc = RelationGetDescr(rel); scan->opaque = so; @@ -402,6 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosInvalidate(so->currPos); } + so->rescans++; so->markItemIndex = -1; so->arrayKeyCount = 0; BTScanPosUnpinIfPinned(so->markPos); @@ -467,6 +473,8 @@ btendscan(IndexScanDesc scan) /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); + if (so->pageCache != NULL) + pfree(so->pageCache); /* so->arrayKeyData and so->arrayKeys are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3230b3b894..67e597b56c 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -47,6 +47,45 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +static void _bt_endscanonpage(BTScanOpaque btso, Buffer buf, + BlockNumber endPage, Page page); + +#define RescanMayHitSamePage(so) (((float) ((so)->rescanSamePage) / (float) ((so)->rescans)) >= 0.5) + +static void +_bt_endscanonpage(BTScanOpaque btso, Buffer buf, BlockNumber endPage, Page page) +{ + BlockNumber prevEndPage = btso->recentEndPage; + + /* + * We have often (>50%) hit the page the previous scan ended on, so + * cache the current (last) page of the scan for future use. + */ + if (RescanMayHitSamePage(btso)) + { + /* + * If we have a valid cache, and the cache contains this page, then + * we don't have anything to do. + */ + if (prevEndPage == endPage && btso->pageCacheValid) + { + /* do nothing */ + } + else + { + memcpy(btso->pageCache, page, BLCKSZ); + btso->pageCacheValid = true; + btso->recentBuffer = buf; + btso->recentEndPage = endPage; + } + } + else if (prevEndPage != endPage) + { + btso->recentEndPage = endPage; + btso->pageCacheValid = false; + } +} + /* * _bt_drop_lock_and_maybe_pin() @@ -872,7 +911,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Buffer buf; + Buffer buf = InvalidBuffer; BTStack stack; OffsetNumber offnum; StrategyNumber strat; @@ -1371,13 +1410,59 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) inskey.keysz = keysCount; /* - * Use the manufactured insertion scan key to descend the tree and - * position ourselves on the target leaf page. + * If we've restarted the scan through amrescan, it is quite possible + * that a previous scan ended on the same page we're trying to find. + * If this happened repeatedly, we cache the last page of the scan, + * so that we may be able to ignore the penalty of traversing the tree + * from the top. + * + * XXX: Maybe this might not be concurrency-safe? Haven't thought about it + * quite yet. */ - stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot); + if (RescanMayHitSamePage(so) && so->pageCacheValid) + { + Page page = so->pageCache; + BTPageOpaque opaque = BTPageGetOpaque(page); + + /* cached page is a leaf page, and is not empty */ + Assert(P_ISLEAF(opaque) && PageGetMaxOffsetNumber(page) != 0); + + /* + * If the search key doesn't fit within the min/max of this page, + * we continue with normal index descent. + */ + if (_bt_compare(rel, &inskey, page, P_HIKEY) >= 0) + goto nocache; + if (_bt_compare(rel, &inskey, page, PageGetMaxOffsetNumber(page)) <= 0) + goto nocache; + + buf = _bt_getrecentbuf(rel, so->recentEndPage, so->recentBuffer, + BT_READ); - /* don't need to keep the stack around... */ - _bt_freestack(stack); + /* + * It is possible the page has split in the meantime, so we may have + * to move right + */ + buf = _bt_moveright(rel, NULL, &inskey, buf, false, NULL, BT_READ, + scan->xs_snapshot); + so->rescanSamePage += 1; + } + +nocache: + if (!BufferIsValid(buf)) + { + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. + */ + stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (buf == so->recentBuffer) + so->rescanSamePage += 1; + } if (!BufferIsValid(buf)) { @@ -1792,6 +1877,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; } + if (!continuescan) + _bt_endscanonpage(so, so->currPos.buf, so->currPos.currPage, page); + return (so->currPos.firstItem <= so->currPos.lastItem); } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8891fa7973..b76d433725 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1062,6 +1062,26 @@ typedef struct BTScanOpaqueData char *currTuples; /* tuple storage for currPos */ char *markTuples; /* tuple storage for markPos */ + /* + * If we're on the outer side of a loop join, parameterized joins may + * have some correlation, i.e. repeated close values which fit on + * nearby pages. By tracking at which leaf page we start and end our + * scans, we can detect this case and (in some cases) reduce buffer + * accesses by a huge margin. + * + * TODO: Caching this page locally is OK, because this is inside a query + * context and thus bound to the lifetime and snapshot of the query. + * Accessing an updated version of the page might not be, so that needs + * checking. + */ + int rescans; /* number of rescans */ + int rescanSamePage; /* number of rescans that started on the previous scan's end location */ + + BlockNumber recentEndPage; /* scan end page of previous scan */ + Buffer recentBuffer; /* buffer of recentEndPage. May not match. */ + bool pageCacheValid; /* is the cached page valid? */ + char *pageCache; /* copy of the previous scan's last page, if valid */ + /* * If the marked position is on the same page as current position, we * don't use markPos, but just keep the marked itemIndex in markItemIndex @@ -1207,6 +1227,7 @@ extern void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); +extern Buffer _bt_getrecentbuf(Relation rel, BlockNumber blkno, Buffer buffer, int access); extern Buffer _bt_allocbuf(Relation rel, Relation heaprel); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access); -- 2.40.1