From 51baee1800b14313b9d03daed238d81ac48d5c8e Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 16 Nov 2024 15:58:41 -0500 Subject: [PATCH v30 2/4] Lower nbtree skip array maintenance overhead. Add an optimization that fixes regressions in index scans that are nominally eligible to use skip scan, but can never actually benefit from skipping. These are cases where a leading prefix column contains many distinct values -- especially when the number of values approaches the total number of index tuples, where skipping can never be profitable. The optimization is activated dynamically, as a fallback strategy. It works by determining a prefix of leading index columns whose scan keys (often skip array scan keys) are guaranteed to be satisfied by every possible index tuple on a given page. _bt_readpage is then able to start comparisons at the first scan key that might not be satisfied. This necessitates making _bt_readpage temporarily cease maintaining the scan's arrays. _bt_checkkeys will treat the scan's keys as if they were not marked as required during preprocessing. This process relies on the non-required SAOP array logic in _bt_advance_array_keys that was added to Postgres 17 by commit 5bf748b8. The new optimization does not affect array primitive scan scheduling. It is similar to the precheck optimization added by Postgres 17 commit e0b1ee17dc, though it is only used during nbtree scans with skip arrays. It can be applied during scans that were never eligible for the precheck optimization. As a result, many scans that cannot benefit from skipping will still benefit from using skip arrays (skip arrays indirectly enable the use of the optimization introduced by this commit). Skip scan's approach of adding skip arrays during preprocessing and then fixing (or significantly ameliorating) the resulting regressions seen in unsympathetic cases is enabled by the optimization added by this commit (and by the "look ahead" optimization introduced by commit 5bf748b8). This allows the planner to avoid generating distinct, competing index paths (one path for skip scan, another for an equivalent traditional full index scan). The overall effect is to make scan runtime close to optimal, even when the planner works off an incorrect cardinality estimate. Scans will also perform well given a skipped column with data skew: individual groups of pages with many distinct values in respect of a skipped column can be read about as efficiently as before, without having to give up on skipping over other provably-irrelevant leaf pages. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Reviewed-By: Masahiro Ikeda Discussion: https://postgr.es/m/CAH2-Wz=Y93jf5WjoOsN=xvqpMjRy-bxCE037bVFi-EasrpeUJA@mail.gmail.com --- src/include/access/nbtree.h | 9 +- src/backend/access/nbtree/nbtpreprocesskeys.c | 2 + src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/nbtree/nbtsearch.c | 65 ++- src/backend/access/nbtree/nbtutils.c | 483 +++++++++++++++--- 5 files changed, 444 insertions(+), 116 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index b86bf7bf3..ec0f095ba 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1059,6 +1059,7 @@ typedef struct BTScanOpaqueData /* workspace for SK_SEARCHARRAY support */ int numArrayKeys; /* number of equality-type array keys */ + bool rowCompare; /* At least one RowCompare key in keyData[]? */ bool needPrimScan; /* New prim scan to continue in current dir? */ bool scanBehind; /* Check scan not still behind on next page? */ bool oppositeDirCheck; /* scanBehind opposite-scan-dir check? */ @@ -1105,6 +1106,8 @@ typedef struct BTReadPageState IndexTuple finaltup; /* Needed by scans with array keys */ Page page; /* Page being read */ bool firstpage; /* page is first for primitive scan? */ + bool forcenonrequired; /* treat all scan keys as nonrequired? */ + int ikey; /* start comparisons from ikey'th scan key */ /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ OffsetNumber offnum; /* current tuple's page offset number */ @@ -1114,10 +1117,9 @@ typedef struct BTReadPageState bool continuescan; /* Terminate ongoing (primitive) index scan? */ /* - * Input and output parameters, set and unset by both _bt_readpage and - * _bt_checkkeys to manage precheck optimizations + * Input and output parameter, set and unset by both _bt_readpage and + * _bt_checkkeys for "key required in opposite direction" optimization */ - bool prechecked; /* precheck set continuescan to 'true'? */ bool firstmatch; /* at least one match so far? */ /* @@ -1327,6 +1329,7 @@ extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arra IndexTuple tuple, int tupnatts); extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); +extern void _bt_skip_ikeyprefix(IndexScanDesc scan, BTReadPageState *pstate); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 7bd121f83..963fce8a9 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -466,6 +466,8 @@ _bt_preprocess_keys(IndexScanDesc scan) if (numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); + so->rowCompare = true; + /* * We don't support RowCompare using equality; such a qual would * mess up the numberOfEqualCols tracking. diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 815cbcfb7..d714fc4ad 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -349,6 +349,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) else so->keyData = NULL; + so->rowCompare = false; so->needPrimScan = false; so->scanBehind = false; so->oppositeDirCheck = false; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 2524cb23a..3491dac78 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1643,47 +1643,15 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.finaltup = NULL; pstate.page = page; pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.ikey = 0; pstate.offnum = InvalidOffsetNumber; pstate.skip = InvalidOffsetNumber; pstate.continuescan = true; /* default assumption */ - pstate.prechecked = false; pstate.firstmatch = false; pstate.rechecks = 0; pstate.targetdistance = 0; - /* - * Prechecking the value of the continuescan flag for the last item on the - * page (for backwards scan it will be the first item on a page). If we - * observe it to be true, then it should be true for all other items. This - * allows us to do significant optimizations in the _bt_checkkeys() - * function for all the items on the page. - * - * With the forward scan, we do this check for the last item on the page - * instead of the high key. It's relatively likely that the most - * significant column in the high key will be different from the - * corresponding value from the last item on the page. So checking with - * the last item on the page would give a more precise answer. - * - * We skip this for the first page read by each (primitive) scan, to avoid - * slowing down point queries. They typically don't stand to gain much - * when the optimization can be applied, and are more likely to notice the - * overhead of the precheck. Also avoid it during scans with array keys, - * which might be using skip scan (XXX fixed in next commit). - */ - if (!pstate.firstpage && !arrayKeys && minoff < maxoff) - { - ItemId iid; - IndexTuple itup; - - iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff); - itup = (IndexTuple) PageGetItem(page, iid); - - /* Call with arrayKeys=false to avoid undesirable side-effects */ - _bt_checkkeys(scan, &pstate, false, itup, indnatts); - pstate.prechecked = pstate.continuescan; - pstate.continuescan = true; /* reset */ - } - if (ScanDirectionIsForward(dir)) { /* SK_SEARCHARRAY forward scans must provide high key up front */ @@ -1711,6 +1679,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->scanBehind = so->oppositeDirCheck = false; /* reset */ } + /* + * Consider pstate.ikey optimization once the ongoing primitive index + * scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_skip_ikeyprefix(scan, &pstate); + /* load items[] in ascending order */ itemIndex = 0; @@ -1747,6 +1722,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { Assert(!passes_quals && pstate.continuescan); Assert(offnum < pstate.skip); + Assert(!pstate.forcenonrequired); offnum = pstate.skip; pstate.skip = InvalidOffsetNumber; @@ -1810,8 +1786,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, IndexTuple itup = (IndexTuple) PageGetItem(page, iid); int truncatt; + /* stop treating required keys as nonrequired */ + pstate.forcenonrequired = false; + pstate.firstmatch = false; + pstate.ikey = 0; + truncatt = BTreeTupleGetNAtts(itup, rel); - pstate.prechecked = false; /* precheck didn't cover HIKEY */ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } @@ -1850,6 +1830,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->scanBehind = so->oppositeDirCheck = false; /* reset */ } + /* + * Consider pstate.ikey optimization once the ongoing primitive index + * scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_skip_ikeyprefix(scan, &pstate); + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1889,6 +1876,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, Assert(!BTreeTupleIsPivot(itup)); pstate.offnum = offnum; + if (offnum == minoff) + { + /* stop treating required keys as nonrequired */ + pstate.forcenonrequired = false; + pstate.firstmatch = false; + pstate.ikey = 0; + } passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); @@ -1900,6 +1894,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { Assert(!passes_quals && pstate.continuescan); Assert(offnum > pstate.skip); + Assert(!pstate.forcenonrequired); offnum = pstate.skip; pstate.skip = InvalidOffsetNumber; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 62530702f..b46bfe492 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -57,11 +57,11 @@ static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, - bool *continuescan, int *ikey); + bool advancenonrequired, bool forcenonrequired, + bool firstmatch, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); + ScanDirection dir, bool forcenonrequired, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -1421,9 +1421,10 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) * postcondition's <= operator with a >=. In other words, just swap the * precondition with the postcondition.) * - * We also deal with "advancing" non-required arrays here. Callers whose - * sktrig scan key is non-required specify sktrig_required=false. These calls - * are the only exception to the general rule about always advancing the + * We also deal with "advancing" non-required arrays here (or arrays that are + * treated as non-required for the duration of a _bt_readpage call). Callers + * whose sktrig scan key is non-required specify sktrig_required=false. These + * calls are the only exception to the general rule about always advancing the * required array keys (the scan may not even have a required array). These * callers should just pass a NULL pstate (since there is never any question * of stopping the scan). No call to _bt_tuple_before_array_skeys is required @@ -1480,8 +1481,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ pstate->firstmatch = false; - /* Shouldn't have to invalidate 'prechecked', though */ - Assert(!pstate->prechecked); + /* Shouldn't have to invalidate forcenonrequired state */ + Assert(!pstate->forcenonrequired && pstate->ikey == 0); /* * Once we return we'll have a new set of required array keys, so @@ -1490,6 +1491,26 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, pstate->rechecks = 0; pstate->targetdistance = 0; } + else if (sktrig < so->numberOfKeys - 1 && + !(so->keyData[so->numberOfKeys - 1].sk_flags & SK_SEARCHARRAY)) + { + int least_sign_ikey = so->numberOfKeys - 1; + bool continuescan; + + /* + * Optimization: perform a precheck of the least significant key + * during !sktrig_required calls when it isn't already our sktrig + * (provided the precheck key is not itself an array). + * + * When the precheck works out we'll avoid an expensive binary search + * of sktrig's array (plus any other arrays before least_sign_ikey). + */ + Assert(so->keyData[sktrig].sk_flags & SK_SEARCHARRAY); + if (!_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + false, false, false, + &continuescan, &least_sign_ikey)) + return false; + } Assert(_bt_verify_keys_with_arraykeys(scan)); @@ -1533,8 +1554,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) { - Assert(sktrig_required); - required = true; if (cur->sk_attno > tupnatts) @@ -1668,7 +1687,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } else { - Assert(sktrig_required && required); + Assert(required); /* * This is a required non-array equality strategy scan key, which @@ -1710,7 +1729,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * be eliminated by _bt_preprocess_keys. It won't matter if some of * our "true" array scan keys (or even all of them) are non-required. */ - if (required && + if (sktrig_required && required && ((ScanDirectionIsForward(dir) && result > 0) || (ScanDirectionIsBackward(dir) && result < 0))) beyond_end_advance = true; @@ -1725,7 +1744,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * array scan keys are considered interesting.) */ all_satisfied = false; - if (required) + if (sktrig_required && required) all_required_satisfied = false; else { @@ -1785,6 +1804,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * of any required scan key). All that matters is whether caller's tuple * satisfies the new qual, so it's safe to just skip the _bt_check_compare * recheck when we've already determined that it can only return 'false'. + * + * Note: In practice most scan keys are marked required by preprocessing, + * if necessary by generating a preceding skip array. We nevertheless + * often handle array keys marked required as if they were nonrequired. + * This behavior is requested by our _bt_check_compare caller, though only + * when it is passed "forcenonrequired=true" by _bt_checkkeys. */ if ((sktrig_required && all_required_satisfied) || (!sktrig_required && all_satisfied)) @@ -1796,7 +1821,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* Recheck _bt_check_compare on behalf of caller */ if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, + false, !sktrig_required, false, &continuescan, &nsktrig) && !so->scanBehind) { @@ -2041,8 +2066,9 @@ new_prim_scan: * read at least one leaf page before the one we're reading now. This * makes primscan scheduling more efficient when scanning subsets of an * index with many distinct attribute values matching many array elements. - * It encourages fewer, larger primitive scans where that makes sense - * (where index descent costs need to be kept under control). + * It encourages fewer, larger primitive scans where that makes sense. + * This will in turn encourage _bt_readpage to apply the pstate.ikey + * optimization more often. * * Note: This heuristic isn't as aggressive as you might think. We're * conservative about allowing a primitive scan to step from the first @@ -2199,17 +2225,14 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) * the page to the right. * * Advances the scan's array keys when necessary for arrayKeys=true callers. - * Caller can avoid all array related side-effects when calling just to do a - * page continuescan precheck -- pass arrayKeys=false for that. Scans without - * any arrays keys must always pass arrayKeys=false. + * Scans without any array keys must always pass arrayKeys=false. * * Also stops and starts primitive index scans for arrayKeys=true callers. * Scans with array keys are required to set up page state that helps us with * this. The page's finaltup tuple (the page high key for a forward scan, or * the page's first non-pivot tuple for a backward scan) must be set in - * pstate.finaltup ahead of the first call here for the page (or possibly the - * first call after an initial continuescan-setting page precheck call). Set - * this to NULL for rightmost page (or the leftmost page for backwards scans). + * pstate.finaltup ahead of the first call here for the page. Set this to + * NULL for rightmost page (or the leftmost page for backwards scans). * * scan: index scan descriptor (containing a search-type scankey) * pstate: page level input and output parameters @@ -2224,42 +2247,34 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); BTScanOpaque so = (BTScanOpaque) scan->opaque; ScanDirection dir = so->currPos.dir; - int ikey = 0; + int ikey = pstate->ikey; bool res; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); + Assert(arrayKeys || so->numArrayKeys == 0); - res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - arrayKeys, pstate->prechecked, pstate->firstmatch, + res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys, + pstate->forcenonrequired, pstate->firstmatch, &pstate->continuescan, &ikey); #ifdef USE_ASSERT_CHECKING - if (!arrayKeys && so->numArrayKeys) - { - /* - * This is a continuescan precheck call for a scan with array keys. - * - * Assert that the scan isn't in danger of becoming confused. - */ - Assert(!so->scanBehind && !so->oppositeDirCheck); - Assert(!pstate->prechecked && !pstate->firstmatch); - Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, - tupnatts, false, 0, NULL)); - } - if (pstate->prechecked || pstate->firstmatch) + if ((pstate->firstmatch || pstate->ikey > 0) && !pstate->forcenonrequired) { bool dcontinuescan; int dikey = 0; /* - * Call relied on continuescan/firstmatch prechecks -- assert that we - * get the same answer without those optimizations + * _bt_check_compare call relied on the firstmatch and/or pstate->ikey + * optimizations. Assert that _bt_check_compare gives the same answer + * when it can apply neither optimization. */ Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, false, false, &dcontinuescan, &dikey)); Assert(pstate->continuescan == dcontinuescan); + Assert(arrayKeys || dikey == ikey); + Assert(dikey >= ikey); /* weaker assert is for nonrequired arrays */ } #endif @@ -2280,6 +2295,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * It's also possible that the scan is still _before_ the _start_ of * tuples matching the current set of array keys. Check for that first. */ + Assert(!pstate->forcenonrequired); if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, ikey, NULL)) { @@ -2402,6 +2418,291 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, return true; } +/* + * Determine a prefix of scan keys that are guaranteed to be satisfied by + * every possible tuple on pstate's page. + * + * Sets pstate.ikey (and pstate.forcenonrequired) on success, making later + * calls to _bt_checkkeys start checks of each tuple from the so->keyData[] + * entry at pstate.ikey (while treating keys >= pstate.ikey as nonrequired). + * + * Caller must reset pstate.ikey and pstate.forcenonrequired just ahead of the + * _bt_checkkeys call for the page's final tuple (the pstate.finaltup tuple). + * That will give _bt_checkkeys the opportunity to call _bt_advance_array_keys + * properly (with sktrig_required=true) should the array keys need to advance + * on this page. Caller doesn't need to do this on the rightmost/leftmost + * page in the index (where pstate.finaltup won't ever be set). + */ +void +_bt_skip_ikeyprefix(IndexScanDesc scan, BTReadPageState *pstate) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanDirection dir = so->currPos.dir; + Relation rel = scan->indexRelation; + TupleDesc tupdesc = RelationGetDescr(rel); + ItemId iid; + IndexTuple firsttup, + lasttup; + int ikey = 0, + arrayidx = 0, + firstchangingattnum; + + Assert(pstate->minoff < pstate->maxoff); + + if (so->rowCompare && so->numArrayKeys) + return; + + if (so->numberOfKeys == 0) + return; + + /* minoff is an offset to the lowest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->minoff); + firsttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* maxoff is an offset to the highest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->maxoff); + lasttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* Determine the first attribute whose values change on caller's page */ + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + + for (; ikey < so->numberOfKeys; ikey++) + { + ScanKey key = so->keyData + ikey; + BTArrayKeyInfo *array; + Datum tupdatum; + bool tupnull; + int32 result; + + if (key->sk_flags & SK_ROW_HEADER) + { + break; /* pstate.ikey to be set to RowCompare ikey */ + } + + /* + * Determine if it's safe to set pstate.ikey to an offset to a key + * that comes after this key, by examining this key + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) + { + /* + * This is the first key that is not marked required (only happens + * when _bt_preprocess_keys couldn't mark all keys required due to + * implementation restrictions affecting skip array generation) + */ + Assert(!(key->sk_flags & SK_BT_SKIP)); + break; /* pstate.ikey to be set to nonrequired ikey */ + } + if (key->sk_strategy != BTEqualStrategyNumber) + { + /* + * This is the scan's first inequality key (barring inequalities + * that are used to describe the range of a = skip array key). + * + * It's definitely safe for _bt_checkkeys to avoid assessing this + * inequality when the page's first and last non-pivot tuples both + * satisfy the inequality (since the same must also be true of all + * the tuples in between these two). + * + * Unlike the "=" case, it doesn't matter if this attribute has + * more than one distinct value (though it is necessary for any + * and all _prior_ attributes to contain no more than one distinct + * value amongst all of the tuples from pstate.page). + */ + if (key->sk_attno > firstchangingattnum) + break; /* pstate.ikey to be set to inequality's ikey */ + + if (key->sk_flags & SK_ISNULL) + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + break; + } + + tupdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &tupnull); + if (tupnull) + break; /* pstate.ikey to be set to inequality's ikey */ + if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, tupdatum, + key->sk_argument))) + break; + + tupdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &tupnull); + if (tupnull) + break; /* pstate.ikey to be set to inequality's ikey */ + + if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, tupdatum, + key->sk_argument))) + break; /* pstate.ikey to be set to inequality's ikey */ + + continue; /* safe, key satisfied by every tuple */ + } + if (!(key->sk_flags & SK_SEARCHARRAY)) + { + /* + * Found a scalar (non-array) = key. + * + * It is unsafe to set pstate.ikey to an ikey beyond this key, + * unless the = key is satisfied by every possible tuple on the + * page (possible only when attribute has just one distinct value + * among all tuples on the page). + */ + if (key->sk_attno < firstchangingattnum) + { + /* + * Only one distinct value on the page for this key's column. + * Must still make sure that = key is actually satisfied by + * the value that is stored within every tuple on the page. + */ + tupdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &tupnull); + if (key->sk_flags & SK_ISNULL) + { + Assert(key->sk_flags & SK_SEARCHNULL); + if (tupnull) + continue; + break; + } + if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, tupdatum, + key->sk_argument))) + break; /* pstate.ikey to be set to equality's ikey */ + + continue; /* safe, key satisfied by every tuple */ + } + break; /* pstate.ikey to be set to scalar key's ikey */ + } + + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + if (array->num_elems != -1) + { + /* + * Found a SAOP array = key. + * + * Handle this just like we handle scalar = keys. + */ + if (key->sk_attno < firstchangingattnum) + { + /* + * Only one distinct value on the page for this key's column. + * Must still make sure that SAOP array is actually satisfied + * by the value that is stored within every tuple on the page. + */ + tupdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &tupnull); + _bt_binsrch_array_skey(&so->orderProcs[ikey], false, + NoMovementScanDirection, + tupdatum, tupnull, array, key, &result); + if (result == 0) + continue; /* safe, SAOP = key satisfied by every tuple */ + } + break; /* pstate.ikey to be set to SAOP array's ikey */ + } + + /* + * Found a skip array = key. + * + * As with other = keys, moving past this skip array key is safe when + * every tuple on the page is guaranteed to satisfy the array's key. + * But we can be more aggressive here, since skip arrays make it easy + * to assess whether all the values on the page fall within the skip + * array's entire range. + */ + if (array->null_elem) + { + /* Safe, non-range skip array "satisfied" by every tuple on page */ + continue; + } + else if (key->sk_attno > firstchangingattnum) + { + /* + * We cannot assess whether this range skip array will definitely + * be satisfied by every tuple on the page, since its attribute is + * preceded by another attribute that is not certain to contain + * the same prefix of value(s) within every tuple from pstate.page + */ + break; /* pstate.ikey to be set to range array's ikey */ + } + + /* + * Found a range skip array = key. + * + * This is just like the inequality case (though the inequality keys + * tested here are in the array itself, not in so->keyData[]). + */ + tupdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + tupdatum, tupnull, array, key, &result); + if (result != 0) + break; /* pstate.ikey to be set to range array's ikey */ + + tupdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + tupdatum, tupnull, array, key, &result); + if (result != 0) + break; /* pstate.ikey to be set to range array's ikey */ + + /* Safe, range skip array satisfied by every tuple on page */ + } + + /* + * If pstate.ikey remains 0, _bt_advance_array_keys will still be able to + * apply its precheck optimization when dealing with "nonrequired" array + * keys. That's reason enough on its own to set forcenonrequired=true. + */ + pstate->forcenonrequired = (so->numArrayKeys > 0); + pstate->ikey = ikey; + + if (!pstate->forcenonrequired) + return; + + /* + * Set the element for range skip arrays whose ikey is >= pstate.ikey to + * whatever the first array element is in the scan's current direction. + * This allows range skip arrays that will never be satisfied by any tuple + * on the page to avoid extra sk_argument comparisons -- _bt_check_compare + * won't use the key's sk_argument when the key is marked MINVAL/MAXVAL + * (note that MINVAL/MAXVAL won't be unset until an exact match is found, + * which might not happen for any tuple on the page). + * + * Set the element for non-range skip arrays whose ikey is >= pstate.ikey + * to NULL (regardless of whether NULLs are stored first or last), too. + * This allows non-range skip arrays (recognized by _bt_check_compare as + * "non-required" skip arrays with ISNULL set) to avoid needlessly calling + * _bt_advance_array_keys (it isn't necessary because we know that any + * non-range skip array must be satisfied by every possible value). + * + * Note: It's safe for us to set ISNULL like this without regard for + * whether it'll leave the array keys before or after the page's keyspace. + * We know that once caller unsets pstate.forcenonrequired, its call to + * _bt_checkkeys will still be able to advance the scan's array keys, + * without hindrance from _bt_tuple_before_array_skeys. We also know that + * a sktrig_required=false call to _bt_advance_array_keys won't alter any + * array unless an exact match is available to "advance" the array to. + */ + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey key = &so->keyData[array->scan_key]; + + Assert(key->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + continue; + if (array->scan_key < pstate->ikey) + continue; + + Assert(key->sk_flags & SK_BT_SKIP); + + if (!array->null_elem) + _bt_array_set_low_or_high(rel, key, array, + ScanDirectionIsForward(dir)); + else + _bt_skiparray_set_isnull(rel, key, array); + } +} + /* * Test whether an indextuple satisfies current scan condition. * @@ -2433,18 +2734,25 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, * * Though we advance non-required array keys on our own, that shouldn't have * any lasting consequences for the scan. By definition, non-required arrays - * have no fixed relationship with the scan's progress. (There are delicate - * considerations for non-required arrays when the arrays need to be advanced - * following our setting continuescan to false, but that doesn't concern us.) + * have no fixed relationship with the scan's progress. * * Pass advancenonrequired=false to avoid all array related side effects. * This allows _bt_advance_array_keys caller to avoid infinite recursion. + * + * Pass forcenonrequired=true to instruct us to treat all keys as nonrequired. + * This is used to make it safe to temporarily stop properly maintaining the + * scan's required arrays. Callers can determine which prefix of keys must + * satisfy every possible prefix of index attribute values on the page, and + * then pass us an initial *ikey for the first key that might be unsatisfied. + * We won't be maintaining any arrays before that initial *ikey, so there is + * no point in trying to do so for any later arrays. (Callers that do this + * must be careful to reset the array keys when they finish reading the page.) */ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, - bool *continuescan, int *ikey) + bool advancenonrequired, bool forcenonrequired, + bool firstmatch, bool *continuescan, int *ikey) { BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -2460,36 +2768,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* * Check if the key is required in the current scan direction, in the - * opposite scan direction _only_, or in neither direction + * opposite scan direction _only_, or in neither direction (except + * when we're forced to treat all scan keys as nonrequired) */ - if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) requiredOppositeDirOnly = true; - /* - * If the caller told us the *continuescan flag is known to be true - * for the last item on the page, then we know the keys required for - * the current direction scan should be matched. Otherwise, the - * *continuescan flag would be set for the current item and - * subsequently the last item on the page accordingly. - * - * If the key is required for the opposite direction scan, we can skip - * the check if the caller tells us there was already at least one - * matching item on the page. Also, we require the *continuescan flag - * to be true for the last item on the page to know there are no - * NULLs. - * - * Both cases above work except for the row keys, where NULLs could be - * found in the middle of matching values. - */ - if (prechecked && - (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) && - !(key->sk_flags & SK_ROW_HEADER)) - continue; - if (key->sk_attno > tupnatts) { /* @@ -2511,6 +2803,19 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, { Assert(key->sk_flags & SK_SEARCHARRAY); Assert(key->sk_flags & SK_BT_SKIP); + Assert(requiredSameDir || forcenonrequired); + + /* + * Cannot fall back on _bt_tuple_before_array_skeys when we're + * treating the scan's keys as nonrequired, though. Just handle + * this like any other non-required equality-type array key. + */ + if (forcenonrequired) + { + Assert(!(key->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))); + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + } *continuescan = false; return false; @@ -2520,7 +2825,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) + forcenonrequired, continuescan)) continue; return false; } @@ -2552,9 +2857,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ if (requiredSameDir) *continuescan = false; + else if (unlikely(key->sk_flags & SK_BT_SKIP)) + { + /* + * If we're treating scan keys as nonrequired, and encounter a + * skip array scan key whose current element is NULL, then it + * must be a non-range skip array. It must be satisfied, so + * there's no need to call _bt_advance_array_keys to check. + */ + Assert(forcenonrequired && *ikey > 0); + continue; + } /* - * In any case, this indextuple doesn't match the qual. + * This indextuple doesn't match the qual. */ return false; } @@ -2575,7 +2891,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * forward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -2593,7 +2909,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * backward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -2662,7 +2978,8 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, + bool forcenonrequired, bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -2702,7 +3019,11 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { - if (subkey->sk_flags & SK_BT_NULLS_FIRST) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have @@ -2756,8 +3077,12 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, */ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); subkey--; - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) @@ -2809,7 +3134,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, break; } - if (!result) + if (!result && !forcenonrequired) { /* * Tuple fails this qual. If it's a required qual for the current @@ -2853,6 +3178,8 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, OffsetNumber aheadoffnum; IndexTuple ahead; + Assert(!pstate->forcenonrequired); + /* Avoid looking ahead when comparing the page high key */ if (pstate->offnum < pstate->minoff) return; -- 2.49.0