From 660ed6d1059f2d070d49b578319f10e5d8e80832 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 17 Jun 2023 17:03:36 -0700 Subject: [PATCH v10] Enhance nbtree ScalarArrayOp execution. Commit 9e8da0f7 taught nbtree to handle ScalarArrayOpExpr quals natively. This works by pushing down the full context (the array keys) to the nbtree index AM, enabling it to execute multiple primitive index scans that the planner treats as one continuous index scan/index path. This earlier enhancement enabled nbtree ScalarArrayOp index-only scans. It also allowed scans with ScalarArrayOp quals to return ordered results (with some notable restrictions, described further down). Take this general approach a lot further: teach nbtree SAOP index scans to determine how best to execute ScalarArrayOp scans (how many primitive index scans to use under the hood) by applying information about the physical characteristics of the index at runtime. This approach can be far more efficient. Many cases that previously required thousands of index descents now require as few as one single index descent. And, all SAOP scans reliably avoid duplicative leaf page accesses (just like any other nbtree index scan). The array state machine now advances using binary searches for the array element that best matches the next tuple's attribute value. This whole process makes required scan key arrays (i.e. arrays from scan keys that can terminate the scan) ratchet forward in lockstep with the index scan. Non-required arrays (i.e. arrays from scan keys that can only exclude non-matching tuples) are for the most part advanced via this same search process. We just can't assume a fixed relationship between the current element of any non-required array and the progress of the index scan through the index's key space (that would be wrong). Naturally, only required SAOP scan keys trigger skipping over leaf pages (non-required arrays cannot safely end or start primitive index scans). 
Consequently, index scans of a composite index with (say) a high-order inequality scan key (which we'll mark required) and a low-order SAOP scan key (which we'll mark non-required) will now reliably output rows in index order. Such scans are always executed as one large index scan under the hood, which is obviously the most efficient way to do it, for the usual reason (no more wasting cycles on repeat leaf page accesses). Generalizing SAOP execution along these lines removes any question of index scans outputting tuples in any order that isn't the index's order. This allows us to remove various special cases from the planner -- which in turn makes the nbtree work more widely applicable and more effective. Bugfix commit 807a40c5 taught the planner to avoid generating unsafe path keys: path keys on a multicolumn index path, with a SAOP clause on any attribute beyond the first/most significant attribute. These cases are now all safe, so we go back to generating path keys without regard for the presence of SAOP clauses (just like with any other clause type). Also undo changes from follow-up bugfix commit a4523c5a, which taught the planner to produce alternative index paths without any low-order ScalarArrayOpExpr quals (making the SAOP quals into filter quals). We'll no longer generate these alternative paths, which can no longer offer any advantage over the index qual paths that we do still generate. Affected queries thereby avoid all of the disadvantages that come from using filter quals within index scan nodes. In particular, they can avoid the extra heap page accesses previously incurred when using filter quals to exclude non-matching tuples (index quals can be used instead). This shift is expected to be fairly common in real world applications, especially with queries that have multiple SAOPs that can now all be used as index quals when scanning a composite index. 
Queries with low-order SAOPs (especially non-required ones) are also likely to see a significant reduction in heap page accesses. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Reviewed-By: Matthias van de Meent Reviewed-By: Tomas Vondra Discussion: https://postgr.es/m/CAH2-Wz=ksvN_sjcnD1+Bt-WtifRA5ok48aDYnq3pkKhxgMQpcw@mail.gmail.com --- doc/src/sgml/monitoring.sgml | 15 + src/backend/access/nbtree/nbtree.c | 80 +- src/backend/access/nbtree/nbtsearch.c | 122 +- src/backend/access/nbtree/nbtutils.c | 1816 ++++++++++++++++++-- src/backend/optimizer/path/indxpath.c | 86 +- src/backend/utils/adt/selfuncs.c | 122 +- src/include/access/nbtree.h | 49 +- src/test/regress/expected/btree_index.out | 479 ++++++ src/test/regress/expected/create_index.out | 31 +- src/test/regress/expected/join.out | 5 +- src/test/regress/sql/btree_index.sql | 147 ++ src/test/regress/sql/create_index.sql | 10 +- 12 files changed, 2601 insertions(+), 361 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b804eb8b5..0dd80cc71 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4062,6 +4062,21 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + Every time an index is searched, the index's + pg_stat_all_indexes.idx_scan + field is incremented. This usually happens once per index scan node + execution, but might take place several times during execution of a scan + that searches for multiple values together. Queries that use certain + SQL constructs to search for rows matching any value + out of a list (or an array) of multiple scalar values might perform + multiple primitive index scans (up to one primitive scan + per scalar value) at runtime. See + for details. 
+ + + diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 696d79c08..f28fa227f 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -48,8 +48,8 @@ * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan * to a new page; some process can start doing that. * - * BTPARALLEL_DONE indicates that the scan is complete (including error exit). - * We reach this state once for every distinct combination of array keys. + * BTPARALLEL_DONE indicates that the primitive index scan is complete + * (including error exit). Reached once per primitive index scan. */ typedef enum { @@ -69,8 +69,8 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ - int btps_arrayKeyCount; /* count indicating number of array scan - * keys processed by parallel scan */ + int btps_numPrimScans; /* count indicating number of primitive + * index scans (used with array keys) */ slock_t btps_mutex; /* protects above variables */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ } BTParallelScanDescData; @@ -235,7 +235,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) _bt_start_array_keys(scan, dir); } - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* @@ -277,8 +277,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* If we have a tuple, return it ... */ if (res) break; - /* ... otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + /* ... 
otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_array_keys_remain(scan, dir)); return res; } @@ -305,7 +305,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) _bt_start_array_keys(scan, ForwardScanDirection); } - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* Fetch the first page & tuple */ @@ -335,8 +335,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids++; } } - /* Now see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + /* Now see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_array_keys_remain(scan, ForwardScanDirection)); return ntids; } @@ -366,9 +366,11 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->keyData = NULL; so->arrayKeyData = NULL; /* assume no array keys for now */ - so->arraysStarted = false; so->numArrayKeys = 0; + so->advanceDir = NoMovementScanDirection; + so->needPrimScan = false; so->arrayKeys = NULL; + so->orderProcs = NULL; so->arrayContext = NULL; so->killedItems = NULL; /* until needed */ @@ -408,7 +410,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; - so->arrayKeyCount = 0; + so->advanceDir = NoMovementScanDirection; + so->needPrimScan = false; + so->numPrimScans = 0; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -507,10 +511,6 @@ btmarkpos(IndexScanDesc scan) BTScanPosInvalidate(so->markPos); so->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* @@ -521,10 +521,6 @@ btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ - if (so->numArrayKeys) - _bt_restore_array_keys(scan); - if (so->markItemIndex >= 0) { /* @@ -563,6 +559,9 @@ 
btrestrpos(IndexScanDesc scan) if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + /* Rewind the scan's array keys, if any */ + if (so->numArrayKeys) + _bt_rewind_array_keys(scan); } else BTScanPosInvalidate(so->currPos); @@ -589,7 +588,7 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - bt_target->btps_arrayKeyCount = 0; + bt_target->btps_numPrimScans = 0; ConditionVariableInit(&bt_target->btps_cv); } @@ -615,7 +614,7 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount = 0; + btscan->btps_numPrimScans = 0; SpinLockRelease(&btscan->btps_mutex); } @@ -626,7 +625,11 @@ btparallelrescan(IndexScanDesc scan) * * The return value is true if we successfully seized the scan and false * if we did not. The latter case occurs if no pages remain for the current - * set of scankeys. + * primitive index scan. + * + * When array scan keys are in use, each worker process independently advances + * its array keys. It's crucial that each worker process never be allowed to + * scan a page from before the current scan position. * * If the return value is true, *pageno returns the next or current page * of the scan (depending on the scan direction). An invalid block number @@ -657,16 +660,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) SpinLockAcquire(&btscan->btps_mutex); pageStatus = btscan->btps_pageStatus; - if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + if (so->numPrimScans < btscan->btps_numPrimScans) { - /* Parallel scan has already advanced to a new set of scankeys. 
*/ + /* Top-level scan already moved on to next primitive index scan */ status = false; } else if (pageStatus == BTPARALLEL_DONE) { /* - * We're done with this set of scankeys. This may be the end, or - * there could be more sets to try. + * We're done with this primitive index scan. This might have + * been the final primitive index scan required, or the top-level + * index scan might require additional primitive scans. */ status = false; } @@ -698,9 +702,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) { + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(!so->needPrimScan); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); @@ -734,12 +741,11 @@ _bt_parallel_done(IndexScanDesc scan) parallel_scan->ps_offset); /* - * Mark the parallel scan as done for this combination of scan keys, - * unless some other process already did so. See also - * _bt_advance_array_keys. + * Mark the primitive index scan as done, unless some other process + * already did so. See also _bt_array_keys_remain. */ SpinLockAcquire(&btscan->btps_mutex); - if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && + if (so->numPrimScans >= btscan->btps_numPrimScans && btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; @@ -753,14 +759,14 @@ _bt_parallel_done(IndexScanDesc scan) } /* - * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array - * keys. + * _bt_parallel_next_primitive_scan() -- Advances parallel primitive scan + * counter when array keys are in use. * - * Updates the count of array keys processed for both local and parallel + * Updates the count of primitive index scans for both local and parallel * scans. 
*/ void -_bt_parallel_advance_array_keys(IndexScanDesc scan) +_bt_parallel_next_primitive_scan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -769,13 +775,13 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); - so->arrayKeyCount++; + so->numPrimScans++; SpinLockAcquire(&btscan->btps_mutex); if (btscan->btps_pageStatus == BTPARALLEL_DONE) { btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount++; + btscan->btps_numPrimScans++; } SpinLockRelease(&btscan->btps_mutex); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 63ee9ba22..7b3bbd882 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -907,7 +907,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (!so->qual_ok) { - /* Notify any other workers that we're done with this scan key. 
*/ + /* Notify any other workers that this primitive scan is done */ _bt_parallel_done(scan); return false; } @@ -1527,10 +1527,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int itemIndex; - bool continuescan; + BTReadPageState pstate; + int numArrayKeys, + itemIndex; int indnatts; - bool continuescanPrechecked; + bool continuescanPrechecked = false; bool haveFirstMatch = false; /* @@ -1551,8 +1552,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); } - continuescan = true; /* default assumption */ + pstate.dir = dir; + pstate.finaltup = NULL; + pstate.continuescan = true; /* default assumption */ + pstate.finaltupchecked = false; indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + numArrayKeys = so->numArrayKeys; + minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -1599,9 +1605,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * the last item on the page would give a more precise answer. * * We skip this for the first page in the scan to evade the possible - * slowdown of the point queries. + * slowdown of point queries. Never apply the optimization with a scan + * that uses array keys, either, since that breaks certain assumptions. + * (Our search-type scan keys change whenever _bt_checkkeys advances the + * arrays, invalidating any precheck. Tracking all that would be tricky.) */ - if (!firstPage && minoff < maxoff) + if (!firstPage && !numArrayKeys && minoff < maxoff) { ItemId iid; IndexTuple itup; @@ -1610,21 +1619,24 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); /* - * Do the precheck. Note that we pass the pointer to the - * 'continuescanPrechecked' to the 'continuescan' argument. 
That will - * set flag to true if all required keys are satisfied and false - * otherwise. + * Flag variable is set when all scan keys that are required in the + * current scan direction are satisfied by the last item on the page */ - (void) _bt_checkkeys(scan, itup, indnatts, dir, - &continuescanPrechecked, false, false); - } - else - { - continuescanPrechecked = false; + _bt_checkkeys(scan, &pstate, itup, false, indnatts, false, false); + continuescanPrechecked = pstate.continuescan; + pstate.continuescan = true; /* reset */ } if (ScanDirectionIsForward(dir)) { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (numArrayKeys && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in ascending order */ itemIndex = 0; @@ -1649,8 +1661,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, + passes_quals = _bt_checkkeys(scan, &pstate, itup, false, indnatts, continuescanPrechecked, haveFirstMatch); @@ -1659,9 +1670,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * assert-enabled builds we also recheck that the _bt_checkkeys() * result is the same. 
*/ - Assert((!continuescanPrechecked && haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + Assert((!continuescanPrechecked && haveFirstMatch) || numArrayKeys || + passes_quals == _bt_checkkeys(scan, &pstate, itup, false, + indnatts, false, false)); if (passes_quals) { /* tuple passes all scan key conditions */ @@ -1696,7 +1707,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } /* When !continuescan, there can't be any more matches, so stop */ - if (!continuescan) + if (!pstate.continuescan) break; offnum = OffsetNumberNext(offnum); @@ -1713,17 +1724,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * only appear on non-pivot tuples on the right sibling page are * common. */ - if (continuescan && !P_RIGHTMOST(opaque)) + if (pstate.continuescan && !P_RIGHTMOST(opaque)) { ItemId iid = PageGetItemId(page, P_HIKEY); IndexTuple itup = (IndexTuple) PageGetItem(page, iid); int truncatt; truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false); + _bt_checkkeys(scan, &pstate, itup, true, truncatt, false, false); } - if (!continuescan) + if (!pstate.continuescan) so->currPos.moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); @@ -1733,6 +1744,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } else { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (numArrayKeys && minoff <= maxoff) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1744,6 +1763,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, IndexTuple itup; bool tuple_alive; bool passes_quals; + bool finaltup = (offnum == minoff); /* * If the scan specifies not to return killed tuples, then we @@ 
-1754,12 +1774,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * tuple on the page, we do check the index keys, to prevent * uselessly advancing to the page to the left. This is similar * to the high key optimization used by forward scans. + * + * Separately, _bt_checkkeys actually requires that we call it + * with the final non-pivot tuple from the page, if there's one + * (final processed tuple, or first tuple in offset number terms). + * We must indicate which particular tuple comes last, too. */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { Assert(offnum >= P_FIRSTDATAKEY(opaque)); - if (offnum > P_FIRSTDATAKEY(opaque)) + if (!finaltup) { + Assert(offnum > minoff); offnum = OffsetNumberPrev(offnum); continue; } @@ -1772,9 +1798,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, - continuescanPrechecked, + passes_quals = _bt_checkkeys(scan, &pstate, itup, finaltup, + indnatts, continuescanPrechecked, haveFirstMatch); /* @@ -1782,9 +1807,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * assert-enabled builds we also recheck that the _bt_checkkeys() * result is the same. 
*/ - Assert((!continuescanPrechecked && !haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + Assert((!continuescanPrechecked && !haveFirstMatch) || numArrayKeys || + passes_quals == _bt_checkkeys(scan, &pstate, itup, + finaltup, indnatts, + false, false)); if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ @@ -1824,7 +1850,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } } - if (!continuescan) + if (!pstate.continuescan) { /* there can't be any more matches, so stop */ so->currPos.moreLeft = false; @@ -1999,6 +2025,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) /* Remember we left a page with data */ so->currPos.moreLeft = true; + /* + * If the scan direction changed since our array keys (if any) last + * advanced, we cannot trust _bt_readpage's determination that there + * are no matches to be found to the right + */ + if (ScanDirectionIsBackward(so->advanceDir)) + { + Assert(so->numArrayKeys); + + so->currPos.moreRight = true; + so->advanceDir = dir; + so->needPrimScan = false; + } + /* release the previous buffer, if pinned */ BTScanPosUnpinIfPinned(so->currPos); } @@ -2007,6 +2047,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) /* Remember we left a page with data */ so->currPos.moreRight = true; + /* + * If the scan direction changed since our array keys (if any) last + * advanced, we cannot trust _bt_readpage's determination that there + * are no matches to be found to the left + */ + if (ScanDirectionIsForward(so->advanceDir)) + { + Assert(so->numArrayKeys); + + so->currPos.moreLeft = true; + so->advanceDir = dir; + so->needPrimScan = false; + } + if (scan->parallel_scan != NULL) { /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2e6fc14d7..c26ed8132 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -33,7 +33,7 @@ typedef 
struct BTSortArrayContext { - FmgrInfo flinfo; + FmgrInfo *orderproc; Oid collation; bool reverse; } BTSortArrayContext; @@ -41,15 +41,42 @@ typedef struct BTSortArrayContext static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, StrategyNumber strat, Datum *elems, int nelems); +static void _bt_sort_array_cmp_setup(IndexScanDesc scan, ScanKey skey); static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, bool reverse, Datum *elems, int nelems); +static int _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, bool reverse, + Datum *elems_orig, int nelems_orig, + Datum *elems_next, int nelems_next); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur); +static int _bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_start, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, + BTReadPageState *pstate, + IndexTuple tuple, int sktrig, + bool validtrig); +static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int sktrig); +static void _bt_update_keys_with_arraykeys(IndexScanDesc scan); +#ifdef USE_ASSERT_CHECKING +static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); +#endif static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_compare(ScanDirection dir, BTScanOpaque so, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int numArrayKeys, bool *continuescan, int *ikey, + bool continuescanPrechecked, bool 
haveFirstMatch); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); @@ -190,13 +217,48 @@ _bt_freestack(BTStack stack) * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and * set up BTArrayKeyInfo info for each one that is an equality-type key. * Prepare modified scan keys in so->arrayKeyData, which will hold the current - * array elements during each primitive indexscan operation. For inequality - * array keys, it's sufficient to find the extreme element value and replace - * the whole array with that scalar value. + * array elements. + * + * _bt_preprocess_keys treats each primitive scan as an independent piece of + * work. We perform all preprocessing that must work "across array keys". + * This division of labor makes sense once you consider that we're called only + * once per btrescan, whereas _bt_preprocess_keys is called once per primitive + * index scan. + * + * Currently we perform two kinds of preprocessing to deal with redundancies. + * For inequality array keys, it's sufficient to find the extreme element + * value and replace the whole array with that scalar value. This eliminates + * all but one array key as redundant. Similarly, we are capable of "merging + * together" multiple equality array keys from two or more input scan keys + * into a single output scan key that contains only the intersecting array + * elements. This can eliminate many redundant array elements, as well as + * eliminating whole array scan keys as redundant. + * + * Note: _bt_start_array_keys actually sets up the cur_elem counters later on, + * once the scan direction is known. * * Note: the reason we need so->arrayKeyData, rather than just scribbling * on scan->keyData, is that callers are permitted to call btrescan without * supplying a new set of scankey data. 
+ * + * Note: _bt_preprocess_keys is responsible for creating the so->keyData scan + * keys used by _bt_checkkeys. Index scans that don't use equality array keys + * will have _bt_preprocess_keys treat scan->keyData as input and so->keyData + * as output. Scans that use equality array keys have _bt_preprocess_keys + * treat so->arrayKeyData (which is our output) as their input, while (as per + * usual) outputting so->keyData for _bt_checkkeys. This function adds an + * additional layer of indirection that allows _bt_preprocess_keys to more or + * less avoid dealing with SK_SEARCHARRAY as a special case. + * + * Note: _bt_update_keys_with_arraykeys works by updating already-processed + * output keys (so->keyData) in-place. It cannot eliminate redundant or + * contradictory scan keys. This necessitates having _bt_preprocess_keys + * understand that it is unsafe to eliminate "redundant" SK_SEARCHARRAY + * equality scan keys on the basis of what is actually just the current array + * key values -- it must conservatively assume that such a scan key might no + * longer be redundant after the next _bt_update_keys_with_arraykeys call. + * Ideally we'd be able to deal with that by eliminating a subset of truly + * redundant array keys up-front, but it doesn't seem worth the trouble. 
*/ void _bt_preprocess_array_keys(IndexScanDesc scan) @@ -204,7 +266,10 @@ _bt_preprocess_array_keys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int numberOfKeys = scan->numberOfKeys; int16 *indoption = scan->indexRelation->rd_indoption; + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(scan->indexRelation); int numArrayKeys; + int lastEqualityArrayAtt = -1; + Oid lastOrderProc = InvalidOid; ScanKey cur; int i; MemoryContext oldContext; @@ -257,6 +322,8 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + so->orderProcs = (FmgrInfo *) palloc0(nkeyatts * sizeof(FmgrInfo)); + so->advanceDir = NoMovementScanDirection; /* Now process each array key */ numArrayKeys = 0; @@ -273,6 +340,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int j; cur = &so->arrayKeyData[i]; + + /* + * Attributes with equality-type scan keys (including but not limited + * to array scan keys) will need a 3-way comparison function. Set + * that up now. (Avoids repeating work for the same attribute.) 
+ */ + if (cur->sk_strategy == BTEqualStrategyNumber && + !OidIsValid(so->orderProcs[cur->sk_attno - 1].fn_oid)) + _bt_sort_array_cmp_setup(scan, cur); + if (!(cur->sk_flags & SK_SEARCHARRAY)) continue; @@ -349,6 +426,46 @@ _bt_preprocess_array_keys(IndexScanDesc scan) (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, elem_values, num_nonnulls); + /* + * If this scan key is semantically equivalent to a previous equality + * operator array scan key, merge the two arrays together to eliminate + * redundant non-intersecting elements (and redundant whole scan keys) + */ + if (lastEqualityArrayAtt == cur->sk_attno && + lastOrderProc == cur->sk_func.fn_oid) + { + BTArrayKeyInfo *prev = &so->arrayKeys[numArrayKeys - 1]; + + Assert(so->arrayKeyData[prev->scan_key].sk_subtype == + cur->sk_subtype); + + num_elems = _bt_merge_arrays(scan, cur, + (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + prev->elem_values, prev->num_elems, + elem_values, num_elems); + + pfree(elem_values); + + /* + * If there are no intersecting elements left from merging this + * array into the previous array on the same attribute, the scan + * qual is unsatisfiable + */ + if (num_elems == 0) + { + numArrayKeys = -1; + break; + } + + /* + * Lower the number of elements from the previous array, and mark + * this scan key/array as redundant for every primitive index scan + */ + prev->num_elems = num_elems; + cur->sk_flags |= SK_BT_RDDNARRAY; + continue; + } + /* * And set up the BTArrayKeyInfo data. 
*/ @@ -356,6 +473,8 @@ _bt_preprocess_array_keys(IndexScanDesc scan) so->arrayKeys[numArrayKeys].num_elems = num_elems; so->arrayKeys[numArrayKeys].elem_values = elem_values; numArrayKeys++; + lastEqualityArrayAtt = cur->sk_attno; + lastOrderProc = cur->sk_func.fn_oid; } so->numArrayKeys = numArrayKeys; @@ -429,26 +548,28 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, } /* - * _bt_sort_array_elements() -- sort and de-dup array elements + * _bt_sort_array_cmp_setup() -- Look up array comparison function * - * The array elements are sorted in-place, and the new number of elements - * after duplicate removal is returned. - * - * scan and skey identify the index column, whose opfamily determines the - * comparison semantics. If reverse is true, we sort in descending order. + * Sets so->orderProcs[] for scan key's attribute. This is used to sort and + * deduplicate the attribute's array (if any). It's also used during binary + * searches of the next array key matching index tuples just beyond the range + * of the scan's current set of array keys. */ -static int -_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, - bool reverse, - Datum *elems, int nelems) +static void +_bt_sort_array_cmp_setup(IndexScanDesc scan, ScanKey skey) { + BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Oid elemtype; RegProcedure cmp_proc; - BTSortArrayContext cxt; + FmgrInfo *orderproc = &so->orderProcs[skey->sk_attno - 1]; - if (nelems <= 1) - return nelems; /* no work to do */ + /* + * Should do this for all equality strategy scan keys only (including + * those without any array). See _bt_advance_array_keys for details of + * why we need an ORDER proc for non-array equality strategy scan keys. + */ + Assert(skey->sk_strategy == BTEqualStrategyNumber); /* * Determine the nominal datatype of the array elements. 
We have to @@ -462,22 +583,44 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, /* * Look up the appropriate comparison function in the opfamily. * - * Note: it's possible that this would fail, if the opfamily is - * incomplete, but it seems quite unlikely that an opfamily would omit - * non-cross-type support functions for any datatype that it supports at - * all. + * Note: it's possible that this would fail, if the opfamily lacks the + * required cross-type ORDER proc. But this is no different to the case + * where _bt_first fails to find an ORDER proc for its insertion scan key. */ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], - elemtype, - elemtype, + rel->rd_opcintype[skey->sk_attno - 1], elemtype, BTORDER_PROC); if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", - BTORDER_PROC, elemtype, elemtype, - rel->rd_opfamily[skey->sk_attno - 1]); + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[skey->sk_attno - 1], elemtype, + skey->sk_attno, RelationGetRelationName(rel)); + + /* Save in orderproc entry for attribute */ + fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext); +} + +/* + * _bt_sort_array_elements() -- sort and de-dup array elements + * + * The array elements are sorted in-place, and the new number of elements + * after duplicate removal is returned. + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. If reverse is true, we sort in descending order. 
+ */ +static int +_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + + if (nelems <= 1) + return nelems; /* no work to do */ /* Sort the array elements */ - fmgr_info(cmp_proc, &cxt.flinfo); + cxt.orderproc = &so->orderProcs[skey->sk_attno - 1]; cxt.collation = skey->sk_collation; cxt.reverse = reverse; qsort_arg(elems, nelems, sizeof(Datum), @@ -488,6 +631,48 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, _bt_compare_array_elements, &cxt); } +/* + * _bt_merge_arrays() -- merge together duplicate array keys + * + * Both scan keys have array elements that have already been sorted and + * deduplicated. + */ +static int +_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, bool reverse, + Datum *elems_orig, int nelems_orig, + Datum *elems_next, int nelems_next) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + Datum *merged = palloc(sizeof(Datum) * Min(nelems_orig, nelems_next)); + int merged_nelems = 0; + + /* + * Incrementally copy the original array into a temp buffer, skipping over + * any items that are missing from the "next" array + */ + cxt.orderproc = &so->orderProcs[skey->sk_attno - 1]; + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + for (int i = 0; i < nelems_orig; i++) + { + Datum *elem = elems_orig + i; + + if (bsearch_arg(elem, elems_next, nelems_next, sizeof(Datum), + _bt_compare_array_elements, &cxt)) + merged[merged_nelems++] = *elem; + } + + /* + * Overwrite the original array with temp buffer so that we're only left + * with intersecting array elements + */ + memcpy(elems_orig, merged, merged_nelems * sizeof(Datum)); + pfree(merged); + + return merged_nelems; +} + /* * qsort_arg comparator for sorting array elements */ @@ -499,7 +684,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) BTSortArrayContext *cxt = (BTSortArrayContext *) arg; 
int32 compare; - compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + compare = DatumGetInt32(FunctionCall2Coll(cxt->orderproc, cxt->collation, da, db)); if (cxt->reverse) @@ -507,6 +692,159 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) return compare; } +/* + * _bt_compare_array_skey() -- apply array comparison function + * + * Compares caller's tuple attribute value to a scan key/array element. + * Helper function used during binary searches of SK_SEARCHARRAY arrays. + * + * This routine returns: + * <0 if tupdatum < arrdatum; + * 0 if tupdatum == arrdatum; + * >0 if tupdatum > arrdatum. + * + * This is essentially the same interface as _bt_compare: both functions + * compare the value that they're searching for to a binary search pivot. + * However, unlike _bt_compare, this function's "tuple argument" comes first, + * while its "array/scankey argument" comes second. +*/ +static inline int32 +_bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur) +{ + int32 result = 0; + + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_ISNULL) + result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * Like _bt_compare, we need to be careful of cross-type comparisons, + * so the left value has to be the value that came from an index tuple + */ + result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, + tupdatum, arrdatum)); + + /* + * We flip the sign by following the obvious rule: flip whenever the + * column is a DESC column. 
+ * + * _bt_compare does it the wrong way around (flip when *ASC*) in order + * to compensate for passing its orderproc arguments backwards. We + * don't need to play these games because we find it natural to pass + * tupdatum as the left value (and arrdatum as the right value). + */ + if (cur->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(result); + } + + return result; +} + +/* + * _bt_binsrch_array_skey() -- Binary search for next matching array key + * + * Returns an index to the first array element >= caller's tupdatum argument. + * This convention is more natural for forwards scan callers, but that can't + * really matter to backwards scan callers. Both callers require handling for + * the case where the match we return is < tupdatum, and symmetric handling + * for the case where our best match is > tupdatum. + * + * Also sets *set_elem_result to whatever _bt_compare_array_skey returned when + * we compared the returned array element to caller's tupdatum argument. This + * helps our caller to determine how advancing its array (to the element we'll + * return an offset to) might need to carry to higher order arrays. + * + * cur_elem_start indicates if the binary search should begin at the array's + * current element (or have the current element as an upper bound if it's a + * backward scan). It's safe for searches against required scan key arrays to + * reuse earlier search bounds like this because such arrays always advance in + * lockstep with the index scan's progress through the index's key space. 
+ */ +static int +_bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_start, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + int low_elem, + mid_elem, + high_elem, + result = 0; + + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + low_elem = 0; + mid_elem = -1; + high_elem = array->num_elems - 1; + if (cur_elem_start) + { + if (ScanDirectionIsForward(dir)) + low_elem = array->cur_elem; + else + high_elem = array->cur_elem; + } + + while (high_elem > low_elem) + { + Datum arrdatum; + + mid_elem = low_elem + ((high_elem - low_elem) / 2); + arrdatum = array->elem_values[mid_elem]; + + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result == 0) + { + /* + * Each array was deduplicated during initial preprocessing, so + * it's safe to quit as soon as we see an equal array element. + * This often saves an extra comparison or two... + */ + low_elem = mid_elem; + break; + } + + if (result > 0) + low_elem = mid_elem + 1; + else + high_elem = mid_elem; + } + + /* + * ...but our caller also cares about how its searched-for tuple datum + * compares to the array element we'll return. Set *set_elem_result with + * the result of that comparison specifically. 
+ */ + if (low_elem != mid_elem) + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + array->elem_values[low_elem], cur); + + *set_elem_result = result; + + return low_elem; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -532,29 +870,40 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; } - so->arraysStarted = true; + so->advanceDir = dir; } /* - * _bt_advance_array_keys() -- Advance to next set of array elements + * _bt_advance_array_keys_increment() -- Advance to next set of array elements + * + * Advances the array keys by a single increment in the current scan + * direction. When there are multiple array keys this can roll over from the + * lowest order array to higher order arrays. * * Returns true if there is another set of values to consider, false if not. * On true result, the scankeys are initialized with the next set of values. + * On false result, the scankeys stay the same, and the array keys are not + * advanced (every array remains at its final element for scan direction). + * + * Note: routine only initializes so->arrayKeyData[] scankeys. Caller must + * either call _bt_update_keys_with_arraykeys or call _bt_preprocess_keys to + * update the scan's search-type scankeys. */ -bool -_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +static bool +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool found = false; - int i; + + Assert(!so->needPrimScan); /* * We must advance the last array key most quickly, since it will * correspond to the lowest-order index column among the available - * qualifications. This is necessary to ensure correct ordering of output - * when there are multiple array keys. + * qualifications. Rolling over like this is necessary to ensure correct + * ordering of output when there are multiple array keys. 
*/ - for (i = so->numArrayKeys - 1; i >= 0; i--) + for (int i = so->numArrayKeys - 1; i >= 0; i--) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; @@ -588,85 +937,988 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) break; } - /* advance parallel scan */ - if (scan->parallel_scan != NULL) - _bt_parallel_advance_array_keys(scan); + if (found) + return true; /* - * When no new array keys were found, the scan is "past the end" of the - * array keys. _bt_start_array_keys can still "restart" the array keys if - * a rescan is required. + * Don't allow the entire set of array keys to roll over: restore the + * array keys to the state they were in before we were called. + * + * This ensures that the array keys only ratchet forward (or backwards in + * the case of backward scans). Our "so->arrayKeyData[]" scan keys should + * always match the current "so->keyData[]" search-type scan keys (except + * for a brief moment during array key advancement). */ - if (!found) - so->arraysStarted = false; - - return found; -} - -/* - * _bt_mark_array_keys() -- Handle array keys during btmarkpos - * - * Save the current state of the array keys as the "mark" position. 
- */ -void -_bt_mark_array_keys(IndexScanDesc scan) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int i; - - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + BTArrayKeyInfo *rollarray = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[rollarray->scan_key]; - curArrayKey->mark_elem = curArrayKey->cur_elem; + if (ScanDirectionIsBackward(dir)) + rollarray->cur_elem = 0; + else + rollarray->cur_elem = rollarray->num_elems - 1; + skey->sk_argument = rollarray->elem_values[rollarray->cur_elem]; } + + return false; } /* - * _bt_restore_array_keys() -- Handle array keys during btrestrpos + * _bt_rewind_array_keys() -- Handle array keys during btrestrpos * - * Restore the array keys to where they were when the mark was set. + * Restore the array keys to the start of the key space for the current scan + * direction. */ void -_bt_restore_array_keys(IndexScanDesc scan) +_bt_rewind_array_keys(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool changed = false; - int i; - /* Restore each array key to its position when the mark was set */ - for (i = 0; i < so->numArrayKeys; i++) + Assert(so->advanceDir != NoMovementScanDirection); + + /* + * Restore each array key to its initial position for the current scan + * direction as of the last time the arrays advanced + */ + for (int i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; - int mark_elem = curArrayKey->mark_elem; + int first_elem_dir; - if (curArrayKey->cur_elem != mark_elem) + if (ScanDirectionIsForward(so->advanceDir)) + first_elem_dir = 0; + else + first_elem_dir = curArrayKey->num_elems - 1; + + if (curArrayKey->cur_elem != first_elem_dir) { - curArrayKey->cur_elem = mark_elem; - skey->sk_argument = curArrayKey->elem_values[mark_elem]; + curArrayKey->cur_elem = first_elem_dir; + skey->sk_argument = 
curArrayKey->elem_values[first_elem_dir]; changed = true; } } + if (changed) + _bt_update_keys_with_arraykeys(scan); + + Assert(_bt_verify_keys_with_arraykeys(scan)); + /* - * If we changed any keys, we must redo _bt_preprocess_keys. That might - * sound like overkill, but in cases with multiple keys per index column - * it seems necessary to do the full set of pushups. + * Invert the scan direction as of the last time the array keys advanced. * - * Also do this whenever the scan's set of array keys "wrapped around" at - * the end of the last primitive index scan. There won't have been a call - * to _bt_preprocess_keys from some other place following wrap around, so - * we do it for ourselves. + * This prevents _bt_steppage from fully trusting currPos.moreRight and + * currPos.moreLeft in cases where _bt_readpage/_bt_checkkeys don't get + * the opportunity to consider advancing the array keys as expected. */ - if (changed || !so->arraysStarted) - { - _bt_preprocess_keys(scan); - /* The mark should have been set on a consistent set of keys... */ - Assert(so->qual_ok); - } + if (ScanDirectionIsForward(so->advanceDir)) + so->advanceDir = BackwardScanDirection; + else + so->advanceDir = ForwardScanDirection; } +/* + * _bt_tuple_before_array_skeys() -- _bt_checkkeys array helper function + * + * Routine to determine if a continuescan=false tuple (set that way by an + * initial call to _bt_check_compare) must advance the scan's array keys. + * Only call here when _bt_check_compare already set continuescan=false. + * + * Returns true when caller passes a tuple that is < the current set of array + * keys for the most significant non-equal column/scan key (or > for backwards + * scans). This means that it cannot possibly be time to advance the array + * keys just yet. _bt_checkkeys caller should suppress its _bt_check_compare + * call, and return -- the tuple is treated as not satisfying our indexquals. 
+ * + * Returns false when caller's tuple is >= the current array keys (or <=, in + * the case of backwards scans). This means that it is now time for our + * caller to advance the array keys (unless caller broke the rules by not + * checking with _bt_check_compare before calling here). + * + * Note: advancing the array keys may be required when every attribute value + * from caller's tuple is equal to corresponding scan key/array datums. See + * _bt_advance_array_keys and its handling of inequalities for details. + * + * Note: caller passes _bt_check_compare-set sktrig value to indicate which + * scan key triggered the call. If this is for any scan key that isn't a + * required equality strategy scan key, calling here is a no-op, meaning that + * we'll invariably return false. We just accept whatever _bt_check_compare + * indicated about the scan when it involves a required inequality scan key. + * We never care about nonrequired scan keys, including equality strategy + * array scan keys (though _bt_check_compare can temporarily end the scan to + * advance their arrays in _bt_advance_array_keys, which we'll never prevent). + */ +static bool +_bt_tuple_before_array_skeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int sktrig, bool validtrig) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate->dir; + TupleDesc itupdesc = RelationGetDescr(rel); + ScanKey cur; + int ntupatts = BTreeTupleGetNAtts(tuple, rel), + ikey; + + Assert(so->numArrayKeys > 0); + Assert(so->numberOfKeys > 0); + Assert(!so->needPrimScan); + + for (cur = so->keyData + sktrig, ikey = sktrig; + ikey < so->numberOfKeys; + cur++, ikey++) + { + int attnum = cur->sk_attno; + FmgrInfo *orderproc; + Datum tupdatum; + bool tupnull; + int32 result; + + /* + * Unlike _bt_check_compare and _bt_advance_array_keys, we never deal + * with inequality strategy scan keys (even those marked required). 
We + * also don't deal with non-required equality keys -- even when they + * happen to have arrays that might need to be advanced. + * + * Note: cannot "break" here due to corner cases involving redundant + * scan keys that weren't eliminated within _bt_preprocess_keys. + */ + if (cur->sk_strategy != BTEqualStrategyNumber || + (cur->sk_flags & SK_BT_REQFWD) == 0) + continue; + + /* Required equality scan keys always required in both directions */ + Assert((cur->sk_flags & SK_BT_REQFWD) && + (cur->sk_flags & SK_BT_REQBKWD)); + + if (attnum > ntupatts) + { + Assert(!validtrig); + + /* + * When we reach a high key's truncated attribute, assume that the + * tuple attribute's value is >= the scan's equality constraint + * scan keys, forcing another _bt_advance_array_keys call. + * + * You might wonder why we don't treat truncated attributes as + * having values < our equality constraints instead; we're not + * treating the truncated attributes as having -inf values here, + * which is how things are done in _bt_compare. + * + * We're often called during finaltup prechecks, where we help our + * caller to decide whether or not it should terminate the current + * primitive index scan. Our behavior here implements a policy of + * being slightly optimistic about what will be found on the next + * page when the current primitive scan continues onto that page. + * (This is also closest to what _bt_check_compare does.) + */ + return false; + } + + tupdatum = index_getattr(tuple, attnum, itupdesc, &tupnull); + + orderproc = &so->orderProcs[attnum - 1]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + cur->sk_argument, cur); + + /* + * Does this comparison indicate that caller must _not_ advance the + * scan's arrays just yet? (This implements the linear search process + * described in _bt_advance_array_keys.) 
+	 */
+	if ((ScanDirectionIsForward(dir) && result < 0) ||
+		(ScanDirectionIsBackward(dir) && result > 0))
+		return true;
+
+	/*
+	 * Does this comparison indicate that caller should now advance the
+	 * scan's arrays?
+	 */
+	if (validtrig || result != 0)
+	{
+		Assert(result != 0);
+		return false;
+	}
+
+	/*
+	 * Inconclusive -- need to check later scan keys, too.
+	 *
+	 * This must be a finaltup precheck, or perhaps a call made from an
+	 * assertion.
+	 */
+	Assert(result == 0);
+	Assert(!validtrig);
+	}
+
+	/*
+	 * Default assumption is that caller must now advance the array keys.
+	 *
+	 * Note that we'll always end up here when sktrig corresponds to some
+	 * non-required array type scan key that _bt_check_compare saw wasn't
+	 * satisfied by caller's tuple.
+	 */
+	return false;
+}
+
+/*
+ * _bt_array_keys_remain() -- start scheduled primitive index scan?
+ *
+ * Returns true if _bt_checkkeys scheduled another primitive index scan, just
+ * as the last one ended.  Otherwise returns false, indicating that the array
+ * keys are now fully exhausted.
+ *
+ * Only call here during scans with one or more equality type array scan keys.
+ */
+bool
+_bt_array_keys_remain(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	Assert(so->numArrayKeys);
+	Assert(so->advanceDir == dir);
+
+	/*
+	 * Array keys are advanced within _bt_checkkeys when the scan reaches the
+	 * leaf level (more precisely, they're advanced when the scan reaches the
+	 * end of each distinct set of array elements).  This process avoids
+	 * repeat access to leaf pages (across multiple primitive index scans) by
+	 * advancing the scan's array keys when it allows the primitive index scan
+	 * to find nearby matching tuples (or when it eliminates ranges of array
+	 * key space that can't possibly be satisfied by any index tuple).
+	 *
+	 * _bt_checkkeys sets a simple flag variable to schedule another primitive
+	 * index scan.  This tells us what to do.
We cannot rely on _bt_first + * always reaching _bt_checkkeys, though. There are various cases where + * that won't happen. For example, if the index is completely empty, then + * _bt_first won't get as far as calling _bt_readpage/_bt_checkkeys. + * + * We also don't expect _bt_checkkeys to be reached when searching for a + * non-existent value that happens to be higher than any existing value in + * the index. No _bt_checkkeys are expected when _bt_readpage reads the + * rightmost page during such a scan -- even a _bt_checkkeys call against + * the high key won't happen. There is an analogous issue for backwards + * scans that search for a value lower than all existing index tuples. + * + * We don't actually require special handling for these cases -- we don't + * need to be explicitly instructed to _not_ perform another primitive + * index scan. This is correct for all of the cases we've listed so far, + * which all involve primitive index scans that access pages "near the + * boundaries of the key space" (the leftmost page, the rightmost page, or + * an imaginary empty leaf root page). If _bt_checkkeys cannot be reached + * by a primitive index scan for one set of array keys, it follows that it + * also won't be reached for any later set of array keys... + */ + if (!so->qual_ok) + { + /* + * ...though there is one exception: _bt_first's _bt_preprocess_keys + * call can determine that the scan's input scan keys can never be + * satisfied. That might be true for one set of array keys, but not + * the next set. + * + * Handle this by advancing the array keys incrementally ourselves. + * When this succeeds, start another primitive index scan. + */ + CHECK_FOR_INTERRUPTS(); + + Assert(!so->needPrimScan); + if (_bt_advance_array_keys_increment(scan, dir)) + return true; + + /* Array keys are now exhausted */ + } + + /* + * Has another primitive index scan been scheduled by _bt_checkkeys? 
+ */ + if (so->needPrimScan) + { + /* Yes -- tell caller to call _bt_first once again */ + so->needPrimScan = false; + if (scan->parallel_scan != NULL) + _bt_parallel_next_primitive_scan(scan); + + return true; + } + + /* + * No more primitive index scans. Terminate the top-level scan. + */ + if (scan->parallel_scan != NULL) + _bt_parallel_done(scan); + + return false; +} + +/* + * _bt_advance_array_keys() -- Advance array elements using a tuple + * + * Like _bt_check_compare, our return value indicates if tuple satisfied the + * qual (specifically our new qual). There must be a new qual whenever we're + * called (unless the top-level scan terminates). After we return, all later + * calls to _bt_check_compare will also use the same new qual (a qual with the + * newly advanced array key values that were set here by us). + * + * We'll also set pstate.continuescan for caller. When this is set to false, + * it usually just ends the ongoing primitive index scan (we'll have scheduled + * another one in passing). But when all required array keys were exhausted, + * setting pstate.continuescan=false here ends the top-level index scan (since + * no new primitive scan will have been scheduled). Most calls here will have + * us set pstate.continuescan=true, which just indicates that the scan should + * proceed onto the next tuple (just like when _bt_check_compare does it). + * + * _bt_tuple_before_array_skeys is responsible for determining if the current + * place in the scan is >= the current array keys. Calling here before that + * point will prematurely advance the array keys, leading to wrong query + * results. + * + * We're responsible for ensuring that caller's tuple is <= current/newly + * advanced required array keys once we return. We try to find an exact + * match, but failing that we'll advance the array keys to whatever set of + * array elements comes next in the key space for the current scan direction. + * Required array keys "ratchet forwards". 
 They can only advance as the scan
+ * itself advances through the index/key space.
+ *
+ * (The invariants are the same for backwards scans, except that the operators
+ * are flipped: just replace the precondition's >= operator with a <=, and the
+ * postcondition's <= operator with a >=.  In other words, just swap the
+ * precondition with the postcondition.)
+ *
+ * We also deal with "advancing" non-required arrays here.  Sometimes that'll
+ * be the sole reason for calling here.  These calls are the only exception to
+ * the general rule about always advancing required array keys (since they're
+ * the only case where we simply don't need to touch any required array, which
+ * must already be satisfied by caller's tuple).  Calls triggered by any scan
+ * key that's required in the current scan direction are strictly guaranteed
+ * to advance the required array keys (or end the top-level scan), though.
+ *
+ * Note that we deal with all required equality strategy scan keys here; it's
+ * not limited to array scan keys.  They're equality constraints for our
+ * purposes, and so are handled as degenerate single element arrays here.
+ * Obviously, they can never really advance in the way that real arrays can,
+ * but they must still affect how we advance real array scan keys, just like
+ * any other equality constraint.  We have to keep around a 3-way ORDER proc
+ * for these (just using the "=" operator won't do), since in general whether
+ * the tuple is < or > some non-array equality key might influence advancement
+ * of any of the scan's actual arrays.  The top-level scan can only terminate
+ * after it has processed the key space covered by the product of each and
+ * every equality constraint, including both non-arrays and (required) arrays.
+ *
+ * Note also that we may sometimes need to advance the array keys when the
+ * existing array keys are already an exact match for every corresponding
+ * value from caller's tuple according to _bt_check_compare.
This is how we + * deal with inequalities that are required in the current scan direction. + * They can advance the array keys here, even though they don't influence the + * initial positioning strategy within _bt_first (only inequalities required + * in the _opposite_ direction to the scan influence _bt_first in this way). + * + * As discussed already, we guarantee that the array keys will either be + * advanced such that caller's tuple is <= the new array keys in respect of + * required array keys (plus any other required equality strategy scan keys) + * when we return (unless the arrays are totally exhausted instead). The real + * guarantee is actually slightly stronger than that, though it only matters + * to scans that have required inequality strategy scan keys. The precise + * promise we make is that the array keys will always advance to the maximum + * possible extent that we can know to be safe based on caller's tuple alone. + * Note that it's just about possible that every required equality strategy + * scan key will be satisfied (or could be satisfied by advancing the array + * keys), yet we might advance the array keys _beyond_ our exactly-matching + * element values due to a still-unsatisfied inequality strategy scan key. 
 */

/*
 * NOTE(review): the function's full contract is in the header comment whose
 * closing delimiter appears just above (the rest is outside this hunk).
 * Summary of what the code below demonstrates: called after an initial
 * _bt_check_compare call stopped on scan key so->keyData[sktrig] for caller's
 * tuple.  Advances the scan's array keys (so->arrayKeyData[]) to the values
 * that best match the tuple, finalizes them via
 * _bt_update_keys_with_arraykeys, and returns whether caller's tuple matches
 * the newly-advanced qual.  May set pstate->continuescan=false and/or
 * so->needPrimScan=true to end the current primitive index scan.
 */
static bool
_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
                       IndexTuple tuple, int sktrig)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;
    ScanDirection dir = pstate->dir;
    TupleDesc   itupdesc = RelationGetDescr(rel);
    ScanKey     cur;
    int         ikey,
                arrayidx = 0,
                ntupatts = BTreeTupleGetNAtts(tuple, rel);
    bool        arrays_advanced = false,
                arrays_exhausted,
                sktrigrequired = false,
                beyond_end_advance = false,
                foundRequiredOppositeDirOnly = false,
                all_arraylike_sk_satisfied = true,
                all_required_sk_satisfied = true;

    Assert(_bt_verify_keys_with_arraykeys(scan));

    /*
     * Iterate through the scan's search-type scankeys (so->keyData[]), and
     * set input scan keys (so->arrayKeyData[]) to new array values
     */
    for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++)
    {
        BTArrayKeyInfo *array = NULL;
        ScanKey     skeyarray = NULL;
        FmgrInfo   *orderproc;
        int         attnum = cur->sk_attno;
        Datum       tupdatum;
        bool        requiredSameDir = false,
                    requiredOppositeDirOnly = false,
                    tupnull;
        int32       result;
        int         set_elem = 0;

        /*
         * Set up ORDER 3-way comparison function and array state
         */
        orderproc = &so->orderProcs[attnum - 1];
        if (cur->sk_flags & SK_SEARCHARRAY &&
            cur->sk_strategy == BTEqualStrategyNumber)
        {
            Assert(arrayidx < so->numArrayKeys);
            array = &so->arrayKeys[arrayidx++];
            skeyarray = &so->arrayKeyData[array->scan_key];
            Assert(skeyarray->sk_attno == attnum);
        }

        /* Classify the key: required in this direction, or opposite only */
        if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
            ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
            requiredSameDir = true;
        else if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
                 ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
            requiredOppositeDirOnly = true;

        /*
         * Optimization: Skip over known-satisfied scan keys
         */
        if (ikey < sktrig)
            continue;
        if (ikey == sktrig)
            sktrigrequired = requiredSameDir;

        /*
         * When we come across an inequality scan key that's required in the
         * opposite direction only, and that might affect where our scan ends,
         * remember it.  We'll only need this information when all prior
         * equality constraints are satisfied.
         */
        if (requiredOppositeDirOnly && sktrigrequired &&
            all_arraylike_sk_satisfied)
        {
            Assert(cur->sk_strategy != BTEqualStrategyNumber);
            Assert(all_required_sk_satisfied);

            foundRequiredOppositeDirOnly = true;

            continue;
        }

        /*
         * Other than that, we're not interested in scan keys that aren't
         * required in the current scan direction (unless they're non-required
         * array equality scan keys, which still need to be advanced by us)
         */
        if (!requiredSameDir && !array)
            continue;

        /*
         * Handle a required non-array scan key that the initial call to
         * _bt_check_compare indicated triggered array advancement, if any.
         *
         * The non-array scan key's strategy will be <, <=, or = during a
         * forwards scan (or any one of =, >=, or > during a backwards scan).
         * It follows that the corresponding tuple attribute's value must now
         * be either > or >= the scan key value (for backwards scans it must
         * be either < or <= that value).
         *
         * If this is a required equality strategy scan key, this is just an
         * optimization; _bt_tuple_before_array_skeys already confirmed that
         * this scan key places us ahead of caller's tuple.  There's no need
         * to repeat that work now.  (We only do comparisons of any required
         * non-array equality scan keys that come after the triggering key.)
         *
         * If this is a required inequality strategy scan key, we _must_ rely
         * on _bt_check_compare like this; it knows all the intricacies around
         * evaluating inequality strategy scan keys (e.g., row comparisons).
         * There is no simple mapping onto the opclass ORDER proc we can use.
         * But once we know that we have an unsatisfied inequality, we can
         * treat it in the same way as an unsatisfied equality at this point.
         * (We don't need to worry about later required inequalities, since
         * there can't be any after the first one.  While it's possible that
         * _bt_preprocess_keys failed to determine which of several "required"
         * scan keys for this same attribute and same scan direction are truly
         * required, that changes nothing, really.  Even in this corner case,
         * we can safely assume that any other "required" inequality that is
         * still satisfied must have been redundant all along.)
         *
         * The arrays advance correctly in both cases because both involve the
         * scan reaching the end of the key space for a higher order array key
         * (or some distinct set of higher-order array keys, taken together).
         * The only real difference is that in the equality case the end is
         * "strictly at the end of an array key", whereas in the inequality
         * case it's "within an array key".  Either way we'll increment higher
         * order arrays by one increment (the next-highest array might need to
         * roll over to the next-next highest array in turn, and so on).
         *
         * See below for a full explanation of "beyond end" advancement.
         */
        if (ikey == sktrig && !array)
        {
            Assert(requiredSameDir);
            Assert(!arrays_advanced);

            beyond_end_advance = true;
            all_arraylike_sk_satisfied = all_required_sk_satisfied = false;

            continue;
        }

        /*
         * Nothing for us to do with a required inequality strategy scan key
         * that wasn't the one that _bt_check_compare stopped on
         */
        if (cur->sk_strategy != BTEqualStrategyNumber)
            continue;

        /*
         * Here we perform steps for all array scan keys after a required
         * array scan key whose binary search triggered "beyond end of array
         * element" array advancement due to encountering a tuple attribute
         * value > the closest matching array key (or < for backwards scans).
         *
         * See below for a full explanation of "beyond end" advancement.
         *
         * NB: We must do this for all arrays -- not just required arrays.
         * Otherwise the incremental array advancement step won't "carry".
         */
        if (beyond_end_advance)
        {
            int         final_elem_dir;

            /* "final" element is the last one in the scan's direction */
            if (ScanDirectionIsBackward(dir) || !array)
                final_elem_dir = 0;
            else
                final_elem_dir = array->num_elems - 1;

            if (array && array->cur_elem != final_elem_dir)
            {
                array->cur_elem = final_elem_dir;
                skeyarray->sk_argument = array->elem_values[final_elem_dir];
                arrays_advanced = true;
            }

            continue;
        }

        /*
         * Here we perform steps for any required scan keys after the first
         * required scan key whose tuple attribute was < the closest matching
         * array key when we dealt with it (or > for backwards scans).
         *
         * This earlier required array key already puts us ahead of caller's
         * tuple in the key space (for the current scan direction).  We must
         * make sure that subsequent lower-order array keys do not put us too
         * far ahead (ahead of tuples that have yet to be seen by our caller).
         * For example, when a tuple "(a, b) = (42, 5)" advances the array
         * keys on "a" from 40 to 45, we must also set "b" to whatever the
         * first array element for "b" is.  It would be wrong to allow "b" to
         * be set based on the tuple value.
         *
         * Perform the same steps with truncated high key attributes.  You can
         * think of this as a "binary search" for the element closest to the
         * value -inf.  Again, the arrays must never get ahead of the scan.
         */
        if (!all_arraylike_sk_satisfied || attnum > ntupatts)
        {
            int         first_elem_dir;

            /* "first" element is the earliest one in the scan's direction */
            if (ScanDirectionIsForward(dir) || !array)
                first_elem_dir = 0;
            else
                first_elem_dir = array->num_elems - 1;

            if (array && array->cur_elem != first_elem_dir)
            {
                array->cur_elem = first_elem_dir;
                skeyarray->sk_argument = array->elem_values[first_elem_dir];
                arrays_advanced = true;
            }

            /*
             * Truncated -inf value will always be assumed to satisfy any
             * required equality scan keys according to _bt_check_compare.
             * This avoids a later _bt_check_compare recheck.
             *
             * Deliberately don't unset all_required_sk_satisfied here.  This
             * follows _bt_tuple_before_array_skeys's example.  We don't want
             * to treat -inf as a non-match when making a final decision on
             * whether to move to the next page.  This implements a policy of
             * being optimistic about finding real matches for lower-order
             * required attributes that are truncated to -inf in finaltup.
             */
            all_arraylike_sk_satisfied = false;

            continue;
        }

        /*
         * Search in scankey's array for the corresponding tuple attribute
         * value from caller's tuple
         */
        tupdatum = index_getattr(tuple, attnum, itupdesc, &tupnull);

        if (array)
        {
            bool        ratchets = (requiredSameDir && !arrays_advanced);

            /*
             * Binary search for closest match that's available from the array
             */
            set_elem = _bt_binsrch_array_skey(orderproc, ratchets, dir,
                                              tupdatum, tupnull,
                                              array, cur, &result);

            /*
             * Required arrays only ever ratchet forwards (backwards).
             *
             * This condition makes it safe for binary searches to skip over
             * array elements that the scan must already be ahead of by now.
             * That is strictly an optimization.  Our assertion verifies that
             * the condition holds, which doesn't depend on the optimization.
             */
            Assert(!ratchets ||
                   ((ScanDirectionIsForward(dir) && set_elem >= array->cur_elem) ||
                    (ScanDirectionIsBackward(dir) && set_elem <= array->cur_elem)));
            Assert(set_elem >= 0 && set_elem < array->num_elems);
        }
        else
        {
            Assert(requiredSameDir);

            /*
             * This is a required non-array equality strategy scan key, which
             * we'll treat as a degenerate single value array.
             *
             * We really do need an ORDER proc for this (we can't just rely on
             * the scan key's equality operator).  We need to know whether the
             * tuple as a whole is either behind or ahead of (or covered by)
             * the key space represented by our required arrays as a group.
             *
             * This scan key's imaginary "array" can't really advance, but it
             * can still roll over like any other array.  (Actually, this is
             * no different to real single value arrays, which never advance
             * without rolling over -- they can never truly advance, either.)
             */
            result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
                                            cur->sk_argument, cur);
        }

        /*
         * Consider "beyond end of array element" array advancement.
         *
         * When the tuple attribute value is > the closest matching array key
         * (or < in the backwards scan case), we need to ratchet this array
         * forward (backward) by one increment, so that caller's tuple ends up
         * being < final array value instead (or > final array value instead).
         * This process has to work for all of the arrays, not just this one:
         * it must "carry" to higher-order arrays when the set_elem that we
         * just found happens to be the final one for the scan's direction.
         * Incrementing (decrementing) set_elem itself isn't good enough.
         *
         * Our approach is to provisionally use set_elem as if it was an exact
         * match now, then set each later/less significant array to whatever
         * its final element is.  Once outside the loop we'll then "increment
         * this array's set_elem" by calling _bt_advance_array_keys_increment.
         * That way the process rolls over to higher order arrays as needed.
         *
         * Under this scheme any required arrays only ever ratchet forwards
         * (or backwards), and always do so to the maximum possible extent
         * that we can know will be safe without seeing the scan's next tuple.
         * We don't need any special handling of required equality scan keys
         * that lack a real array for us to advance, either.  It also won't
         * matter if all of the scan's real arrays are non-required arrays.
         */
        if (requiredSameDir &&
            ((ScanDirectionIsForward(dir) && result > 0) ||
             (ScanDirectionIsBackward(dir) && result < 0)))
            beyond_end_advance = true;

        /*
         * Also track whether all relevant attributes from caller's tuple will
         * be equal to the scan's array keys once we're done with it
         */
        if (result != 0)
        {
            all_arraylike_sk_satisfied = false;
            if (requiredSameDir)
                all_required_sk_satisfied = false;
        }

        /*
         * Optimization: If this call was triggered by a non-required array,
         * and we know that tuple won't satisfy the qual, we give up right
         * away.  This often avoids advancing the array keys, which will save
         * wasted cycles from calling _bt_update_keys_with_arraykeys below.
         */
        if (!all_arraylike_sk_satisfied && !sktrigrequired)
        {
            Assert(!requiredSameDir && !foundRequiredOppositeDirOnly);
            Assert(!beyond_end_advance);

            break;
        }

        /* Advance array keys, even if set_elem isn't an exact match */
        if (array && array->cur_elem != set_elem)
        {
            array->cur_elem = set_elem;
            skeyarray->sk_argument = array->elem_values[set_elem];
            arrays_advanced = true;
        }
    }

    /*
     * Consider if we need to advance the array keys incrementally to finish
     * off "beyond end of array element" array advancement.  This is the only
     * way that the array keys can be exhausted, which is the only way that
     * the top-level index scan can be terminated here by us.
     */
    arrays_exhausted = false;
    if (beyond_end_advance)
    {
        /* Non-required scan keys never exhaust arrays/end top-level scan */
        Assert(sktrigrequired && !all_required_sk_satisfied);

        if (!_bt_advance_array_keys_increment(scan, dir))
            arrays_exhausted = true;
        else
            arrays_advanced = true;
    }

    if (arrays_advanced)
    {
        /*
         * We advanced the array keys.  Finalize everything by performing an
         * in-place update of the scan's search-type scan keys.
         *
         * If we missed this final step then any call to _bt_check_compare
         * would use stale array keys until such time as _bt_preprocess_keys
         * was once again called by _bt_first.
         */
        _bt_update_keys_with_arraykeys(scan);
        so->advanceDir = dir;

        /*
         * If any required array keys were advanced, be prepared to recheck
         * the final tuple against the new array keys (as an optimization)
         */
        if (sktrigrequired)
            pstate->finaltupchecked = false;
    }

    /*
     * If the array keys are now exhausted, end the top-level index scan
     */
    Assert(!so->needPrimScan);
    Assert(_bt_verify_keys_with_arraykeys(scan));
    if (arrays_exhausted)
    {
        Assert(sktrigrequired && !all_required_sk_satisfied);

        pstate->continuescan = false;

        /* Caller's tuple can't match new qual (if any), either */
        return false;
    }

    /*
     * Postcondition assertions (see header comments for a full explanation).
     *
     * Tuple must now be <= current/newly advanced required array keys.  Same
     * goes for other required equality type scan keys, which are "degenerate
     * single value arrays" for our purposes.  (As usual the rule is the same
     * for backwards scans once the operators are flipped around.)
     *
     * Every call here is guaranteed to advance (or exhaust) all required
     * arrays, with the sole exception of calls _bt_check_compare triggers
     * when it encounters an unsatisfied non-required array scan key.
     */
    Assert(_bt_tuple_before_array_skeys(scan, pstate, tuple, 0, false) ==
           !all_required_sk_satisfied);
    Assert(arrays_advanced || !sktrigrequired);
    Assert(sktrigrequired || all_required_sk_satisfied);

    /*
     * The array keys aren't exhausted, so provisionally assume that the
     * current primitive index scan will continue
     */
    pstate->continuescan = true;

    /*
     * Does caller's tuple now match the new qual?  Call _bt_check_compare a
     * second time to find out (unless it's already clear that it can't).
     */
    if (all_arraylike_sk_satisfied && arrays_advanced)
    {
        bool        continuescan;
        int         insktrig = sktrig + 1;

        if (likely(_bt_check_compare(dir, so, tuple, ntupatts, itupdesc,
                                     so->numArrayKeys, &continuescan,
                                     &insktrig, false, false)))
            return true;

        /*
         * Handle inequalities marked required in the current scan direction.
         *
         * It's just about possible that our _bt_check_compare call indicates
         * that the scan should be terminated due to an unsatisfied inequality
         * that wasn't initially recognized as such by us.  Handle this by
         * calling ourselves recursively while indicating that the trigger is
         * now the inequality that we missed first time around.
         *
         * Note: we only need to do this in cases where the initial call to
         * _bt_check_compare (that led to calling here) gave up upon finding
         * an unsatisfied required equality/array scan key before it could
         * reach the inequality.  The second _bt_check_compare call took place
         * after the array keys were advanced (to array keys that definitely
         * match the tuple), so it can't have been overlooked a second time.
         *
         * Note: this is useful because we won't have to wait until the next
         * tuple to advance the array keys a second time (to values that'll
         * put the scan ahead of this tuple).  Handling this ourselves isn't
         * truly required.  But it avoids complicating our contract.  The only
         * alternative is to allow an awkward exception to the general rule
         * (the rule about always advancing the arrays to the maximum possible
         * extent that caller's tuple can safely allow).
         */
        if (!continuescan)
        {
            ScanKey     inequal PG_USED_FOR_ASSERTS_ONLY = so->keyData + insktrig;

            Assert(sktrigrequired && all_required_sk_satisfied);
            Assert(inequal->sk_strategy != BTEqualStrategyNumber);
            Assert(((inequal->sk_flags & SK_BT_REQFWD) &&
                    ScanDirectionIsForward(dir)) ||
                   ((inequal->sk_flags & SK_BT_REQBKWD) &&
                    ScanDirectionIsBackward(dir)));

            return _bt_advance_array_keys(scan, pstate, tuple, insktrig);
        }
    }

    /*
     * Handle inequalities marked required in the opposite scan direction.
     *
     * If we advanced the array keys (which is now certain except in the case
     * where we only needed to deal with non-required arrays), it's possible
     * that the scan is now at the start of "matching" tuples (at least by the
     * definition used by _bt_tuple_before_array_skeys), but is nevertheless
     * still many leaf pages before the position that _bt_first is capable of
     * repositioning the scan to.
     *
     * This can happen when we have an inequality scan key required in the
     * opposite direction only, that's less significant than the scan key that
     * triggered array advancement during our initial _bt_check_compare call.
     * If even finaltup doesn't satisfy this less significant inequality scan
     * key once we temporarily flip the scan direction, that indicates that
     * even finaltup is before the _bt_first-wise initial position for these
     * newly advanced array keys.
     */
    if (all_required_sk_satisfied && foundRequiredOppositeDirOnly &&
        pstate->finaltup)
    {
        int         nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
        ScanDirection flipped = -dir;
        bool        continuescan;
        int         opsktrig = 0;

        Assert(sktrigrequired && arrays_advanced);

        /* Recheck finaltup with the scan direction temporarily flipped */
        _bt_check_compare(flipped, so, pstate->finaltup, nfinaltupatts,
                          itupdesc, so->numArrayKeys, &continuescan,
                          &opsktrig, false, false);

        if (!continuescan && opsktrig > sktrig)
        {
            ScanKey     inequal = so->keyData + opsktrig;

            if (((inequal->sk_flags & SK_BT_REQFWD) &&
                 ScanDirectionIsForward(flipped)) ||
                ((inequal->sk_flags & SK_BT_REQBKWD) &&
                 ScanDirectionIsBackward(flipped)))
            {
                Assert(inequal->sk_strategy != BTEqualStrategyNumber);

                /*
                 * Continuing the ongoing primitive index scan as-is risks
                 * uselessly scanning a huge number of leaf pages from before
                 * the page that we'll quickly jump to by descending the index
                 * anew.
                 *
                 * Play it safe: start a new primitive index scan.  _bt_first
                 * is guaranteed to at least move the scan to the next leaf
                 * page.
                 */
                pstate->continuescan = false;
                so->needPrimScan = true;

                return false;
            }
        }

        /*
         * Caller's tuple might still be before the _bt_first-wise start of
         * matches for the new array keys, but at least finaltup is at or
         * ahead of that position.  That's good enough; continue as-is.
         */
    }

    /*
     * Caller's tuple is < the newly advanced array keys (or > when this is a
     * backwards scan).
     *
     * It's possible that later tuples will also turn out to have values that
     * are still < the now-current array keys (or > the current array keys).
     * Our caller will handle this by performing what amounts to a linear
     * search of the page, implemented by calling _bt_check_compare and then
     * _bt_tuple_before_array_skeys for each tuple.  Our caller should locate
     * the first tuple >= the array keys before long (or locate the first
     * tuple <= the array keys before long).
     *
     * This approach has various advantages over a binary search of the page.
     * We expect that our caller will either quickly discover the next tuple
     * covered by the current array keys, or quickly discover that it needs
     * another primitive index scan (using its finaltup precheck) instead.
     * Either way, a binary search is unlikely to beat a simple linear search.
     *
     * It's also not clear that a binary search will be any faster when we
     * really do have to search through hundreds of tuples beyond this one.
     * Several binary searches (one per array advancement) might be required
     * while reading through a single page.  Our linear search is structured
     * as one continuous search that just advances the arrays in passing, and
     * that only needs a little extra logic to deal with inequality scan keys.
     */
    if (!all_required_sk_satisfied && tuple == pstate->finaltup)
    {
        /*
         * There is one exception: when the page's final tuple advances the
         * array keys without exactly matching keys for any required arrays,
         * start a new primitive index scan -- don't let our caller continue
         * to the next leaf page.
         *
         * In the forward scan case, finaltup is the page high key.  We don't
         * insist on having an exact match for truncated -inf attributes.
         * They're never exactly equal to any real array key, but it makes
         * sense to be optimistic about finding matches on the next page.
         */
        Assert(sktrigrequired && arrays_advanced);

        pstate->continuescan = false;
        so->needPrimScan = true;
    }

    /* In any case, this indextuple doesn't match the qual */
    return false;
}
* + * Index scans with array keys need to be able to advance each array's keys + * and make them the current search-type scan keys without calling here. They + * expect to be able to call _bt_update_keys_with_arraykeys instead. We need + * to be careful about that case when we determine redundancy; equality quals + * must not be eliminated as redundant on the basis of array input keys that + * might change before another call here can take place. + * + * Note, however, that the presence of an array scan key doesn't affect how we + * determine if index quals are contradictory. Contradictory qual scans move + * on to the next primitive index scan right away, by incrementing the scan's + * array keys once control reaches _bt_array_keys_remain. There won't be a + * call to _bt_update_keys_with_arraykeys, so there's nothing for us to break. + * * Row comparison keys are currently also treated without any smarts: * we just transfer them into the preprocessed array without any * editorialization. We can treat them the same as an ordinary inequality @@ -887,8 +2152,11 @@ _bt_preprocess_keys(IndexScanDesc scan) so->qual_ok = false; return; } - /* else discard the redundant non-equality key */ - xform[j] = NULL; + else if (!(eq->sk_flags & SK_SEARCHARRAY)) + { + /* else discard the redundant non-equality key */ + xform[j] = NULL; + } } /* else, cannot determine redundancy, keep both keys */ } @@ -978,6 +2246,22 @@ _bt_preprocess_keys(IndexScanDesc scan) continue; } + /* + * Is this an array scan key that _bt_preprocess_array_keys merged + * with some earlier array key during its initial preprocessing pass? + */ + if (cur->sk_flags & SK_BT_RDDNARRAY) + { + /* + * key is redundant for this primitive index scan (and will be + * redundant during all subsequent primitive index scans) + */ + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(j == (BTEqualStrategyNumber - 1)); + Assert(so->numArrayKeys > 0); + continue; + } + /* have we seen one of these before? 
*/ if (xform[j] == NULL) { @@ -991,7 +2275,26 @@ _bt_preprocess_keys(IndexScanDesc scan) &test_result)) { if (test_result) - xform[j] = cur; + { + if (j == (BTEqualStrategyNumber - 1) && + ((xform[j]->sk_flags & SK_SEARCHARRAY) || + (cur->sk_flags & SK_SEARCHARRAY))) + { + /* + * Must never replace an = array operator ourselves, + * nor can we ever fail to remember an = array + * operator. _bt_update_keys_with_arraykeys expects + * this. + */ + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + else + xform[j] = cur; + } else if (j == (BTEqualStrategyNumber - 1)) { /* key == a && key == b, but a != b */ @@ -1019,6 +2322,95 @@ _bt_preprocess_keys(IndexScanDesc scan) so->numberOfKeys = new_numberOfKeys; } +/* + * _bt_update_keys_with_arraykeys() -- Finalize advancing array keys + * + * Transfers newly advanced array keys that were set in "so->arrayKeyData[]" + * over to corresponding "so->keyData[]" scan keys. Reuses most of the work + * that took place within _bt_preprocess_keys, only changing the array keys. + * + * It's safe to call here while holding a buffer lock, which isn't something + * that _bt_preprocess_keys can guarantee. 
 */
static void
_bt_update_keys_with_arraykeys(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ScanKey     cur;
    int         ikey,
                arrayidx = 0;

    Assert(so->qual_ok);

    /*
     * Walk the search-type scan keys in so->keyData[] in step with the
     * scan's arrays (so->arrayKeys[] / so->arrayKeyData[]), copying each
     * array's current sk_argument into its matching output scan key
     */
    for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++)
    {
        BTArrayKeyInfo *array;
        ScanKey     skeyarray;

        /* Redundant-array keys must never survive into so->keyData[] */
        Assert((cur->sk_flags & SK_BT_RDDNARRAY) == 0);

        /* Just update equality array scan keys */
        if (cur->sk_strategy != BTEqualStrategyNumber ||
            !(cur->sk_flags & SK_SEARCHARRAY))
            continue;

        array = &so->arrayKeys[arrayidx++];
        skeyarray = &so->arrayKeyData[array->scan_key];

        /* Update the scan key's argument */
        Assert(cur->sk_attno == skeyarray->sk_attno);
        cur->sk_argument = skeyarray->sk_argument;
    }

    /* Every array must have been consumed exactly once */
    Assert(arrayidx == so->numArrayKeys);
}

/*
 * Verify that the scan's "so->arrayKeyData[]" scan keys are in agreement with
 * the current "so->keyData[]" search-type scan keys.  Used within assertions.
 *
 * Returns false (rather than asserting directly) so that callers can wrap
 * this in their own Assert(); also returns false when so->qual_ok is unset.
 */
#ifdef USE_ASSERT_CHECKING
static bool
_bt_verify_keys_with_arraykeys(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ScanKey     cur;
    int         ikey,
                arrayidx = 0;

    if (!so->qual_ok)
        return false;

    for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++)
    {
        BTArrayKeyInfo *array;
        ScanKey     skeyarray;

        /* Only equality-strategy array scan keys participate */
        if (cur->sk_strategy != BTEqualStrategyNumber ||
            !(cur->sk_flags & SK_SEARCHARRAY))
            continue;

        array = &so->arrayKeys[arrayidx++];
        skeyarray = &so->arrayKeyData[array->scan_key];

        /* Verify so->arrayKeyData[] input key has expected sk_argument */
        if (skeyarray->sk_argument != array->elem_values[array->cur_elem])
            return false;

        /* Verify so->arrayKeyData[] input key agrees with output key */
        if (cur->sk_attno != skeyarray->sk_attno)
            return false;
        if (cur->sk_argument != skeyarray->sk_argument)
            return false;
    }

    /* Every array must have been matched to exactly one output key */
    if (arrayidx != so->numArrayKeys)
        return false;

    return true;
}
#endif

/*
 * Compare two scankey values using a specified
operator. * @@ -1352,60 +2744,211 @@ _bt_mark_scankey_required(ScanKey skey) * * Return true if so, false if not. If the tuple fails to pass the qual, * we also determine whether there's any need to continue the scan beyond - * this tuple, and set *continuescan accordingly. See comments for + * this tuple, and set pstate.continuescan accordingly. See comments for * _bt_preprocess_keys(), above, about how this is done. * - * Forward scan callers can pass a high key tuple in the hopes of having - * us set *continuescan to false, and avoiding an unnecessary visit to - * the page to the right. + * Forward scan callers call with a high key tuple last in the hopes of having + * us set pstate.continuescan to false, and avoiding an unnecessary visit to + * the page to the right. Pass finaltup=true for these high key calls. + * Backwards scan callers shouldn't do this, but should still let us know + * which tuple is last by passing finaltup=true for the final non-pivot tuple + * (the non-pivot tuple at page offset number one). + * + * Callers with equality strategy array scan keys must set up page state that + * helps us know when to start or stop primitive index scans on their behalf. + * The finaltup tuple should be stashed in pstate.finaltup, so we don't have + * to wait until the finaltup call to be able to see what's up with the page. + * + * Advances the scan's array keys in passing when required. Note that we rely + * on _bt_readpage calling here in page offset number order (for the current + * scan direction). Any other order confuses array advancement. * * scan: index scan descriptor (containing a search-type scankey) + * pstate: Page level input and output parameters * tuple: index tuple to test + * finaltup: Is tuple the final one we'll be called with for this page? 
* tupnatts: number of attributes in tupnatts (high key may be truncated) - * dir: direction we are scanning in - * continuescan: output parameter (will be set correctly in all cases) - * continuescanPrechecked: indicates that *continuescan flag is known to + * continuescanPrechecked: indicates that continuescan flag is known to * be true for the last item on the page * haveFirstMatch: indicates that we already have at least one match * in the current page */ bool -_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan, +_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool finaltup, int tupnatts, bool continuescanPrechecked, bool haveFirstMatch) { - TupleDesc tupdesc; - BTScanOpaque so; - int keysz; - int ikey; - ScanKey key; + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int numArrayKeys = so->numArrayKeys; + int ikey = 0; + bool res; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + Assert(!numArrayKeys || so->advanceDir == pstate->dir); + Assert(!so->needPrimScan); + + res = _bt_check_compare(pstate->dir, so, tuple, tupnatts, tupdesc, + numArrayKeys, &pstate->continuescan, &ikey, + continuescanPrechecked, haveFirstMatch); + + /* + * Only one _bt_check_compare call is required in the common case where + * there are no equality-type array scan keys. Otherwise we can only + * accept _bt_check_compare's answer unreservedly when it didn't set + * continuescan=false. + */ + if (!numArrayKeys || pstate->continuescan) + return res; + + /* + * _bt_check_compare call set continuescan=false in the presence of + * equality type array keys. + * + * While we might really need to end the top-level index scan, most of the + * time this just means that the scan needs to reconsider its array keys. 
+ */ + if (_bt_tuple_before_array_skeys(scan, pstate, tuple, ikey, true)) + { + /* + * Current tuple is < the current array scan keys/equality constraints + * (or > in the backward scan case). Don't need to advance the array + * keys. Must decide whether to start a new primitive scan instead. + * + * If this tuple isn't the finaltup for the page, then recheck the + * finaltup stashed in pstate as an optimization. That allows us to + * quit scanning this page early when it's clearly hopeless (we don't + * need to wait for the finaltup call to give up on a primitive scan). + */ + if (finaltup || (!pstate->finaltupchecked && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, pstate, + pstate->finaltup, + 0, false))) + { + /* + * Give up on the ongoing primitive index scan. + * + * Even the final tuple (the high key for forward scans, or the + * tuple from page offset number 1 for backward scans) is before + * the current array keys. That strongly suggests that continuing + * this primitive scan would be less efficient than starting anew. + * + * See also: _bt_advance_array_keys's handling of the case where + * finaltup itself advances the array keys to non-matching values. + */ + pstate->continuescan = false; + + /* + * Set up a new primitive index scan that will reposition the + * top-level scan to the first leaf page whose key space is + * covered by our array keys. The top-level scan will "skip" a + * part of the index that can only contain non-matching tuples. + * + * Note: the next primitive index scan is guaranteed to land on + * some later leaf page (ideally it won't be this page's sibling). + * It follows that the top-level scan can never access the same + * leaf page more than once (unless the scan changes direction or + * btrestrpos is called). btcostestimate relies on this. + */ + so->needPrimScan = true; + } + else + { + /* + * Stick with the ongoing primitive index scan, for now (override + * _bt_check_compare's suggestion that we end the scan). 
+ * + * Note: we will end up here again and again given a group of + * tuples > the previous array keys and < the now-current keys + * (though only after an initial finaltup precheck determined that + * this page definitely covers key space from both array keysets). + * In effect, we perform a linear search of the page's remaining + * unscanned tuples every time the arrays advance past the key + * space of the scan's then-current tuple. + */ + pstate->continuescan = true; + + /* + * Our finaltup precheck determined that it is >= the current keys + * (though the _current_ tuple is still < the current array keys). + * + * Remember that fact in pstate now. This avoids wasting cycles + * on repeating the same precheck step (checking the same finaltup + * against the same array keys) during later calls here for later + * tuples from this same leaf page. + */ + pstate->finaltupchecked = true; + } + + /* In any case, this indextuple doesn't match the qual */ + return false; + } + + /* + * Caller's tuple is >= the current set of array keys and other equality + * constraint scan keys (or <= if this is a backwards scans). It's now + * clear that we _must_ advance any required array keys in lockstep with + * the scan (unless the required array keys become exhausted instead, or + * unless the ikey trigger corresponds to a non-required array scan key). + * + * Note: we might even advance the required arrays when all existing keys + * are already equal to the values from the tuple at this point. See the + * comments above _bt_advance_array_keys about required-inequality-driven + * array advancement. + * + * Note: we _won't_ advance any required arrays when the ikey/trigger scan + * key corresponds to a non-required array found to be unsatisfied by the + * current keys. (We might not even "advance" the non-required array.) + */ + return _bt_advance_array_keys(scan, pstate, tuple, ikey); +} + +/* + * Test whether an indextuple satisfies current scan condition. 
+ * + * Return true if so, false if not. If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction to + * pass the qual with the current set of array keys. + * + * This is a subroutine for _bt_checkkeys. It is written with the assumption + * that reaching the end of each distinct set of array keys terminates the + * ongoing primitive index scan. It is up to our caller (which has more high + * level context than us) to override that initial determination when it makes + * more sense to advance the array keys and continue with further tuples from + * the same leaf page. + */ +static bool +_bt_check_compare(ScanDirection dir, BTScanOpaque so, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int numArrayKeys, bool *continuescan, int *ikey, + bool continuescanPrechecked, bool haveFirstMatch) +{ + ScanKey key; + int keysz; + + Assert(!numArrayKeys || !continuescanPrechecked); *continuescan = true; /* default assumption */ - - tupdesc = RelationGetDescr(scan->indexRelation); - so = (BTScanOpaque) scan->opaque; keysz = so->numberOfKeys; - for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) + for (key = so->keyData + *ikey; *ikey < keysz; key++, (*ikey)++) { Datum datum; bool isNull; Datum test; bool requiredSameDir = false, - requiredOppositeDir = false; + requiredOppositeDirOnly = false; /* - * Check if the key is required for ordered scan in the same or - * opposite direction. Save as flag variables for future usage. 
+ * Check if the key is required in the current scan direction, in the + * opposite scan direction _only_, or in neither direction */ if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) - requiredOppositeDir = true; + requiredOppositeDirOnly = true; /* * If the caller told us the *continuescan flag is known to be true @@ -1423,7 +2966,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * Both cases above work except for the row keys, where NULLs could be * found in the middle of matching values. */ - if ((requiredSameDir || (requiredOppositeDir && haveFirstMatch)) && + if ((requiredSameDir || (requiredOppositeDirOnly && haveFirstMatch)) && !(key->sk_flags & SK_ROW_HEADER) && continuescanPrechecked) continue; @@ -1435,7 +2978,6 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. */ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1525,12 +3067,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, } /* - * Apply the key-checking function. When the key is required for the - * opposite direction scan, it must be already satisfied as soon as - * there is already match on the page. Except for the NULLs checking, - * which have already done above. + * Apply the key checking function. When the key is required for + * opposite-direction scans it must be an inequality satisfied by + * _bt_first(), barring NULLs, which we just checked a moment ago. 
+ * + * (Also can't apply this optimization with scans that use arrays, + * since _bt_advance_array_keys() sometimes allows the scan to see a + * few tuples from before the would-be _bt_first() starting position + * for the scan's just-advanced array keys.) + * + * Even required equality quals (that can't use this optimization due + * to being required in both scan directions) rely on the assumption + * that _bt_first() will always use the quals for initial positioning + * purposes. We stop the scan as soon as any required equality qual + * fails, so it had better only happen at the end of equal tuples in + * the current scan direction (never at the start of equal tuples). + * See comments in _bt_first(). + * + * (The required equality quals issue also has specific implications + * for scans that use arrays. They sometimes perform a linear search + * of remaining unscanned tuples, forcing the primitive index scan to + * continue until it locates tuples >= the scan's new array keys.) */ - if (!(requiredOppositeDir && haveFirstMatch)) + if (!(requiredOppositeDirOnly && haveFirstMatch) || numArrayKeys) { test = FunctionCall2Coll(&key->sk_func, key->sk_collation, datum, key->sk_argument); @@ -1548,15 +3107,25 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will * pass, either. - * - * Note: because we stop the scan as soon as any required equality - * qual fails, it is critical that equality quals be used for the - * initial positioning in _bt_first() when they are available. See - * comments in _bt_first(). */ if (requiredSameDir) *continuescan = false; + /* + * Always set continuescan=false for equality-type array keys that + * don't pass -- even for an array scan key not marked required. + * + * A non-required scan key (array or otherwise) can never actually + * terminate the scan. 
It's just convenient for callers to treat + * continuescan=false as a signal that it might be time to advance + * the array keys, independent of whether they're required or not. + * (Even setting continuescan=false with a required scan key won't + * usually end a scan that uses arrays.) + */ + if (numArrayKeys && (key->sk_flags & SK_SEARCHARRAY) && + key->sk_strategy == BTEqualStrategyNumber) + *continuescan = false; + /* * In any case, this indextuple doesn't match the qual. */ @@ -1575,7 +3144,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * it's not possible for any future tuples in the current scan direction * to pass the qual. * - * This is a subroutine for _bt_checkkeys, which see for more info. + * This is a subroutine for _bt_checkkeys/_bt_check_compare. */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, @@ -1604,7 +3173,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. 
*/ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 32c6a8bbd..772c294f5 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop); + bool *skip_nonnative_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, * index AM supports them natively, we should just include them in simple * index paths. If not, we should exclude them while building simple index * paths, and then make a separate attempt to include them in bitmap paths. - * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr - * quals so as to create ordered paths. */ static void get_index_paths(PlannerInfo *root, RelOptInfo *rel, @@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; - bool skip_lower_saop = false; ListCell *lc; /* * Build simple index paths using the clauses. Allow ScalarArrayOpExpr - * clauses only if the index AM supports them natively, and skip any such - * clauses for index columns after the first (so that we produce ordered - * paths if possible). + * clauses only if the index AM supports them natively. 
*/ indexpaths = build_index_paths(root, rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop, - &skip_lower_saop); - - /* - * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM - * that supports them, then try again including those clauses. This will - * produce paths with more selectivity but no ordering. - */ - if (skip_lower_saop) - { - indexpaths = list_concat(indexpaths, - build_index_paths(root, rel, - index, clauses, - index->predOK, - ST_ANYSCAN, - &skip_nonnative_saop, - NULL)); - } + &skip_nonnative_saop); /* * Submit all the ones that can form plain IndexScan plans to add_path. (A @@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, - NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, * to true if we found any such clauses (caller must initialize the variable * to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses. * - * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for - * non-first index columns, and we set *skip_lower_saop to true if we found - * any such clauses (caller must initialize the variable to false). If it's - * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will - * result in considering the scan's output to be unordered. 
- * * 'rel' is the index's heap relation * 'index' is the index for which we want to generate paths * 'clauses' is the collection of indexable clauses (IndexClause nodes) * 'useful_predicate' indicates whether the index has a useful predicate * 'scantype' indicates whether we need plain or bitmap scan support * 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't - * 'skip_lower_saop' indicates whether to accept non-first-column SAOP */ static List * build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop) + bool *skip_nonnative_saop) { List *result = NIL; IndexPath *ipath; @@ -848,7 +816,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; - bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; @@ -880,19 +847,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * on by btree and possibly other places.) The list can be empty, if the * index AM allows that. * - * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr - * index clause for a non-first index column. This prevents us from - * assuming that the scan result is ordered. (Actually, the result is - * still ordered if there are equality constraints for all earlier - * columns, but it seems too expensive and non-modular for this code to be - * aware of that refinement.) - * * We also build a Relids set showing which outer rels are required by the * selected clauses. Any lateral_relids are included in that, but not * otherwise accounted for. 
*/ index_clauses = NIL; - found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -903,30 +862,20 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexClause *iclause = (IndexClause *) lfirst(lc); RestrictInfo *rinfo = iclause->rinfo; - /* We might need to omit ScalarArrayOpExpr clauses */ - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + /* + * We might need to omit ScalarArrayOpExpr clauses when index AM + * lacks native support + */ + if (!index->amsearcharray && IsA(rinfo->clause, ScalarArrayOpExpr)) { - if (!index->amsearcharray) + if (skip_nonnative_saop) { - if (skip_nonnative_saop) - { - /* Ignore because not supported by index */ - *skip_nonnative_saop = true; - continue; - } - /* Caller had better intend this only for bitmap scan */ - Assert(scantype == ST_BITMAPSCAN); - } - if (indexcol > 0) - { - if (skip_lower_saop) - { - /* Caller doesn't want to lose index ordering */ - *skip_lower_saop = true; - continue; - } - found_lower_saop_clause = true; + /* Ignore because not supported by index */ + *skip_nonnative_saop = true; + continue; } + /* Caller had better intend this only for bitmap scan */ + Assert(scantype == ST_BITMAPSCAN); } /* OK to include this clause */ @@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 2. Compute pathkeys describing index's ordering, if any, then see how * many of them are actually useful for this query. This is not relevant - * if we are only trying to build bitmap indexscans, nor if we have to - * assume the scan is unordered. + * if we are only trying to build bitmap indexscans. 
*/ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && - !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, - NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index cea777e9d..47de61da1 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6557,8 +6557,6 @@ genericcostestimate(PlannerInfo *root, double numIndexTuples; double spc_random_page_cost; double num_sa_scans; - double num_outer_scans; - double num_scans; double qual_op_cost; double qual_arg_cost; List *selectivityQuals; @@ -6573,7 +6571,7 @@ genericcostestimate(PlannerInfo *root, /* * Check for ScalarArrayOpExpr index quals, and estimate the number of - * index scans that will be performed. + * primitive index scans that will be performed for caller */ num_sa_scans = 1; foreach(l, indexQuals) @@ -6603,19 +6601,8 @@ genericcostestimate(PlannerInfo *root, */ numIndexTuples = costs->numIndexTuples; if (numIndexTuples <= 0.0) - { numIndexTuples = indexSelectivity * index->rel->tuples; - /* - * The above calculation counts all the tuples visited across all - * scans induced by ScalarArrayOpExpr nodes. We want to consider the - * average per-indexscan number, so adjust. This is a handy place to - * round to integer, too. (If caller supplied tuple estimate, it's - * responsible for handling these considerations.) - */ - numIndexTuples = rint(numIndexTuples / num_sa_scans); - } - /* * We can bound the number of tuples by the index size in any case. Also, * always estimate at least one tuple is touched, even when @@ -6653,27 +6640,31 @@ genericcostestimate(PlannerInfo *root, * * The above calculations are all per-index-scan. 
However, if we are in a * nestloop inner scan, we can expect the scan to be repeated (with - * different search keys) for each row of the outer relation. Likewise, - * ScalarArrayOpExpr quals result in multiple index scans. This creates - * the potential for cache effects to reduce the number of disk page - * fetches needed. We want to estimate the average per-scan I/O cost in - * the presence of caching. + * different search keys) for each row of the outer relation. This + * creates the potential for cache effects to reduce the number of disk + * page fetches needed. We want to estimate the average per-scan I/O cost + * in the presence of caching. * * We use the Mackert-Lohman formula (see costsize.c for details) to * estimate the total number of page fetches that occur. While this * wasn't what it was designed for, it seems a reasonable model anyway. * Note that we are counting pages not tuples anymore, so we take N = T = * index size, as if there were one "tuple" per page. + * + * Note: we assume that there will be no repeat index page fetches across + * ScalarArrayOpExpr primitive scans from the same logical index scan. + * This is guaranteed to be true for btree indexes, but is very optimistic + * with index AMs that cannot natively execute ScalarArrayOpExpr quals. + * However, these same index AMs also accept our default pessimistic + * approach to counting num_sa_scans (btree caller caps this), so we don't + * expect the final indexTotalCost to be wildly over-optimistic. 
*/ - num_outer_scans = loop_count; - num_scans = num_sa_scans * num_outer_scans; - - if (num_scans > 1) + if (loop_count > 1) { double pages_fetched; /* total page fetches ignoring cache effects */ - pages_fetched = numIndexPages * num_scans; + pages_fetched = numIndexPages * loop_count; /* use Mackert and Lohman formula to adjust for cache effects */ pages_fetched = index_pages_fetched(pages_fetched, @@ -6683,11 +6674,9 @@ genericcostestimate(PlannerInfo *root, /* * Now compute the total disk access cost, and then report a pro-rated - * share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr, - * since that's internal to the indexscan.) + * share for each outer scan */ - indexTotalCost = (pages_fetched * spc_random_page_cost) - / num_outer_scans; + indexTotalCost = (pages_fetched * spc_random_page_cost) / loop_count; } else { @@ -6703,10 +6692,8 @@ genericcostestimate(PlannerInfo *root, * evaluated once at the start of the scan to reduce them to runtime keys * to pass to the index AM (see nodeIndexscan.c). We model the per-tuple * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per - * indexqual operator. Because we have numIndexTuples as a per-scan - * number, we have to multiply by num_sa_scans to get the correct result - * for ScalarArrayOpExpr cases. Similarly add in costs for any index - * ORDER BY expressions. + * indexqual operator. Similarly add in costs for any index ORDER BY + * expressions. * * Note: this neglects the possible costs of rechecking lossy operators. * Detecting that that might be needed seems more expensive than it's @@ -6719,7 +6706,7 @@ genericcostestimate(PlannerInfo *root, indexStartupCost = qual_arg_cost; indexTotalCost += qual_arg_cost; - indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost); + indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost); /* * Generic assumption about index correlation: there isn't any. 
@@ -6797,7 +6784,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, bool eqQualHere; bool found_saop; bool found_is_null_op; - double num_sa_scans; ListCell *lc; /* @@ -6812,17 +6798,12 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * * For a RowCompareExpr, we consider only the first column, just as * rowcomparesel() does. - * - * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N - * index scans not one, but the ScalarArrayOpExpr's operator can be - * considered to act the same as it normally does. */ indexBoundQuals = NIL; indexcol = 0; eqQualHere = false; found_saop = false; found_is_null_op = false; - num_sa_scans = 1; foreach(lc, path->indexclauses) { IndexClause *iclause = lfirst_node(IndexClause, lc); @@ -6862,14 +6843,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, else if (IsA(clause, ScalarArrayOpExpr)) { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; - Node *other_operand = (Node *) lsecond(saop->args); - double alength = estimate_array_length(root, other_operand); clause_op = saop->opno; found_saop = true; - /* count number of SA scans induced by indexBoundQuals only */ - if (alength > 1) - num_sa_scans *= alength; } else if (IsA(clause, NullTest)) { @@ -6929,13 +6905,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, JOIN_INNER, NULL); numIndexTuples = btreeSelectivity * index->rel->tuples; - - /* - * As in genericcostestimate(), we have to adjust for any - * ScalarArrayOpExpr quals included in indexBoundQuals, and then round - * to integer. - */ - numIndexTuples = rint(numIndexTuples / num_sa_scans); } /* @@ -6945,6 +6914,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, genericcostestimate(root, path, loop_count, &costs); + /* + * Now compensate for btree's ability to efficiently execute scans with + * SAOP clauses. 
+ * + * btree automatically combines individual ScalarArrayOpExpr primitive + * index scans whenever the tuples covered by the next set of array keys + * are close to tuples covered by the current set. This makes the final + * number of descents particularly difficult to estimate. However, btree + * scans never visit any single leaf page more than once. That puts a + * natural floor under the worst case number of descents. + * + * It's particularly important that we not wildly overestimate the number + * of descents needed for a clause list with several SAOPs -- the costs + * really aren't multiplicative in the way genericcostestimate expects. In + * general, most distinct combinations of SAOP keys will tend to not find + * any matching tuples. Furthermore, btree scans search for the next set + * of array keys using the next tuple in line, and so won't even need a + * direct comparison to eliminate most non-matching sets of array keys. + * + * Clamp the number of descents to the estimated number of leaf page + * visits. This is still fairly pessimistic, but tends to result in more + * accurate costing of scans with several SAOP clauses -- especially when + * each array has more than a few elements. The cost of adding additional + * array constants to a low-order SAOP column should saturate past a + * certain point (except where selectivity estimates continue to shift). + * + * Also clamp the number of descents to 1/3 the number of index pages. + * This avoids implausibly high estimates with low selectivity paths, + * where scans frequently require no more than one or two descents. + * + * XXX Ideally, we'd also account for the fact that non-boundary SAOP + * clause quals (which the B-Tree code uses "non-required" scan keys for) + * won't actually contribute to the total number of descents of the index. + * This would require pushing down more context into genericcostestimate. 
+ */ + if (costs.num_sa_scans > 1) + { + costs.num_sa_scans = Min(costs.num_sa_scans, costs.numIndexPages); + costs.num_sa_scans = Min(costs.num_sa_scans, index->pages / 3); + costs.num_sa_scans = Max(costs.num_sa_scans, 1); + } + /* * Add a CPU-cost component to represent the costs of initial btree * descent. We don't charge any I/O cost for touching upper btree levels, @@ -6952,9 +6963,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * comparisons to descend a btree of N leaf tuples. We charge one * cpu_operator_cost per comparison. * - * If there are ScalarArrayOpExprs, charge this once per SA scan. The - * ones after the first one are not startup cost so far as the overall - * plan is concerned, so add them only to "total" cost. + * If there are ScalarArrayOpExprs, charge this once per estimated + * primitive SA scan. The ones after the first one are not startup cost + * so far as the overall plan goes, so just add them to "total" cost. */ if (index->tuples > 1) /* avoid computing log(0) */ { @@ -6971,7 +6982,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * in cases where only a single leaf page is expected to be visited. This * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page * touched. The number of such pages is btree tree height plus one (ie, - * we charge for the leaf page too). As above, charge once per SA scan. + * we charge for the leaf page too). As above, charge once per estimated + * primitive SA scan. 
*/ descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost; costs.indexStartupCost += descentCost; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6eb162052..779a15df3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -960,7 +960,7 @@ typedef struct BTScanPosData * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. * We can clear the appropriate one of these flags when _bt_checkkeys() - * returns continuescan = false. + * sets BTReadPageState.continuescan = false. */ bool moreLeft; bool moreRight; @@ -1024,7 +1024,6 @@ typedef struct BTArrayKeyInfo { int scan_key; /* index of associated key in arrayKeyData */ int cur_elem; /* index of current element in elem_values */ - int mark_elem; /* index of marked element in elem_values */ int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; @@ -1038,13 +1037,14 @@ typedef struct BTScanOpaqueData /* workspace for SK_SEARCHARRAY support */ ScanKey arrayKeyData; /* modified copy of scan->keyData */ - bool arraysStarted; /* Started array keys, but have yet to "reach - * past the end" of all arrays? */ int numArrayKeys; /* number of equality-type array keys (-1 if * there are any unsatisfiable array keys) */ - int arrayKeyCount; /* count indicating number of array scan keys - * processed */ + ScanDirection advanceDir; /* Scan direction when arrays last advanced */ + bool needPrimScan; /* Need primscan to continue in advanceDir? 
*/ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + FmgrInfo *orderProcs; /* ORDER procs for equality constraint keys */ + int numPrimScans; /* Running tally of # primitive index scans + * (used to coordinate parallel workers) */ MemoryContext arrayContext; /* scan-lifespan context for array data */ /* info about killed items if any (killedItems is NULL if never used) */ @@ -1075,6 +1075,29 @@ typedef struct BTScanOpaqueData typedef BTScanOpaqueData *BTScanOpaque; +/* + * _bt_readpage state used across _bt_checkkeys calls for a page + * + * When _bt_readpage is called during a forward scan that has one or more + * equality-type SK_SEARCHARRAY scan keys, it has an extra responsibility: to + * set up information about the final tuple from the page. This must happen + * before the first call to _bt_checkkeys. _bt_checkkeys uses the final tuple + * to manage advancement of the scan's array keys more efficiently. + */ +typedef struct BTReadPageState +{ + /* Input parameters, set by _bt_readpage */ + ScanDirection dir; /* current scan direction */ + IndexTuple finaltup; /* final tuple (high key for forward scans) */ + + /* Output parameters, set by _bt_checkkeys */ + bool continuescan; /* Terminate ongoing (primitive) index scan? */ + + /* Private _bt_checkkeys-managed state */ + bool finaltupchecked; /* final tuple checked against current + * SK_SEARCHARRAY array keys? */ +} BTReadPageState; + /* * We use some private sk_flags bits in preprocessed scan keys. We're allowed * to use bits 16-31 (see skey.h). 
The uppermost bits are copied from the @@ -1082,6 +1105,7 @@ typedef BTScanOpaqueData *BTScanOpaque; */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_RDDNARRAY 0x00040000 /* redundant in array preprocessing */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1152,7 +1176,7 @@ extern bool btcanreturn(Relation index, int attno); extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno); extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); -extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +extern void _bt_parallel_next_primitive_scan(IndexScanDesc scan); /* * prototypes for functions in nbtdedup.c @@ -1245,13 +1269,12 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir); -extern void _bt_mark_array_keys(IndexScanDesc scan); -extern void _bt_restore_array_keys(IndexScanDesc scan); +extern bool _bt_array_keys_remain(IndexScanDesc scan, ScanDirection dir); +extern void _bt_rewind_array_keys(IndexScanDesc scan); extern void _bt_preprocess_keys(IndexScanDesc scan); -extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, - int tupnatts, ScanDirection dir, bool *continuescan, - bool requiredMatchedByPrecheck, bool haveFirstMatch); +extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool finaltup, int tupnatts, + bool continuescanPrechecked, bool haveFirstMatch); extern void 
_bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 8311a03c3..d159091ab 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -434,3 +434,482 @@ ALTER INDEX btree_part_idx ALTER COLUMN id SET (n_distinct=100); ERROR: ALTER action ALTER COLUMN ... SET cannot be performed on relation "btree_part_idx" DETAIL: This operation is not supported for partitioned indexes. DROP TABLE btree_part; +-- Add tests to give coverage of various subtle issues. +-- +-- XXX This may not be suitable for commit, due to taking up too many cycles. +-- +-- Here we don't remember the scan's array keys before processing a page, only +-- after processing a page (which is implicit, it's just the scan's current +-- keys). So when we move the scan backwards we think that the top-level scan +-- should terminate, when in reality it should jump backwards to the leaf page +-- that we last visited. 
+create temp table backup_wrong_tbl (district int4, warehouse int4, orderid int4, orderline int4); +create index backup_wrong_idx on backup_wrong_tbl (district, warehouse, orderid, orderline); +insert into backup_wrong_tbl +select district, warehouse, orderid, orderline +from + generate_series(1, 3) district, + generate_series(1, 2) warehouse, + generate_series(1, 51) orderid, + generate_series(1, 10) orderline; +begin; +declare back_up_terminate_toplevel_wrong cursor for +select * from backup_wrong_tbl +where district in (1, 3) and warehouse in (1,2) +and orderid in (48, 50) +order by district, warehouse, orderid, orderline; +fetch forward 60 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 1 | 1 | 48 | 1 + 1 | 1 | 48 | 2 + 1 | 1 | 48 | 3 + 1 | 1 | 48 | 4 + 1 | 1 | 48 | 5 + 1 | 1 | 48 | 6 + 1 | 1 | 48 | 7 + 1 | 1 | 48 | 8 + 1 | 1 | 48 | 9 + 1 | 1 | 48 | 10 + 1 | 1 | 50 | 1 + 1 | 1 | 50 | 2 + 1 | 1 | 50 | 3 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 10 + 1 | 2 | 48 | 1 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 10 + 1 | 2 | 50 | 1 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 10 + 3 | 1 | 48 | 1 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 4 + 3 | 1 | 48 | 5 + 3 | 1 | 48 | 6 + 3 | 1 | 48 | 7 + 3 | 1 | 48 | 8 + 3 | 1 | 48 | 9 + 3 | 1 | 48 | 10 + 3 | 1 | 50 | 1 + 3 | 1 | 50 | 2 + 3 | 1 | 50 | 3 + 3 | 1 | 50 | 4 + 3 | 1 | 50 | 5 + 3 | 1 | 50 | 6 + 3 | 1 | 50 | 7 + 3 | 1 | 50 | 8 + 3 | 1 | 50 | 9 + 3 | 1 | 50 | 10 +(60 rows) + +fetch backward 29 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 3 | 1 | 50 | 9 + 3 | 1 | 50 
| 8 + 3 | 1 | 50 | 7 + 3 | 1 | 50 | 6 + 3 | 1 | 50 | 5 + 3 | 1 | 50 | 4 + 3 | 1 | 50 | 3 + 3 | 1 | 50 | 2 + 3 | 1 | 50 | 1 + 3 | 1 | 48 | 10 + 3 | 1 | 48 | 9 + 3 | 1 | 48 | 8 + 3 | 1 | 48 | 7 + 3 | 1 | 48 | 6 + 3 | 1 | 48 | 5 + 3 | 1 | 48 | 4 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 1 + 1 | 2 | 50 | 10 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 1 +(29 rows) + +fetch forward 12 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 10 + 3 | 1 | 48 | 1 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 3 +(12 rows) + +fetch backward 30 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 1 + 1 | 2 | 50 | 10 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 1 + 1 | 2 | 48 | 10 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 1 + 1 | 1 | 50 | 10 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 3 +(30 rows) + +fetch forward 31 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 10 + 1 | 2 | 48 | 1 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 10 + 1 | 2 | 50 | 1 + 1 | 2 | 50 | 2 + 1 | 2 
| 50 | 3 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 10 + 3 | 1 | 48 | 1 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 4 +(31 rows) + +fetch backward 32 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 1 + 1 | 2 | 50 | 10 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 1 + 1 | 2 | 48 | 10 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 1 + 1 | 1 | 50 | 10 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 3 + 1 | 1 | 50 | 2 +(32 rows) + +fetch forward 33 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 1 | 1 | 50 | 3 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 10 + 1 | 2 | 48 | 1 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 10 + 1 | 2 | 50 | 1 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 10 + 3 | 1 | 48 | 1 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 4 + 3 | 1 | 48 | 5 +(33 rows) + +fetch backward 34 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 3 | 1 | 48 | 4 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 1 + 1 | 2 | 50 | 10 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 3 + 
1 | 2 | 50 | 2 + 1 | 2 | 50 | 1 + 1 | 2 | 48 | 10 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 1 + 1 | 1 | 50 | 10 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 3 + 1 | 1 | 50 | 2 + 1 | 1 | 50 | 1 +(34 rows) + +fetch forward 35 from back_up_terminate_toplevel_wrong; + district | warehouse | orderid | orderline +----------+-----------+---------+----------- + 1 | 1 | 50 | 2 + 1 | 1 | 50 | 3 + 1 | 1 | 50 | 4 + 1 | 1 | 50 | 5 + 1 | 1 | 50 | 6 + 1 | 1 | 50 | 7 + 1 | 1 | 50 | 8 + 1 | 1 | 50 | 9 + 1 | 1 | 50 | 10 + 1 | 2 | 48 | 1 + 1 | 2 | 48 | 2 + 1 | 2 | 48 | 3 + 1 | 2 | 48 | 4 + 1 | 2 | 48 | 5 + 1 | 2 | 48 | 6 + 1 | 2 | 48 | 7 + 1 | 2 | 48 | 8 + 1 | 2 | 48 | 9 + 1 | 2 | 48 | 10 + 1 | 2 | 50 | 1 + 1 | 2 | 50 | 2 + 1 | 2 | 50 | 3 + 1 | 2 | 50 | 4 + 1 | 2 | 50 | 5 + 1 | 2 | 50 | 6 + 1 | 2 | 50 | 7 + 1 | 2 | 50 | 8 + 1 | 2 | 50 | 9 + 1 | 2 | 50 | 10 + 3 | 1 | 48 | 1 + 3 | 1 | 48 | 2 + 3 | 1 | 48 | 3 + 3 | 1 | 48 | 4 + 3 | 1 | 48 | 5 + 3 | 1 | 48 | 6 +(35 rows) + +commit; +create temp table outer_table (a int, b int); +create temp table restore_buggy_primscan_table (x int, y int); +create index buggy_idx on restore_buggy_primscan_table (x, y) with (deduplicate_items=off); +insert into outer_table select 1, b_vals from generate_series(1006, 1580) b_vals; +insert into restore_buggy_primscan_table select 1, x_vals from generate_series(1006, 1580) x_vals; +insert into outer_table select 1, 1370 from generate_series(1, 9) j; +insert into restore_buggy_primscan_table select 1, 1371 from generate_series(1, 9) j; +insert into restore_buggy_primscan_table select 1, 1380 from generate_series(1, 9) j; +vacuum analyze outer_table; +vacuum analyze restore_buggy_primscan_table; +select count(*), o.a, o.b + from + outer_table o + inner join + restore_buggy_primscan_table bug + on o.a = bug.x and o.b = bug.y +where 
+ bug.x = 1 and + bug.y = any(array[(select array_agg(i) from generate_series(1370, 1390) i where i % 10 = 0)]) +group by o.a, o.b; + count | a | b +-------+---+------ + 10 | 1 | 1370 + 10 | 1 | 1380 + 1 | 1 | 1390 +(3 rows) + +-- Get test coverage for when so->needPrimScan is set at the point of calling +-- _bt_restore_array_keys(). This is handled like the case where the scan +-- direction changes "within" a page, relying on code from _bt_readnextpage(). +create temp table outer_tab( + a int, + b int +); +create index outer_tab_idx on outer_tab(a, b) with (deduplicate_items = off); +create temp table primscanmarkcov_table( + a int, + b int +); +create index interesting_coverage_idx on primscanmarkcov_table(a, b) with (deduplicate_items = off); +insert into outer_tab select 1, i from generate_series(1530, 1780) i; +insert into primscanmarkcov_table select 1, i from generate_series(1530, 1780) i; +insert into outer_tab select 1, 1550 from generate_series(1, 200) i; +insert into primscanmarkcov_table select 1, 1551 from generate_series(1, 200) i; +vacuum analyze outer_tab; +vacuum analyze primscanmarkcov_table ; +with range_ints as ( select i from generate_series(1530, 1780) i) +select + count(*), buggy.a, buggy.b from +outer_tab o + inner join +primscanmarkcov_table buggy + on o.a = buggy.a and o.b = buggy.b +where + o.a = 1 and o.b = any (array[(select array_agg(i) from range_ints where i % 50 = 0)]) and + buggy.a = 1 and buggy.b = any (array[(select array_agg(i) from range_ints where i % 50 = 0)]) +group by buggy.a, buggy.b +order by buggy.a, buggy.b; + count | a | b +-------+---+------ + 201 | 1 | 1550 + 1 | 1 | 1600 + 1 | 1 | 1650 + 1 | 1 | 1700 + 1 | 1 | 1750 +(5 rows) + +-- Get test coverage for when so->needPrimScan is set at the point of calling +-- _bt_restore_array_keys() for backwards scans. More or less comparable to +-- the last test. 
+create temp table backwards_prim_outer_table (a int, b int); +create temp table backwards_restore_buggy_primscan_table (x int, y int); +create index backward_prim_buggy_idx on backwards_restore_buggy_primscan_table (x, y) with (deduplicate_items=off); +create index backwards_prim_drive_idx on backwards_prim_outer_table (a, b) with (deduplicate_items=off); +insert into backwards_prim_outer_table select 0, 1360; +insert into backwards_prim_outer_table select 1, b_vals from generate_series(1012, 1406) b_vals where b_vals % 10 = 0; +insert into backwards_prim_outer_table select 1, 1370; +vacuum analyze backwards_prim_outer_table; -- Be tidy +-- Fill up "backwards_prim_drive_idx" index with 396 items, just about fitting +-- onto its only page, which is a root leaf page: +insert into backwards_restore_buggy_primscan_table select 0, 1360; +insert into backwards_restore_buggy_primscan_table select 1, x_vals from generate_series(1012, 1406) x_vals; +vacuum analyze backwards_restore_buggy_primscan_table; -- Be tidy +-- Now cause two page splits, leaving 4 leaf pages in total: +insert into backwards_restore_buggy_primscan_table select 1, 1370 from generate_series(1,250) i; +-- Now "buggy" index looks like this: +-- +-- ┌───┬───────┬───────┬────────┬────────┬────────────┬───────┬───────┬───────────────────┬─────────┬───────────┬──────────────────┐ +-- │ i │ blkno │ flags │ nhtids │ nhblks │ ndeadhblks │ nlive │ ndead │ nhtidschecksimple │ avgsize │ freespace │ highkey │ +-- ├───┼───────┼───────┼────────┼────────┼────────────┼───────┼───────┼───────────────────┼─────────┼───────────┼──────────────────┤ +-- │ 1 │ 1 │ 1 │ 203 │ 1 │ 0 │ 204 │ 0 │ 0 │ 16 │ 4,068 │ (x, y)=(1, 1214) │ +-- │ 2 │ 4 │ 1 │ 156 │ 2 │ 0 │ 157 │ 0 │ 0 │ 16 │ 5,008 │ (x, y)=(1, 1370) │ +-- │ 3 │ 5 │ 1 │ 251 │ 2 │ 0 │ 252 │ 0 │ 0 │ 16 │ 3,108 │ (x, y)=(1, 1371) │ +-- │ 4 │ 2 │ 1 │ 36 │ 1 │ 0 │ 36 │ 0 │ 0 │ 16 │ 7,428 │ ∅ │ +-- 
└───┴───────┴───────┴────────┴────────┴────────────┴───────┴───────┴───────────────────┴─────────┴───────────┴──────────────────┘ +select count(*), o.a, o.b + from + backwards_prim_outer_table o + inner join + backwards_restore_buggy_primscan_table bug + on o.a = bug.x and o.b = bug.y +where + bug.x in (0, 1) and + bug.y = any(array[(select array_agg(i) from generate_series(1360, 1370) i where i % 10 = 0)]) +group by o.a, o.b +order by o.a desc, o.b desc; + count | a | b +-------+---+------ + 502 | 1 | 1370 + 1 | 1 | 1360 + 1 | 0 | 1360 +(3 rows) + diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 79fa117cb..f5865494c 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1910,7 +1910,7 @@ SELECT count(*) FROM dupindexcols (1 row) -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) SELECT unique1 FROM tenk1 @@ -1936,12 +1936,11 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) -(3 rows) + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -1952,29 +1951,25 @@ ORDER BY thousand; 1 | 1001 (2 rows) -SET enable_indexonlyscan = OFF; explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - QUERY PLAN --------------------------------------------------------------------------------------- - Sort - 
Sort Key: thousand - -> Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) +ORDER BY thousand DESC, tenthous DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; thousand | tenthous ----------+---------- - 0 | 3000 1 | 1001 + 0 | 3000 (2 rows) -RESET enable_indexonlyscan; -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index a2fad81d7..3a0456746 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -8837,10 +8837,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]); Merge Cond: (j1.id1 = j2.id1) Join Filter: (j2.id2 = j1.id2) -> Index Scan using j1_id1_idx on j1 - -> Index Only Scan using j2_pkey on j2 + -> Index Scan using j2_id1_idx on j2 Index Cond: (id1 >= ANY ('{1,5}'::integer[])) - Filter: ((id1 % 1000) = 1) -(7 rows) +(6 rows) select * from j1 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index ef8435423..330edbb1d 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -267,3 +267,150 @@ CREATE TABLE btree_part (id int4) PARTITION BY RANGE (id); CREATE INDEX btree_part_idx ON btree_part(id); ALTER INDEX btree_part_idx ALTER COLUMN id SET (n_distinct=100); DROP TABLE btree_part; + +-- Add tests to give coverage of various subtle issues. +-- +-- XXX This may not be suitable for commit, due to taking up too many cycles. 
+-- +-- Here we don't remember the scan's array keys before processing a page, only +-- after processing a page (which is implicit, it's just the scan's current +-- keys). So when we move the scan backwards we think that the top-level scan +-- should terminate, when in reality it should jump backwards to the leaf page +-- that we last visited. +create temp table backup_wrong_tbl (district int4, warehouse int4, orderid int4, orderline int4); +create index backup_wrong_idx on backup_wrong_tbl (district, warehouse, orderid, orderline); +insert into backup_wrong_tbl +select district, warehouse, orderid, orderline +from + generate_series(1, 3) district, + generate_series(1, 2) warehouse, + generate_series(1, 51) orderid, + generate_series(1, 10) orderline; + +begin; +declare back_up_terminate_toplevel_wrong cursor for +select * from backup_wrong_tbl +where district in (1, 3) and warehouse in (1,2) +and orderid in (48, 50) +order by district, warehouse, orderid, orderline; + +fetch forward 60 from back_up_terminate_toplevel_wrong; +fetch backward 29 from back_up_terminate_toplevel_wrong; +fetch forward 12 from back_up_terminate_toplevel_wrong; +fetch backward 30 from back_up_terminate_toplevel_wrong; +fetch forward 31 from back_up_terminate_toplevel_wrong; +fetch backward 32 from back_up_terminate_toplevel_wrong; +fetch forward 33 from back_up_terminate_toplevel_wrong; +fetch backward 34 from back_up_terminate_toplevel_wrong; +fetch forward 35 from back_up_terminate_toplevel_wrong; +commit; + +create temp table outer_table (a int, b int); +create temp table restore_buggy_primscan_table (x int, y int); + +create index buggy_idx on restore_buggy_primscan_table (x, y) with (deduplicate_items=off); + +insert into outer_table select 1, b_vals from generate_series(1006, 1580) b_vals; +insert into restore_buggy_primscan_table select 1, x_vals from generate_series(1006, 1580) x_vals; + +insert into outer_table select 1, 1370 from generate_series(1, 9) j; +insert into 
restore_buggy_primscan_table select 1, 1371 from generate_series(1, 9) j; +insert into restore_buggy_primscan_table select 1, 1380 from generate_series(1, 9) j; + +vacuum analyze outer_table; +vacuum analyze restore_buggy_primscan_table; + +select count(*), o.a, o.b + from + outer_table o + inner join + restore_buggy_primscan_table bug + on o.a = bug.x and o.b = bug.y +where + bug.x = 1 and + bug.y = any(array[(select array_agg(i) from generate_series(1370, 1390) i where i % 10 = 0)]) +group by o.a, o.b; + +-- Get test coverage for when so->needPrimScan is set at the point of calling +-- _bt_restore_array_keys(). This is handled like the case where the scan +-- direction changes "within" a page, relying on code from _bt_readnextpage(). +create temp table outer_tab( + a int, + b int +); +create index outer_tab_idx on outer_tab(a, b) with (deduplicate_items = off); + +create temp table primscanmarkcov_table( + a int, + b int +); +create index interesting_coverage_idx on primscanmarkcov_table(a, b) with (deduplicate_items = off); + +insert into outer_tab select 1, i from generate_series(1530, 1780) i; +insert into primscanmarkcov_table select 1, i from generate_series(1530, 1780) i; + +insert into outer_tab select 1, 1550 from generate_series(1, 200) i; +insert into primscanmarkcov_table select 1, 1551 from generate_series(1, 200) i; + +vacuum analyze outer_tab; +vacuum analyze primscanmarkcov_table ; + +with range_ints as ( select i from generate_series(1530, 1780) i) + +select + count(*), buggy.a, buggy.b from +outer_tab o + inner join +primscanmarkcov_table buggy + on o.a = buggy.a and o.b = buggy.b +where + o.a = 1 and o.b = any (array[(select array_agg(i) from range_ints where i % 50 = 0)]) and + buggy.a = 1 and buggy.b = any (array[(select array_agg(i) from range_ints where i % 50 = 0)]) +group by buggy.a, buggy.b +order by buggy.a, buggy.b; + +-- Get test coverage for when so->needPrimScan is set at the point of calling +-- _bt_restore_array_keys() for 
backwards scans. More or less comparable to +-- the last test. +create temp table backwards_prim_outer_table (a int, b int); +create temp table backwards_restore_buggy_primscan_table (x int, y int); + +create index backward_prim_buggy_idx on backwards_restore_buggy_primscan_table (x, y) with (deduplicate_items=off); +create index backwards_prim_drive_idx on backwards_prim_outer_table (a, b) with (deduplicate_items=off); + +insert into backwards_prim_outer_table select 0, 1360; +insert into backwards_prim_outer_table select 1, b_vals from generate_series(1012, 1406) b_vals where b_vals % 10 = 0; +insert into backwards_prim_outer_table select 1, 1370; +vacuum analyze backwards_prim_outer_table; -- Be tidy + +-- Fill up "backwards_prim_drive_idx" index with 396 items, just about fitting +-- onto its only page, which is a root leaf page: +insert into backwards_restore_buggy_primscan_table select 0, 1360; +insert into backwards_restore_buggy_primscan_table select 1, x_vals from generate_series(1012, 1406) x_vals; +vacuum analyze backwards_restore_buggy_primscan_table; -- Be tidy + +-- Now cause two page splits, leaving 4 leaf pages in total: +insert into backwards_restore_buggy_primscan_table select 1, 1370 from generate_series(1,250) i; + +-- Now "buggy" index looks like this: +-- +-- ┌───┬───────┬───────┬────────┬────────┬────────────┬───────┬───────┬───────────────────┬─────────┬───────────┬──────────────────┐ +-- │ i │ blkno │ flags │ nhtids │ nhblks │ ndeadhblks │ nlive │ ndead │ nhtidschecksimple │ avgsize │ freespace │ highkey │ +-- ├───┼───────┼───────┼────────┼────────┼────────────┼───────┼───────┼───────────────────┼─────────┼───────────┼──────────────────┤ +-- │ 1 │ 1 │ 1 │ 203 │ 1 │ 0 │ 204 │ 0 │ 0 │ 16 │ 4,068 │ (x, y)=(1, 1214) │ +-- │ 2 │ 4 │ 1 │ 156 │ 2 │ 0 │ 157 │ 0 │ 0 │ 16 │ 5,008 │ (x, y)=(1, 1370) │ +-- │ 3 │ 5 │ 1 │ 251 │ 2 │ 0 │ 252 │ 0 │ 0 │ 16 │ 3,108 │ (x, y)=(1, 1371) │ +-- │ 4 │ 2 │ 1 │ 36 │ 1 │ 0 │ 36 │ 0 │ 0 │ 16 │ 7,428 │ ∅ │ +-- 
└───┴───────┴───────┴────────┴────────┴────────────┴───────┴───────┴───────────────────┴─────────┴───────────┴──────────────────┘ + +select count(*), o.a, o.b + from + backwards_prim_outer_table o + inner join + backwards_restore_buggy_primscan_table bug + on o.a = bug.x and o.b = bug.y +where + bug.x in (0, 1) and + bug.y = any(array[(select array_agg(i) from generate_series(1360, 1370) i where i % 10 = 0)]) +group by o.a, o.b +order by o.a desc, o.b desc; diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index d49ce9f30..9d68ef624 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -753,7 +753,7 @@ SELECT count(*) FROM dupindexcols WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX'; -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) @@ -774,18 +774,14 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; -SET enable_indexonlyscan = OFF; - explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - -RESET enable_indexonlyscan; +ORDER BY thousand DESC, tenthous DESC; -- -- Check elimination of constant-NULL subexpressions -- 2.43.0