diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 1c7466c8158..9ec023fae38 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -126,9 +126,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); -static ScanKey bt_right_page_check_scankey(BtreeCheckState *state); -static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey); +static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); +static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, + BlockNumber childblock); static void bt_downlink_missing_check(BtreeCheckState *state); static void bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, bool *isnull, @@ -136,14 +136,14 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); static inline bool invariant_leq_offset(BtreeCheckState *state, - ScanKey key, + BTScanInsert key, OffsetNumber upperbound); static inline bool invariant_geq_offset(BtreeCheckState *state, - ScanKey key, + BTScanInsert key, OffsetNumber lowerbound); static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state, - Page other, - ScanKey key, + BTScanInsert key, + Page nontarget, OffsetNumber upperbound); static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); @@ -835,8 +835,8 @@ bt_target_page_check(BtreeCheckState *state) { ItemId itemid; IndexTuple itup; - ScanKey skey; size_t tupsize; + BTScanInsert skey; CHECK_FOR_INTERRUPTS(); @@ -1018,7 +1018,7 @@ bt_target_page_check(BtreeCheckState *state) */ else if (offset == max) { - ScanKey rightkey; + BTScanInsert rightkey; /* Get item in next/right page */ rightkey = bt_right_page_check_scankey(state); @@ -1070,7 +1070,7 @@ bt_target_page_check(BtreeCheckState *state) { BlockNumber childblock = BTreeInnerTupleGetDownLink(itup); - bt_downlink_check(state, childblock, skey); + bt_downlink_check(state, skey, childblock); } } @@ -1099,11 +1099,12 @@ bt_target_page_check(BtreeCheckState *state) * Note that !readonly callers must reverify that target page has not * been concurrently deleted. */ -static ScanKey +static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state) { BTPageOpaque opaque; ItemId rightitem; + IndexTuple firstitup; BlockNumber targetnext; Page rightpage; OffsetNumber nline; @@ -1291,8 +1292,8 @@ bt_right_page_check_scankey(BtreeCheckState *state) * Return first real item scankey. Note that this relies on right page * memory remaining allocated. */ - return _bt_mkscankey(state->rel, - (IndexTuple) PageGetItem(rightpage, rightitem)); + firstitup = (IndexTuple) PageGetItem(rightpage, rightitem); + return _bt_mkscankey(state->rel, firstitup); } /* @@ -1305,8 +1306,8 @@ bt_right_page_check_scankey(BtreeCheckState *state) * verification this way around is much more practical. */ static void -bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey) +bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, + BlockNumber childblock) { OffsetNumber offset; OffsetNumber maxoffset; @@ -1411,8 +1412,7 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, if (offset_is_negative_infinity(copaque, offset)) continue; - if (!invariant_leq_nontarget_offset(state, child, - targetkey, offset)) + if (!invariant_leq_nontarget_offset(state, targetkey, child, offset)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("down-link lower bound invariant violated for index \"%s\"", @@ -1760,13 +1760,12 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) * to corruption. */ static inline bool -invariant_leq_offset(BtreeCheckState *state, ScanKey key, +invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound); return cmp <= 0; } @@ -1779,13 +1778,12 @@ invariant_leq_offset(BtreeCheckState *state, ScanKey key, * to corruption. */ static inline bool -invariant_geq_offset(BtreeCheckState *state, ScanKey key, +invariant_geq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound); + cmp = _bt_compare(state->rel, key, state->target, lowerbound); return cmp >= 0; } @@ -1801,14 +1799,12 @@ invariant_geq_offset(BtreeCheckState *state, ScanKey key, * to corruption. */ static inline bool -invariant_leq_nontarget_offset(BtreeCheckState *state, - Page nontarget, ScanKey key, - OffsetNumber upperbound) +invariant_leq_nontarget_offset(BtreeCheckState *state, BTScanInsert key, + Page nontarget, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound); return cmp <= 0; } diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 3680e69b89a..eb4df2ebbe6 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -609,6 +609,9 @@ original search scankey is consulted as each index entry is sequentially scanned to decide whether to return the entry and whether the scan can stop (see _bt_checkkeys()). +HEIKKI: The above probably needs some updating, now that we have a +separate BTScanInsert struct to represent an insertion scan key. + We use term "pivot" index tuples to distinguish tuples which don't point to heap tuples, but rather used for tree navigation. Pivot tuples includes all tuples on non-leaf pages and high keys on leaf pages. Note that pivot diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 5c2b8034f5e..c3a6c883449 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -51,19 +51,15 @@ typedef struct static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); -static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, - Relation heapRel, Buffer buf, OffsetNumber offset, - ScanKey itup_scankey, +static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, + Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken); -static void _bt_findinsertloc(Relation rel, - Buffer *bufptr, - OffsetNumber *offsetptr, - int keysz, - ScanKey scankey, - IndexTuple newtup, +static OffsetNumber _bt_findinsertloc(Relation rel, + BTInsertState insertstate, BTStack stack, Relation heapRel); +static bool _bt_useduplicatepage(Relation rel, Relation heapRel, BTInsertState insertstate); static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, @@ -83,8 +79,8 @@ static void _bt_checksplitloc(FindSplitData *state, int dataitemstoleft, Size firstoldonrightsz); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); -static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, - int keysz, ScanKey scankey); +static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, + Page page, OffsetNumber offnum); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); /* @@ -110,18 +106,28 @@ _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel) { bool is_unique = false; - int indnkeyatts; - ScanKey itup_scankey; + BTInsertStateData insertstate; + BTScanInsert itup_key; BTStack stack = NULL; Buffer buf; - OffsetNumber offset; bool fastpath; + bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); - indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - Assert(indnkeyatts != 0); + /* we need a search key to do our search, so build one */ + itup_key = _bt_mkscankey(rel, itup); - /* we need an insertion scan key to do our search, so build one */ - itup_scankey = _bt_mkscankey(rel, itup); + /* + * Fill in the BTInsertState working area, to track the current page + * and position within the page, to insert to. + */ + insertstate.itup = itup; + insertstate.itemsz = IndexTupleSize(itup); + insertstate.itemsz = MAXALIGN(insertstate.itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + insertstate.itup_key = itup_key; + + insertstate.bounds_valid = false; + insertstate.buf = InvalidBuffer; /* * It's very common to have an index on an auto-incremented or @@ -144,10 +150,8 @@ _bt_doinsert(Relation rel, IndexTuple itup, */ top: fastpath = false; - offset = InvalidOffsetNumber; if (RelationGetTargetBlock(rel) != InvalidBlockNumber) { - Size itemsz; Page page; BTPageOpaque lpageop; @@ -166,9 +170,6 @@ top: page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); - itemsz = IndexTupleSize(itup); - itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this - * but we need to be consistent */ /* * Check if the page is still the rightmost leaf page, has enough @@ -177,10 +178,9 @@ top: */ if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && !P_IGNORE(lpageop) && - (PageGetFreeSpace(page) > itemsz) && + (PageGetFreeSpace(page) > insertstate.itemsz) && PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && - _bt_compare(rel, indnkeyatts, itup_scankey, page, - P_FIRSTDATAKEY(lpageop)) > 0) + _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0) { /* * The right-most block should never have an incomplete split. @@ -219,10 +219,12 @@ top: * Find the first page containing this key. Buffer returned by * _bt_search() is locked in exclusive mode. */ - stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, - NULL); + stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL); } + insertstate.buf = buf; + buf = InvalidBuffer; /* insertstate.buf now owns the buffer */ + /* * If we're not allowing duplicates, make sure the key isn't already in * the index. @@ -244,19 +246,18 @@ top: * let the tuple in and return false for possibly non-unique, or true for * definitely unique. */ - if (checkUnique != UNIQUE_CHECK_NO) + if (checkingunique) { TransactionId xwait; uint32 speculativeToken; - offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false); - xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, + xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, &is_unique, &speculativeToken); if (TransactionIdIsValid(xwait)) { /* Have to wait for the other guy ... */ - _bt_relbuf(rel, buf); + _bt_relbuf(rel, insertstate.buf); /* * If it's a speculative insertion, wait for it to finish (ie. to @@ -277,6 +278,8 @@ top: if (checkUnique != UNIQUE_CHECK_EXISTING) { + OffsetNumber newitemoff; + /* * The only conflict predicate locking cares about for indexes is when * an index tuple insert conflicts with an existing lock. Since the @@ -286,22 +289,22 @@ top: * This reasoning also applies to INCLUDE indexes, whose extra * attributes are not considered part of the key space. */ - CheckForSerializableConflictIn(rel, NULL, buf); + CheckForSerializableConflictIn(rel, NULL, insertstate.buf); /* do the insertion */ - _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup, - stack, heapRel); - _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); + newitemoff = _bt_findinsertloc(rel, &insertstate, stack, heapRel); + _bt_insertonpg(rel, insertstate.buf, InvalidBuffer, stack, itup, newitemoff, + false); } else { /* just release the buffer */ - _bt_relbuf(rel, buf); + _bt_relbuf(rel, insertstate.buf); } /* be tidy */ if (stack) _bt_freestack(stack); - _bt_freeskey(itup_scankey); + pfree(itup_key); return is_unique; } @@ -309,10 +312,6 @@ top: /* * _bt_check_unique() -- Check for violation of unique index constraint * - * offset points to the first possible item that could conflict. It can - * also point to end-of-page, which means that the first tuple to check - * is the first tuple on the next page. - * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). If an xact ID is @@ -324,16 +323,22 @@ top: * InvalidTransactionId because we don't want to wait. In this case we * set *is_unique to false if there is a potential conflict, and the * core code must redo the uniqueness check later. + * + * As a side-effect, sets state in insertstate that can later be used by + * _bt_findinsertloc() to reuse most of the binary search work we do + * here. */ static TransactionId -_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, - Buffer buf, OffsetNumber offset, ScanKey itup_scankey, +_bt_check_unique(Relation rel, BTInsertState insertstate, + Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken) { TupleDesc itupdesc = RelationGetDescr(rel); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + IndexTuple itup = insertstate->itup; + BTScanInsert itup_key = insertstate->itup_key; SnapshotData SnapshotDirty; + OffsetNumber offset; OffsetNumber maxoff; Page page; BTPageOpaque opaque; @@ -345,10 +350,18 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, InitDirtySnapshot(SnapshotDirty); - page = BufferGetPage(buf); + page = BufferGetPage(insertstate->buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); + /* + * Find the first tuple with the same key. + * + * This also saves the binary search bounds in insertstate. We use them + * in the fastpath below, but also in the _bt_findinsertloc() call later. + */ + offset = _bt_binsrch_insert(rel, insertstate); + /* * Scan over all equal tuples, looking for live conflicts. */ @@ -364,6 +377,26 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ if (offset <= maxoff) { + /* + * Fastpath: In most cases, we can use _bt_binsrch search bounds + * to limit our consideration to items that are definitely + * duplicates. This fastpath doesn't apply, when the original + * page is empty, or when initial offset is past the end of the + * original page, which may indicate that we need to examine a + * second or subsequent page. + * + * Note that this optimization avoids calling _bt_isequal() + * entirely when there are no duplicates, as long as the location + * where the key would belong to is not at the end of the page. + */ + if (nbuf == InvalidBuffer && offset == insertstate->stricthigh) + { + Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); + Assert(insertstate->low <= insertstate->stricthigh); + Assert(!_bt_isequal(itupdesc, itup_key, page, offset)); + break; + } + curitemid = PageGetItemId(page, offset); /* @@ -378,7 +411,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * first, so that we didn't actually get to exit any sooner * anyway. So now we just advance over killed items as quickly as * we can. We only apply _bt_isequal() when we get to a non-killed - * item or the end of the page. + * item. */ if (!ItemIdIsDead(curitemid)) { @@ -391,7 +424,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * in real comparison, but only for ordering/finding items on * pages. - vadim 03/24/97 */ - if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey)) + if (!_bt_isequal(itupdesc, itup_key, page, offset)) break; /* we're past all the equal tuples */ /* okay, we gotta fetch the heap tuple ... */ @@ -488,7 +521,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * otherwise be masked by this unique constraint * violation. */ - CheckForSerializableConflictIn(rel, NULL, buf); + CheckForSerializableConflictIn(rel, NULL, insertstate->buf); /* * This is a definite conflict. Break the tuple down into @@ -500,7 +533,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); - _bt_relbuf(rel, buf); + _bt_relbuf(rel, insertstate->buf); + insertstate->buf = InvalidBuffer; { Datum values[INDEX_MAX_KEYS]; @@ -540,7 +574,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, if (nbuf != InvalidBuffer) MarkBufferDirtyHint(nbuf, true); else - MarkBufferDirtyHint(buf, true); + MarkBufferDirtyHint(insertstate->buf, true); } } } @@ -552,11 +586,25 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, offset = OffsetNumberNext(offset); else { + int highkeycmp; + /* If scankey == hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; - if (!_bt_isequal(itupdesc, page, P_HIKEY, - indnkeyatts, itup_scankey)) + highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); + + /* + * HEIKKI: This assertion might fire if the user-defined opclass + * is broken. It's just an assertion, so maybe that's ok. With a + * broken opclass, it's obviously "garbage in, garbage out", but + * we should try to behave sanely anyway. I don't remember what + * our general policy on that is; should we assert, elog(ERROR), + * or continue silently in that case? An elog(ERROR) or + * elog(WARNING) would feel best to me, but I don't remember what + * we usually do. + */ + Assert(highkeycmp <= 0); + if (highkeycmp != 0) break; /* Advance to next non-dead page --- there must be one */ for (;;) @@ -611,46 +659,37 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * Once we have chosen the page to put the key on, we'll insert it before * any existing equal keys because of the way _bt_binsrch() works. * - * If there's not enough room in the space, we try to make room by - * removing any LP_DEAD tuples. + * _bt_check_unique() saves the progress of the binary search it + * performs, in 'insertstate'. In the common case that there were no + * duplicates, we don't need to do any additional binary search + * comparisons here. Though occasionally, we may still not be able to + * reuse the saved state for our own reasons. * - * On entry, *bufptr and *offsetptr point to the first legal position - * where the new tuple could be inserted. The caller should hold an - * exclusive lock on *bufptr. *offsetptr can also be set to - * InvalidOffsetNumber, in which case the function will search for the - * right location within the page if needed. On exit, they point to the - * chosen insert location. If _bt_findinsertloc decides to move right, - * the lock and pin on the original page will be released and the new - * page returned to the caller is exclusively locked instead. + * On entry, insertstate->buf points to the first legal page where the new + * tuple could be inserted. It must be exclusively-locked. * - * newtup is the new tuple we're inserting, and scankey is an insertion - * type scan key for it. + * On exit, insertstate->buf points to the chosen insertion page, and the + * offset within that page is returned. If _bt_findinsertloc decides to move + * right, the lock and pin on the original page is released, and the new + * page is exclusively locked instead. + * + * This is also where opportunistic microvacuuming of LP_DEAD tuples + * occurs. */ -static void +static OffsetNumber _bt_findinsertloc(Relation rel, - Buffer *bufptr, - OffsetNumber *offsetptr, - int keysz, - ScanKey scankey, - IndexTuple newtup, + BTInsertState insertstate, BTStack stack, Relation heapRel) { - Buffer buf = *bufptr; + Buffer buf = insertstate->buf; + BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(buf); - Size itemsz; BTPageOpaque lpageop; - bool movedright, - vacuumed; OffsetNumber newitemoff; - OffsetNumber firstlegaloff = *offsetptr; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); - itemsz = IndexTupleSize(newtup); - itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we - * need to be consistent */ - /* * Check whether the item can fit on a btree page at all. (Eventually, we * ought to try to apply TOAST methods if not.) We actually need to be @@ -659,12 +698,14 @@ _bt_findinsertloc(Relation rel, * include the ItemId. * * NOTE: if you change this, see also the similar code in _bt_buildadd(). + * + * HEIKKI: perhaps this should be moved to beginning of _bt_doinsert now... */ - if (itemsz > BTMaxItemSize(page)) + if (insertstate->itemsz > BTMaxItemSize(page)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", - itemsz, BTMaxItemSize(page), + insertstate->itemsz, BTMaxItemSize(page), RelationGetRelationName(rel)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " @@ -672,56 +713,43 @@ _bt_findinsertloc(Relation rel, errtableconstraint(heapRel, RelationGetRelationName(rel)))); - /*---------- - * If we will need to split the page to put the item on this page, - * check whether we can put the tuple somewhere to the right, - * instead. Keep scanning right until we - * (a) find a page with enough free space, - * (b) reach the last page where the tuple can legally go, or - * (c) get tired of searching. - * (c) is not flippant; it is important because if there are many - * pages' worth of equal keys, it's better to split one of the early - * pages than to scan all the way to the end of the run of equal keys - * on every insert. We implement "get tired" as a random choice, - * since stopping after scanning a fixed number of pages wouldn't work - * well (we'd never reach the right-hand side of previously split - * pages). Currently the probability of moving right is set at 0.99, - * which may seem too high to change the behavior much, but it does an - * excellent job of preventing O(N^2) behavior with many equal keys. - *---------- - */ - movedright = false; - vacuumed = false; - while (PageGetFreeSpace(page) < itemsz) + Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop)); + for (;;) { + int cmpval; Buffer rbuf; BlockNumber rblkno; /* - * before considering moving right, see if we can obtain enough space - * by erasing LP_DEAD items + * An earlier _bt_check_unique() call may well have saved bounds that + * we can use to skip the high key check. This fastpath cannot be + * used when there are no items on the existing page (other than the + * high key), or when it looks like the new item belongs on the page + * but it might go on a later page instead. */ - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) - { - _bt_vacuum_one_page(rel, buf, heapRel); - - /* - * remember that we vacuumed this page, because that makes the - * hint supplied by the caller invalid - */ - vacuumed = true; + if (insertstate->bounds_valid && insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; - if (PageGetFreeSpace(page) >= itemsz) - break; /* OK, now we have enough space */ - } + /* + * If this is the last page that the tuple can legally go to, stop + * here. + */ + if (P_RIGHTMOST(lpageop)) + break; + cmpval = _bt_compare(rel, itup_key, page, P_HIKEY); + if (cmpval != 0) + break; /* - * nope, so check conditions (b) and (c) enumerated above + * Otherwise, we have a choice to insert here, or move right to a + * later page. Try to balance space utilization the best we can. */ - if (P_RIGHTMOST(lpageop) || - _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || - random() <= (MAX_RANDOM_VALUE / 100)) + if (_bt_useduplicatepage(rel, heapRel, insertstate)) + { + /* decided to insert here */ break; + } /* * step right to next non-dead page @@ -763,27 +791,100 @@ _bt_findinsertloc(Relation rel, } _bt_relbuf(rel, buf); buf = rbuf; - movedright = true; - vacuumed = false; + insertstate->buf = buf; + insertstate->bounds_valid = false; } + /* Loop should not break until correct page located */ + Assert(P_RIGHTMOST(lpageop) || + _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); + /* - * Now we are on the right page, so find the insert position. If we moved - * right at all, we know we should insert at the start of the page. If we - * didn't move right, we can use the firstlegaloff hint if the caller - * supplied one, unless we vacuumed the page which might have moved tuples - * around making the hint invalid. If we didn't move right or can't use - * the hint, find the position by searching. + * If the page we're about to insert to doesn't have enough room for the + * new tuple, we will have to split it. If it looks like the page has + * LP_DEAD items, try to remove them, in hope of making room for the new + * item and avoiding the split. */ - if (movedright) - newitemoff = P_FIRSTDATAKEY(lpageop); - else if (firstlegaloff != InvalidOffsetNumber && !vacuumed) - newitemoff = firstlegaloff; - else - newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + if (P_HAS_GARBAGE(lpageop) && PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_vacuum_one_page(rel, buf, heapRel); + insertstate->bounds_valid = false; + } + + /* + * Find the position within the page to insert to. (This will reuse the + * binary search bounds in 'insertstate', if _bt_check_unique was called + * earlier and we're inserting to the first legal page.) + */ + newitemoff = _bt_binsrch_insert(rel, insertstate); - *bufptr = buf; - *offsetptr = newitemoff; + return newitemoff; +} + +/* + * _bt_useduplicatepage() -- Settle for this page of duplicates? + * + * If we have the choice to insert to current page, or to some later + * later page to the right, this function decides what to do. + * + * If the current page doesn't have enough free space for the new + * tuple, we "microvacuum" the page, removing LP_DEAD items, in + * hope that it will make enough room. + * + * Returns true if caller should proceed with insert on the current + * page. Otherwise, caller should move on to the page to the right. + */ +static bool +_bt_useduplicatepage(Relation rel, Relation heapRel, + BTInsertState insertstate) +{ + Buffer buf = insertstate->buf; + Page page = BufferGetPage(buf); + BTPageOpaque lpageop; + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(lpageop) && !P_RIGHTMOST(lpageop)); + + /* Easy case -- there is space free on this page already */ + if (PageGetFreeSpace(page) >= insertstate->itemsz) + return true; + + /* + * Before considering moving right, see if we can obtain enough space by + * erasing LP_DEAD items. + */ + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, buf, heapRel); + insertstate->bounds_valid = false; + + if (PageGetFreeSpace(page) >= insertstate->itemsz) + return true; /* OK, now we have enough space */ + } + + /*---------- + * It's now clear that _bt_findinsertloc() caller will need to split + * the page if it is to insert new item on to it. The choice to move + * right to the next page remains open to it, but we should not search + * for free space exhaustively when there are many pages to look through. + * + * _bt_findinsertloc() keeps scanning right until it: + * (a) reaches the last page where the tuple can legally go + * Or until we: + * (b) find a page with enough free space, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). The probability of moving right is set at 0.99, which may + * seem too high to change the behavior much, but it does an excellent + * job of preventing O(N^2) behavior with many equal keys. + *---------- + */ + return random() <= (MAX_RANDOM_VALUE / 100); } /*---------- @@ -2311,24 +2412,21 @@ _bt_pgaddtup(Page page, * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. */ static bool -_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, - int keysz, ScanKey scankey) +_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page, + OffsetNumber offnum) { IndexTuple itup; + ScanKey scankey; int i; - /* Better be comparing to a leaf item */ + /* Better be comparing to a non-pivot item */ Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page))); + scankey = itup_key->scankeys; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - /* - * It's okay that we might perform a comparison against a truncated page - * high key when caller needs to determine if _bt_check_unique scan must - * continue on to the next page. Caller never asks us to compare non-key - * attributes within an INCLUDE index. - */ - for (i = 1; i <= keysz; i++) + for (i = 1; i <= itup_key->keysz; i++) { AttrNumber attno; Datum datum; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 1d72fe54081..2694811fd66 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1370,7 +1370,7 @@ _bt_pagedel(Relation rel, Buffer buf) */ if (!stack) { - ScanKey itup_scankey; + BTScanInsert itup_key; ItemId itemid; IndexTuple targetkey; Buffer lbuf; @@ -1420,12 +1420,10 @@ _bt_pagedel(Relation rel, Buffer buf) } /* we need an insertion scan key for the search, so build one */ - itup_scankey = _bt_mkscankey(rel, targetkey); - /* find the leftmost leaf page containing this key */ - stack = _bt_search(rel, - IndexRelationGetNumberOfKeyAttributes(rel), - itup_scankey, false, &lbuf, BT_READ, NULL); - /* don't need a pin on the page */ + itup_key = _bt_mkscankey(rel, targetkey); + /* get stack to leaf page by searching index */ + stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL); + /* don't need a lock or second pin on the page */ _bt_relbuf(rel, lbuf); /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 92832237a8b..88b030ee778 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -24,6 +24,7 @@ #include "utils/rel.h" +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, @@ -71,13 +72,9 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), + * The passed scankey is an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. - * * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! @@ -93,8 +90,8 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * during the search will be finished. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access, Snapshot snapshot) +_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, + Snapshot snapshot) { BTStack stack_in = NULL; int page_access = BT_READ; @@ -130,8 +127,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * if the leaf page is split and we insert to the parent page). But * this is a good opportunity to finish splits of internal pages too. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, - (access == BT_WRITE), stack_in, + *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, page_access, snapshot); /* if this is a leaf page, we're done */ @@ -144,7 +140,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * Find the appropriate item on the internal page, and get the child * page that it points to. */ - offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); + offnum = _bt_binsrch(rel, key, *bufP); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); blkno = BTreeInnerTupleGetDownLink(itup); @@ -167,8 +163,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, new_stack->bts_parent = stack_in; /* - * Page level 1 is lowest non-leaf page level prior to leaves. So, - * if we're on the level 1 and asked to lock leaf page in write mode, + * Page level 1 is lowest non-leaf page level prior to leaves. So, if + * we're on the level 1 and asked to lock leaf page in write mode, * then lock next page in write mode, because it must be a leaf. */ if (opaque->btpo.level == 1 && access == BT_WRITE) @@ -198,8 +194,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * need to move right in the tree. See Lehman and Yao for an * excruciatingly precise description. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, - true, stack_in, BT_WRITE, snapshot); + *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, + snapshot); } return stack_in; @@ -215,16 +211,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * or strictly to the right of it. * * This routine decides whether or not we need to move right in the - * tree by examining the high key entry on the page. If that entry - * is strictly less than the scankey, or <= the scankey in the nextkey=true - * case, then we followed the wrong link and we need to move right. + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the + * key.nextkey=true case, then we followed the wrong link and we need + * to move right. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. * * If forupdate is true, we will attempt to finish any incomplete splits * that we encounter. This is required when locking a target page for an @@ -241,10 +238,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, */ Buffer _bt_moveright(Relation rel, + BTScanInsert key, Buffer buf, - int keysz, - ScanKey scankey, - bool nextkey, bool forupdate, BTStack stack, int access, @@ -269,7 +264,7 @@ _bt_moveright(Relation rel, * We also have to move right if we followed a link that brought us to a * dead page. */ - cmpval = nextkey ? 0 : 1; + cmpval = key->nextkey ? 0 : 1; for (;;) { @@ -304,7 +299,7 @@ _bt_moveright(Relation rel, continue; } - if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) + if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) { /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); @@ -324,13 +319,6 @@ _bt_moveright(Relation rel, /* * _bt_binsrch() -- Do a binary search for a key on a particular page. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. - * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. - * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the @@ -348,12 +336,10 @@ _bt_moveright(Relation rel, * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ -OffsetNumber +static OffsetNumber _bt_binsrch(Relation rel, - Buffer buf, - int keysz, - ScanKey scankey, - bool nextkey) + BTScanInsert key, + Buffer buf) { Page page; BTPageOpaque opaque; @@ -375,7 +361,7 @@ _bt_binsrch(Relation rel, * This can never happen on an internal page, however, since they are * never empty (an internal page must have children). */ - if (high < low) + if (unlikely(high < low)) return low; /* @@ -392,7 +378,7 @@ _bt_binsrch(Relation rel, */ high++; /* establish the loop invariant for high */ - cmpval = nextkey ? 0 : 1; /* select comparison value */ + cmpval = key->nextkey ? 0 : 1; /* select comparison value */ while (high > low) { @@ -400,7 +386,7 @@ _bt_binsrch(Relation rel, /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, keysz, scankey, page, mid); + result = _bt_compare(rel, key, page, mid); if (result >= cmpval) low = mid + 1; @@ -427,14 +413,117 @@ _bt_binsrch(Relation rel, return OffsetNumberPrev(low); } -/*---------- - * _bt_compare() -- Compare scankey to a particular tuple on the page. +/* + * Like _bt_binsrch(), but with support for caching the binary search bounds. + * Used during insertion, and only on the leaf level. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. + * Caches the bounds fields in insertstate, so that a subsequent call can + * reuse the low and strict high bound of original binary search. Callers + * that use these fields directly must be prepared for the case where + * stricthigh isn't on the same page (it exceeds maxoff for the page), and + * the case where there are no items on the page (high < low). + */ +OffsetNumber +_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +{ + BTScanInsert key = insertstate->itup_key; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high, + stricthigh; + int32 result, + cmpval; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(!key->nextkey); + + if (!insertstate->bounds_valid) + { + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + } + else + { + /* Restore result of previous binary search against same page */ + low = insertstate->low; + high = insertstate->stricthigh; + } + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + { + /* Caller can't use stricthigh */ + insertstate->bounds_valid = false; + return low; + } + + /* + * Binary search to find the first key on the page >= scan key. (nextkey + * is always false when inserting) + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * We can fall out when high == low. + */ + if (!insertstate->bounds_valid) + high++; /* establish the loop invariant for high */ + stricthigh = high; /* high initially strictly higher */ + + cmpval = 1; /* select comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + { + high = mid; + + /* + * high can only be reused by more restrictive binary search when + * it's known to be strictly greater than the original scankey + */ + if (result != 0) + stricthigh = high; + } + } + + /* + * At this point we have high == low, but be careful: they could point + * past the last slot on the page. + * + * On a leaf page, we always return the first key >= scan key (resp. > + * scan key), which could be the last slot + 1. + */ + insertstate->low = low; + insertstate->stricthigh = stricthigh; + insertstate->bounds_valid = true; + + return low; +} + + + +/*---------- + * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * - * keysz: number of key conditions to be checked (might be less than the - * number of index columns!) * page/offnum: location of btree item to be compared to. * * This routine returns: @@ -455,17 +544,17 @@ _bt_binsrch(Relation rel, */ int32 _bt_compare(Relation rel, - int keysz, - ScanKey scankey, + BTScanInsert key, Page page, OffsetNumber offnum) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; - int i; + ScanKey scankey; Assert(_bt_check_natts(rel, page, offnum)); + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); /* * Force result ">" if target item is first data item on an internal page @@ -488,7 +577,8 @@ _bt_compare(Relation rel, * _bt_first). */ - for (i = 1; i <= keysz; i++) + scankey = key->scankeys; + for (int i = 1; i <= key->keysz; i++) { Datum datum; bool isNull; @@ -574,8 +664,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat; bool nextkey; bool goback; + BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData scankeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; @@ -821,8 +911,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * We want to start the scan somewhere within the index. Set up an * insertion scankey we can use to search for the boundary point we - * identified above. The insertion scankey is built in the local - * scankeys[] array, using the keys identified by startKeys[]. + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning strategy is finalized.) */ Assert(keysCount <= INDEX_MAX_KEYS); for (i = 0; i < keysCount; i++) @@ -850,7 +941,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_parallel_done(scan); return false; } - memcpy(scankeys + i, subkey, sizeof(ScanKeyData)); + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); /* * If the row comparison is the last positioning key we accepted, @@ -882,7 +973,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (subkey->sk_flags & SK_ISNULL) break; /* can't use null keys */ Assert(keysCount < INDEX_MAX_KEYS); - memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData)); + memcpy(inskey.scankeys + keysCount, subkey, + sizeof(ScanKeyData)); keysCount++; if (subkey->sk_flags & SK_ROW_END) { @@ -928,7 +1020,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) FmgrInfo *procinfo; procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(scankeys + i, + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, cur->sk_flags, cur->sk_attno, InvalidStrategy, @@ -949,7 +1041,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(scankeys + i, + ScanKeyEntryInitialize(inskey.scankeys + i, cur->sk_flags, cur->sk_attno, InvalidStrategy, @@ -1052,12 +1144,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return false; } + /* Initialize remaining insertion scan key fields */ + inskey.nextkey = nextkey; + inskey.keysz = keysCount; + /* * Use the manufactured insertion scan key to descend the tree and * position ourselves on the target leaf page. */ - stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ, - scan->xs_snapshot); + stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -1086,7 +1181,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); + offnum = _bt_binsrch(rel, &inskey, buf); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index dc398e11867..759859c302e 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -254,6 +254,7 @@ typedef struct BTWriteState { Relation heap; Relation index; + BTScanInsert inskey; /* generic insertion scankey */ bool btws_use_wal; /* dump pages to WAL? */ BlockNumber btws_pages_alloced; /* # pages allocated */ BlockNumber btws_pages_written; /* # pages written out */ @@ -531,6 +532,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; + wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* * We need to log index creation in WAL iff WAL archiving/streaming is @@ -1076,7 +1078,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); - ScanKey indexScanKey = NULL; SortSupport sortKeys; if (merge) @@ -1089,7 +1090,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); - indexScanKey = _bt_mkscankey_nodata(wstate->index); /* Prepare SortSupport data for each column */ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); @@ -1097,7 +1097,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) for (i = 0; i < keysz; i++) { SortSupport sortKey = sortKeys + i; - ScanKey scanKey = indexScanKey + i; + ScanKey scanKey = wstate->inskey->scankeys + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; @@ -1116,8 +1116,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); } - _bt_freeskey(indexScanKey); - for (;;) { load1 = true; /* load BTSpool next ? */ diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2c05fb5e451..4438ea29c09 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -56,34 +56,37 @@ static bool _bt_check_rowcompare(ScanKey skey, * Build an insertion scan key that contains comparison data from itup * as well as comparator routines appropriate to the key datatypes. * - * The result is intended for use with _bt_compare(). + * Result is intended for use with _bt_compare(). Callers that don't + * need to fill out the insertion scankey arguments (e.g. they use an own + * ad-hoc comparison routine) can pass a NULL index tuple. */ -ScanKey +BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup) { + BTScanInsert key; ScanKey skey; TupleDesc itupdesc; - int indnatts PG_USED_FOR_ASSERTS_ONLY; int indnkeyatts; int16 *indoption; + int tupnatts; int i; itupdesc = RelationGetDescr(rel); - indnatts = IndexRelationGetNumberOfAttributes(rel); indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); indoption = rel->rd_indoption; + tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(indnkeyatts > 0); - Assert(indnkeyatts <= indnatts); - Assert(BTreeTupleGetNAtts(itup, rel) == indnatts || - BTreeTupleGetNAtts(itup, rel) == indnkeyatts); + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); /* * We'll execute search using scan key constructed on key columns. Non-key * (INCLUDE index) columns are always omitted from scan keys. */ - skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData)); - + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + key->nextkey = false; + key->keysz = Min(indnkeyatts, tupnatts); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { FmgrInfo *procinfo; @@ -96,56 +99,19 @@ _bt_mkscankey(Relation rel, IndexTuple itup) * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - arg = index_getattr(itup, i + 1, itupdesc, &null); - flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); - ScanKeyEntryInitializeWithInfo(&skey[i], - flags, - (AttrNumber) (i + 1), - InvalidStrategy, - InvalidOid, - rel->rd_indcollation[i], - procinfo, - arg); - } - - return skey; -} - -/* - * _bt_mkscankey_nodata - * Build an insertion scan key that contains 3-way comparator routines - * appropriate to the key datatypes, but no comparison data. The - * comparison data ultimately used must match the key datatypes. - * - * The result cannot be used with _bt_compare(), unless comparison - * data is first stored into the key entries. Currently this - * routine is only called by nbtsort.c and tuplesort.c, which have - * their own comparison routines. - */ -ScanKey -_bt_mkscankey_nodata(Relation rel) -{ - ScanKey skey; - int indnkeyatts; - int16 *indoption; - int i; - - indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - indoption = rel->rd_indoption; - - skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData)); - - for (i = 0; i < indnkeyatts; i++) - { - FmgrInfo *procinfo; - int flags; /* - * We can use the cached (default) support procs since no cross-type - * comparison can be needed. + * If the caller provides no tuple, the key arguments should never be + * used. Set them to NULL, anyway, to be defensive. */ - procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT); + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } + flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); ScanKeyEntryInitializeWithInfo(&skey[i], flags, (AttrNumber) (i + 1), @@ -153,19 +119,10 @@ _bt_mkscankey_nodata(Relation rel) InvalidOid, rel->rd_indcollation[i], procinfo, - (Datum) 0); + arg); } - return skey; -} - -/* - * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata. - */ -void -_bt_freeskey(ScanKey skey) -{ - pfree(skey); + return key; } /* diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 7b10fd2974c..f97a82ae7b3 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -884,7 +884,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, { Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, randomAccess); - ScanKey indexScanKey; + BTScanInsert indexScanKey; MemoryContext oldcontext; int i; @@ -919,7 +919,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - indexScanKey = _bt_mkscankey_nodata(indexRel); + indexScanKey = _bt_mkscankey(indexRel, NULL); if (state->indexInfo->ii_Expressions != NULL) { @@ -945,7 +945,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, for (i = 0; i < state->nKeys; i++) { SortSupport sortKey = state->sortKeys + i; - ScanKey scanKey = indexScanKey + i; + ScanKey scanKey = indexScanKey->scankeys + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; @@ -964,7 +964,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); } - _bt_freeskey(indexScanKey); + pfree(indexScanKey); MemoryContextSwitchTo(oldcontext); @@ -981,7 +981,7 @@ tuplesort_begin_index_btree(Relation heapRel, { Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, randomAccess); - ScanKey indexScanKey; + BTScanInsert indexScanKey; MemoryContext oldcontext; int i; @@ -1014,7 +1014,7 @@ tuplesort_begin_index_btree(Relation heapRel, state->indexRel = indexRel; state->enforceUnique = enforceUnique; - indexScanKey = _bt_mkscankey_nodata(indexRel); + indexScanKey = _bt_mkscankey(indexRel, NULL); /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(state->nKeys * @@ -1023,7 +1023,7 @@ tuplesort_begin_index_btree(Relation heapRel, for (i = 0; i < state->nKeys; i++) { SortSupport sortKey = state->sortKeys + i; - ScanKey scanKey = indexScanKey + i; + ScanKey scanKey = indexScanKey->scankeys + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; @@ -1042,7 +1042,7 @@ tuplesort_begin_index_btree(Relation heapRel, PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); } - _bt_freeskey(indexScanKey); + pfree(indexScanKey); MemoryContextSwitchTo(oldcontext); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4fb92d60a12..920f05cbb0c 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -319,6 +319,77 @@ typedef struct BTStackData typedef BTStackData *BTStack; +/* + * BTScanInsert is the btree-private state needed to find an initial position + * for an indexscan, or to insert new tuples -- an "insertion scankey" (not + * to be confused with a search scankey. It's used to descend a B-Tree + * using _bt_search. + * + * When nextkey is false (the usual case), _bt_search and _bt_binsrch will + * locate the first item >= scankey. When nextkey is true, they will locate + * the first item > scan key. + * + * keysz is the number of insertion scankeys present. + * + * scankeys is an array of scan key entries for attributes that are compared. + * During insertion, there must be a scan key for every attribute, but when + * starting a regular index scan, some can be omitted. The array is used as a + * flexible array member, though it's sized in a way that makes it possible to + * use stack allocations. + * + * NOTE: The scankeys array looks similar to the scan keys used outside the + * btree code, the kind passed to btbeginscan() and btrescan(), but it is used + * differently. The sk_func pointers in BTScanInsert point to btree comparison + * support functions (ie, 3-way comparators that return int4 values interpreted + * as <0, =0, >0), instead of boolean-returning less-than functions like int4lt. + * Also, there is exactly one entry per index column, in the same order as the + * columns in the index (although there might be fewer keys than index columns, + * indicating that we have no constraints for the remaining index columns.) + * See nbtree/README for full details. + */ + +typedef struct BTScanInsertData +{ + /* State used to locate a position at the leaf level */ + bool nextkey; + int keysz; /* Size of scankeys */ + ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ +} BTScanInsertData; + +typedef BTScanInsertData *BTScanInsert; + +/* + * Working area used during insertion, to track where to insert to. + * + * _bt_doinsert fills this in, after descending the tree to the (first legal) + * leaf page the new tuple belongs to. It is used to track the current decision + * while we perform uniqueness checks and decide the final page to insert to. + * + * (This should be private within nbtinsert.c, but it's also used by + * _bt_binsrch_insert, which is defined in nbtsearch.c) + */ +typedef struct BTInsertStateData +{ + /* Item we're inserting */ + IndexTuple itup; + Size itemsz; + BTScanInsert itup_key; + + /* leaf page to insert to */ + Buffer buf; + + /* + * Cache of bounds within the current buffer, where the new key value + * belongs to. Only used for insertions where _bt_check_unique is called. + * See _bt_binsrch_insert and _bt_findinsertloc for details. + */ + bool bounds_valid; + OffsetNumber low; + OffsetNumber stricthigh; +} BTInsertStateData; + +typedef BTInsertStateData *BTInsertState; + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -558,16 +629,12 @@ extern int _bt_pagedel(Relation rel, Buffer buf); /* * prototypes for functions in nbtsearch.c */ -extern BTStack _bt_search(Relation rel, - int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access, Snapshot snapshot); -extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, - ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, - int access, Snapshot snapshot); -extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, - ScanKey scankey, bool nextkey); -extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, - Page page, OffsetNumber offnum); +extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, + int access, Snapshot snapshot); +extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, + bool forupdate, BTStack stack, int access, Snapshot snapshot); +extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, @@ -576,9 +643,7 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, /* * prototypes for functions in nbtutils.c */ -extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup); -extern ScanKey _bt_mkscankey_nodata(Relation rel); -extern void _bt_freeskey(ScanKey skey); +extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);