From f471aa15ffb79421bdb1db9c532aba82115f0b34 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 27 Apr 2018 12:47:39 -0700 Subject: [PATCH v5 1/3] Make nbtree indexes have unique keys in tuples. Make nbtree treat all index tuples as having a heap TID trailing attribute. Heap TID becomes a first-class part of the key space on all levels of the tree. Index searches can distinguish duplicates by heap TID, though for now this is only used by insertions that need to find a leaf page to insert a tuple on. This general approach has numerous benefits for performance, and may enable a later enhancement that has nbtree vacuuming perform "retail index tuple deletion". Naively adding a new attribute to every pivot tuple has unacceptable overhead (it bloats internal pages), so suffix truncation of pivot tuples is also introduced. This will usually truncate away the "extra" heap TID attribute from pivot tuples during a leaf page split, and may also truncate away additional user attributes. This can increase fan-out when there are multiple indexed attributes, though this is of secondary importance. Truncation can only occur at the attribute granularity, which isn't particularly effective, but works well enough for now. We completely remove the logic that allows a search for free space among multiple pages full of duplicates to "get tired". This has significant benefits for free space management in secondary indexes on low-cardinality attributes. Unique checking still has to start with the first page that its heap-TID-free insertion scan key leads it to, though insertion can then quickly find the leaf page and offset that its new tuple unambiguously belongs at (in the unique case there will rarely be multiple pages full of duplicates, so being unable to descend the tree to directly find the insertion target leaf page will seldom be much of a problem). Note that this version of the patch doesn't yet deal with on-disk compatibility issues. That will follow in a later revision.
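To make the tie-breaker rule concrete, here is a minimal standalone sketch of a comparison that falls through to heap TID. This is illustrative only; the helper name and its reduced signature are hypothetical, since in the patch itself the equivalent logic is folded into _bt_compare(), which compares the user attributes and the scantid together:

    #include "postgres.h"
    #include "storage/itemptr.h"

    /*
     * Hypothetical helper (sketch only, not part of the patch).  keycmp is
     * assumed to be the result of comparing all untruncated user key
     * attributes.  Result follows _bt_compare()'s convention: <0, 0, or >0.
     */
    static int
    heap_tid_tiebreak(int keycmp, ItemPointer scantid, ItemPointer tupletid)
    {
        if (keycmp != 0)
            return keycmp;      /* user-visible key attributes decide */

        if (scantid == NULL)
            return 0;           /* caller isn't searching on heap TID */

        if (tupletid == NULL)
            return 1;           /* truncated pivot TID is "minus infinity" */

        /*
         * Logical duplicates are stored in descending heap TID order in this
         * version of the patch, so a greater scan TID means that the scan
         * key sorts before the tuple.
         */
        return ItemPointerCompare(tupletid, scantid);
    }

Note the asymmetry: a scan key that omits scantid compares as equal (it simply doesn't participate in the comparison), while a pivot tuple whose TID was truncated away compares as less than any scan key that does include one.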
--- contrib/amcheck/verify_nbtree.c | 259 ++++-- contrib/pageinspect/expected/btree.out | 2 +- contrib/pgstattuple/expected/pgstattuple.out | 10 +- src/backend/access/nbtree/README | 114 ++- src/backend/access/nbtree/nbtinsert.c | 862 ++++++++++++++---- src/backend/access/nbtree/nbtpage.c | 8 +- src/backend/access/nbtree/nbtsearch.c | 190 +++- src/backend/access/nbtree/nbtsort.c | 56 +- src/backend/access/nbtree/nbtutils.c | 244 ++++- src/backend/access/nbtree/nbtxlog.c | 41 +- src/backend/access/rmgrdesc/nbtdesc.c | 8 - src/backend/storage/page/bufpage.c | 4 +- src/backend/utils/sort/tuplesort.c | 13 +- src/include/access/nbtree.h | 97 +- src/include/access/nbtxlog.h | 19 +- src/test/regress/expected/domain.out | 4 +- src/test/regress/expected/foreign_key.out | 4 +- src/test/regress/expected/join.out | 2 +- src/test/regress/expected/truncate.out | 5 +- src/test/regress/expected/typed_table.out | 11 +- src/test/regress/expected/updatable_views.out | 18 +- src/test/regress/sql/domain.sql | 2 + src/test/regress/sql/foreign_key.sql | 2 + src/test/regress/sql/truncate.sql | 2 + src/test/regress/sql/typed_table.sql | 2 + src/test/regress/sql/updatable_views.sql | 2 + src/tools/pgindent/typedefs.list | 2 + 27 files changed, 1475 insertions(+), 508 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index a1438a2855..87a929dff9 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -45,6 +45,13 @@ PG_MODULE_MAGIC; */ #define InvalidBtreeLevel ((uint32) InvalidBlockNumber) +/* + * Convenience macro to get number of key attributes in tuple in low-context + * fashion + */ +#define BTreeTupleGetNKeyAtts(itup, rel) \ + Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel)) + /* * State associated with verifying a B-Tree index * @@ -125,26 +132,30 @@ static void bt_check_every_level(Relation rel, Relation heaprel, static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); -static ScanKey bt_right_page_check_scankey(BtreeCheckState *state); +static IndexTuple bt_right_page_check_tuple(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey); + ScanKey targetkey, ItemPointer scantid, int tupnkeyatts); static void bt_downlink_missing_check(BtreeCheckState *state); static void bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); +static inline bool invariant_l_offset(BtreeCheckState *state, + int tupnkeyatts, ScanKey key, ItemPointer scantid, + OffsetNumber upperbound); static inline bool invariant_leq_offset(BtreeCheckState *state, - ScanKey key, + int tupnkeyatts, ScanKey key, ItemPointer scantid, OffsetNumber upperbound); -static inline bool invariant_geq_offset(BtreeCheckState *state, - ScanKey key, - OffsetNumber lowerbound); -static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state, - Page other, - ScanKey key, - OffsetNumber upperbound); +static inline bool invariant_g_offset(BtreeCheckState *state, + int tupnkeyatts, ScanKey key, ItemPointer scantid, + OffsetNumber lowerbound); +static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, + Page other, int tupnkeyatts, ScanKey key, + ItemPointer scantid, OffsetNumber upperbound); static Page 
palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); +static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, + IndexTuple itup, bool isleaf); /* * bt_index_check(index regclass, heapallindexed boolean) @@ -834,8 +845,10 @@ bt_target_page_check(BtreeCheckState *state) { ItemId itemid; IndexTuple itup; - ScanKey skey; size_t tupsize; + int tupnkeyatts; + ScanKey skey; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -902,8 +915,17 @@ bt_target_page_check(BtreeCheckState *state) if (offset_is_negative_infinity(topaque, offset)) continue; - /* Build insertion scankey for current page offset */ + /* + * Build insertion scankey for current page offset/tuple. + * + * As required by _bt_mkscankey(), track number of key attributes, + * which is needed so that _bt_compare() calls handle truncated + * attributes correctly. Never count non-key attributes in + * non-truncated tuples as key attributes, though. + */ + tupnkeyatts = BTreeTupleGetNKeyAtts(itup, state->rel); skey = _bt_mkscankey(state->rel, itup); + scantid = BTreeTupleGetHeapTIDCareful(state, itup, P_ISLEAF(topaque)); /* Fingerprint leaf page tuples (those that point to the heap) */ if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid)) @@ -930,7 +952,7 @@ bt_target_page_check(BtreeCheckState *state) * and probably not markedly more effective in practice. */ if (!P_RIGHTMOST(topaque) && - !invariant_leq_offset(state, skey, P_HIKEY)) + !invariant_leq_offset(state, tupnkeyatts, skey, scantid, P_HIKEY)) { char *itid, *htid; @@ -956,11 +978,11 @@ bt_target_page_check(BtreeCheckState *state) * * Item order check * * * Check that items are stored on page in logical order, by checking - * current item is less than or equal to next item (if any). + * current item is strictly less than next item (if any). */ if (OffsetNumberNext(offset) <= max && - !invariant_leq_offset(state, skey, - OffsetNumberNext(offset))) + !invariant_l_offset(state, tupnkeyatts, skey, scantid, + OffsetNumberNext(offset))) { char *itid, *htid, @@ -1017,16 +1039,28 @@ bt_target_page_check(BtreeCheckState *state) */ else if (offset == max) { + IndexTuple righttup; ScanKey rightkey; + int righttupnkeyatts; + ItemPointer rightscantid; /* Get item in next/right page */ - rightkey = bt_right_page_check_scankey(state); + righttup = bt_right_page_check_tuple(state); - if (rightkey && - !invariant_geq_offset(state, rightkey, max)) + /* Set up right item scankey */ + if (righttup) + { + righttupnkeyatts = BTreeTupleGetNKeyAtts(righttup, state->rel); + rightkey = _bt_mkscankey(state->rel, righttup); + rightscantid = BTreeTupleGetHeapTIDCareful(state, righttup, + P_ISLEAF(topaque)); + } + + if (righttup && !invariant_g_offset(state, righttupnkeyatts, + rightkey, rightscantid, max)) { /* - * As explained at length in bt_right_page_check_scankey(), + * As explained at length in bt_right_page_check_tuple(), * there is a known !readonly race that could account for * apparent violation of invariant, which we must check for * before actually proceeding with raising error. 
Our canary @@ -1069,7 +1103,7 @@ bt_target_page_check(BtreeCheckState *state) { BlockNumber childblock = BTreeInnerTupleGetDownLink(itup); - bt_downlink_check(state, childblock, skey); + bt_downlink_check(state, childblock, skey, scantid, tupnkeyatts); } } @@ -1083,9 +1117,9 @@ bt_target_page_check(BtreeCheckState *state) } /* - * Return a scankey for an item on page to right of current target (or the + * Return an index tuple for an item on page to right of current target (or the * first non-ignorable page), sufficient to check ordering invariant on last - * item in current target page. Returned scankey relies on local memory + * item in current target page. Returned tuple relies on local memory * allocated for the child page, which caller cannot pfree(). Caller's memory * context should be reset between calls here. * @@ -1098,8 +1132,8 @@ bt_target_page_check(BtreeCheckState *state) * Note that !readonly callers must reverify that target page has not * been concurrently deleted. */ -static ScanKey -bt_right_page_check_scankey(BtreeCheckState *state) +static IndexTuple +bt_right_page_check_tuple(BtreeCheckState *state) { BTPageOpaque opaque; ItemId rightitem; @@ -1287,11 +1321,10 @@ bt_right_page_check_scankey(BtreeCheckState *state) } /* - * Return first real item scankey. Note that this relies on right page - * memory remaining allocated. + * Return first real item. Note that this relies on right page memory + * remaining allocated. */ - return _bt_mkscankey(state->rel, - (IndexTuple) PageGetItem(rightpage, rightitem)); + return (IndexTuple) PageGetItem(rightpage, rightitem); } /* @@ -1305,7 +1338,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) */ static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey) + ScanKey targetkey, ItemPointer scantid, int tupnkeyatts) { OffsetNumber offset; OffsetNumber maxoffset; @@ -1354,7 +1387,8 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, /* * Verify child page has the downlink key from target page (its parent) as - * a lower bound. + * a lower bound; downlink must be strictly less than all keys on the + * page. * * Check all items, rather than checking just the first and trusting that * the operator class obeys the transitive law. @@ -1404,14 +1438,14 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, /* * Skip comparison of target page key against "negative infinity" * item, if any. Checking it would indicate that it's not an upper - * bound, but that's only because of the hard-coding within - * _bt_compare(). + * bound, but that's only because of the hard-coding for negative + * infinity items within _bt_compare(). */ if (offset_is_negative_infinity(copaque, offset)) continue; - if (!invariant_leq_nontarget_offset(state, child, - targetkey, offset)) + if (!invariant_l_nontarget_offset(state, child, tupnkeyatts, + targetkey, scantid, offset)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("down-link lower bound invariant violated for index \"%s\"", @@ -1751,6 +1785,54 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque); } +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item? + * + * If this function returns false, convention is that caller throws error due + * to corruption.
+ */ +static inline bool +invariant_l_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key, + ItemPointer scantid, OffsetNumber upperbound) +{ + int32 cmp; + + cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target, + upperbound); + + /* + * _bt_compare interprets the absence of attributes in scan keys as + * meaning that they're not participating in a search, not as negative + * infinity (only tuples within the index are treated as negative + * infinity). Compensate for that here. + */ + if (cmp == 0) + { + BTPageOpaque topaque; + ItemId itemid; + IndexTuple ritup; + int uppnkeyatts; + ItemPointer rheaptid; + + itemid = PageGetItemId(state->target, upperbound); + ritup = (IndexTuple) PageGetItem(state->target, itemid); + uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel); + + /* Get heap TID for item to the right */ + topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, + P_ISLEAF(topaque)); + + if (uppnkeyatts == tupnkeyatts) + return scantid == NULL && rheaptid != NULL; + + return tupnkeyatts < uppnkeyatts; + } + + return cmp < 0; +} + /* * Does the invariant hold that the key is less than or equal to a given upper * bound offset item? @@ -1759,57 +1841,93 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) * to corruption. */ static inline bool -invariant_leq_offset(BtreeCheckState *state, ScanKey key, - OffsetNumber upperbound) +invariant_leq_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key, + ItemPointer scantid, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound); + cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target, + upperbound); return cmp <= 0; } /* - * Does the invariant hold that the key is greater than or equal to a given - * lower bound offset item? + * Does the invariant hold that the key is strictly greater than a given lower + * bound offset item? * * If this function returns false, convention is that caller throws error due * to corruption. */ static inline bool -invariant_geq_offset(BtreeCheckState *state, ScanKey key, - OffsetNumber lowerbound) +invariant_g_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key, + ItemPointer scantid, OffsetNumber lowerbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound); + /* + * No need to consider possibility that scankey has attributes that we + * need to force to be interpreted as negative infinity, since scan key + * has to be strictly greater than lower bound offset. + */ + cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target, + lowerbound); - return cmp >= 0; + return cmp > 0; } /* - * Does the invariant hold that the key is less than or equal to a given upper + * Does the invariant hold that the key is strictly less than a given upper * bound offset item, with the offset relating to a caller-supplied page that - * is not the current target page? Caller's non-target page is typically a - * child page of the target, checked as part of checking a property of the - * target page (i.e. the key comes from the target). + * is not the current target page? + * + * Caller's non-target page is a child page of the target, checked as part of + * checking a property of the target page (i.e. the key comes from the + * target). 
* * If this function returns false, convention is that caller throws error due * to corruption. */ static inline bool -invariant_leq_nontarget_offset(BtreeCheckState *state, - Page nontarget, ScanKey key, - OffsetNumber upperbound) +invariant_l_nontarget_offset(BtreeCheckState *state, Page nontarget, + int tupnkeyatts, ScanKey key, + ItemPointer scantid, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, nontarget, + upperbound); - return cmp <= 0; + /* + * _bt_compare interprets the absence of attributes in scan keys as + * meaning that they're not participating in a search, not as negative + * infinity (only tuples within the index are treated as negative + * infinity). Compensate for that here. + */ + if (cmp == 0) + { + ItemId itemid; + IndexTuple child; + int uppnkeyatts; + ItemPointer childheaptid; + BTPageOpaque copaque; + + copaque = (BTPageOpaque) PageGetSpecialPointer(nontarget); + itemid = PageGetItemId(nontarget, upperbound); + child = (IndexTuple) PageGetItem(nontarget, itemid); + uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel); + + /* Get heap TID for item from child/non-target */ + childheaptid = + BTreeTupleGetHeapTIDCareful(state, child, P_ISLEAF(copaque)); + + if (uppnkeyatts == tupnkeyatts) + return scantid == NULL && childheaptid != NULL; + + return tupnkeyatts < uppnkeyatts; + } + + return cmp < 0; } /* @@ -1965,3 +2083,32 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) return page; } + +/* + * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must + * be present in cases where that is mandatory. + * + * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK + * bit is effectively a proxy for whether or not the tuple is a pivot tuple. + * It may become more useful in the future, when non-pivot tuples support their + * own alternative INDEX_ALT_TID_MASK representation. + * + * Note that it is incorrect to specify the tuple as a non-pivot when passing a + * leaf tuple that came from the high key offset, since that is actually a + * pivot tuple. 
+ */ +static inline ItemPointer +BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, + bool nonpivot) +{ + ItemPointer result = BTreeTupleGetHeapTID(itup); + BlockNumber targetblock = state->targetblock; + + if (result == NULL && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", + targetblock, RelationGetRelationName(state->rel)))); + + return result; +} diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 2aaa4df53b..07c2dcd771 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -5,7 +5,7 @@ CREATE INDEX test1_a_idx ON test1 USING btree (a); SELECT * FROM bt_metap('test1_a_idx'); -[ RECORD 1 ]-----------+------- magic | 340322 -version | 3 +version | 4 root | 1 level | 0 fastroot | 1 diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 9858ea69d4..9920dbfd40 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -48,7 +48,7 @@ select version, tree_level, from pgstatindex('test_pkey'); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -58,7 +58,7 @@ select version, tree_level, from pgstatindex('test_pkey'::text); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -68,7 +68,7 @@ select version, tree_level, from pgstatindex('test_pkey'::name); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -78,7 +78,7 @@ select version, tree_level, from pgstatindex('test_pkey'::regclass); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select pg_relpages('test'); @@ -232,7 +232,7 @@ create index test_partition_hash_idx on test_partition using hash (a); select pgstatindex('test_partition_idx'); pgstatindex ------------------------------ - (3,0,8192,0,0,0,0,0,NaN,NaN) + (4,0,8192,0,0,0,0,0,NaN,NaN) (1 row) select pgstathashindex('test_partition_hash_idx'); diff --git 
a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 3680e69b89..dc6c65d201 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -34,30 +34,47 @@ Differences to the Lehman & Yao algorithm We have made the following changes in order to incorporate the L&Y algorithm into Postgres: -The requirement that all btree keys be unique is too onerous, -but the algorithm won't work correctly without it. Fortunately, it is -only necessary that keys be unique on a single tree level, because L&Y -only use the assumption of key uniqueness when re-finding a key in a -parent page (to determine where to insert the key for a split page). -Therefore, we can use the link field to disambiguate multiple -occurrences of the same user key: only one entry in the parent level -will be pointing at the page we had split. (Indeed we need not look at -the real "key" at all, just at the link field.) We can distinguish -items at the leaf level in the same way, by examining their links to -heap tuples; we'd never have two items for the same heap tuple. +The requirement that all btree keys be unique is satisfied by treating +heap TID as a tie-breaker attribute. Logical duplicates are sorted in +descending item pointer order. We don't use btree keys to +disambiguate downlinks from the internal pages during a page split, +though: only one entry in the parent level will be pointing at the +page we just split, so the link fields can be used to re-find +downlinks in the parent via a linear search. -Lehman and Yao assume that the key range for a subtree S is described -by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent -page. This does not work for nonunique keys (for example, if we have -enough equal keys to spread across several leaf pages, there *must* be -some equal bounding keys in the first level up). Therefore we assume -Ki <= v <= Ki+1 instead. A search that finds exact equality to a -bounding key in an upper tree level must descend to the left of that -key to ensure it finds any equal keys in the preceding page. An -insertion that sees the high key of its target page is equal to the key -to be inserted has a choice whether or not to move right, since the new -key could go on either page. (Currently, we try to find a page where -there is room for the new key without a split.) +Lehman and Yao require that the key range for a subtree S is described +by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the +parent page, but do not account for the need to search the tree based +only on leading index attributes in a composite index. Since heap TID +is always used to make btree keys unique (even in unique indexes), +every btree index is treated as a composite index internally. A +search that finds exact equality to a pivot tuple in an upper tree +level must descend to the left of that key to ensure it finds any +equal keys, even when scan values were provided for all attributes. +An insertion that sees that the high key of its target page is equal +to the key to be inserted cannot move right, since the downlink for +the right sibling in the parent must always be strictly less than +right sibling keys (this is always possible because the leftmost +downlink on any non-leaf level is always a negative infinity +downlink). 
+ +We might be able to avoid moving left in the event of a full match on +all attributes up to and including the heap TID attribute, but that +would be a very narrow win, since it's rather unlikely that heap TID +will be an exact match. We can avoid moving left unnecessarily when +all user-visible keys are equal by avoiding exact equality; a +sentinel value that's less than any possible heap TID is used by most +index scans. This is effective because of suffix truncation. An +"extra" heap TID attribute in pivot tuples is almost always avoided. +All truncated attributes compare as minus infinity, even against a +sentinel value, and the sentinel value is less than any real TID +value, so an unnecessary move to the left is avoided regardless of +whether or not a heap TID is present in the otherwise-equal pivot +tuple. Consistently moving left on full equality is also needed by +page deletion, which re-finds a leaf page by descending the tree while +searching on the leaf page's high key. If we wanted to avoid moving +left without breaking page deletion, we'd have to avoid suffix +truncation, which could never be worth it. Lehman and Yao don't require read locks, but assume that in-memory copies of tree pages are unshared. Postgres shares in-memory buffers @@ -610,21 +627,25 @@ scanned to decide whether to return the entry and whether the scan can stop (see _bt_checkkeys()). We use term "pivot" index tuples to distinguish tuples which don't point -to heap tuples, but rather used for tree navigation. Pivot tuples includes -all tuples on non-leaf pages and high keys on leaf pages. Note that pivot -index tuples are only used to represent which part of the key space belongs -on each page, and can have attribute values copied from non-pivot tuples -that were deleted and killed by VACUUM some time ago. In principle, we could -truncate away attributes that are not needed for a page high key during a leaf -page split, provided that the remaining attributes distinguish the last index -tuple on the post-split left page as belonging on the left page, and the first -index tuple on the post-split right page as belonging on the right page. This -optimization is sometimes called suffix truncation, and may appear in a future -release. Since the high key is subsequently reused as the downlink in the -parent page for the new right page, suffix truncation can increase index -fan-out considerably by keeping pivot tuples short. INCLUDE indexes similarly -truncate away non-key attributes at the time of a leaf page split, -increasing fan-out. +to heap tuples, but are used only for tree navigation. Pivot tuples +include all tuples on non-leaf pages and high keys on leaf pages. Note +that pivot index tuples are only used to represent which part of the key +space belongs on each page, and can have attribute values copied from +non-pivot tuples that were deleted and killed by VACUUM some time ago. + +We truncate away attributes that are not needed for a page high key during +a leaf page split, provided that the remaining attributes distinguish the +last index tuple on the post-split left page as belonging on the left +page, and the first index tuple on the post-split right page as belonging +on the right page. A truncated tuple logically retains the truncated +suffix key attributes, which implicitly have "negative infinity" as their +value. This optimization is called suffix truncation.
Since the high key +is subsequently reused as the downlink in the parent page for the new +right page, suffix truncation can increase index fan-out considerably by +keeping pivot tuples short. INCLUDE indexes are guaranteed to have +non-key attributes truncated at the time of a leaf page split, but may +also have some key attributes truncated away, based on the usual criteria +for key attributes. Notes About Data Representation ------------------------------- @@ -658,4 +679,19 @@ downlink. The first data item on each such page has no lower bound routines must treat it accordingly. The actual key stored in the item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer -than key. +than key. Suffix truncation's negative infinity attributes behave in +the same way. + +Non-leaf pages only truly need to truncate their first item to zero +attributes at the leftmost level, since that truly is negative infinity. +All other negative infinity items are only really negative infinity +within the subtree that the page is at the root of (or is a leftmost +page within). We truncate away all attributes of the first item on +non-leaf pages just the same, to save a little space. If we ever +avoided zero-truncating items on pages where that doesn't accurately +represent the absolute separation of the keyspace, we'd be left with +"low key" items on internal pages -- a key value that can be used as a +lower bound on items on the page, much like the high key is an upper +bound. (Actually, that would even be true of "true" negative infinity +items. One can think of rightmost pages as implicitly containing +"positive infinity" high keys.) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 582e5b0652..0cb8bb1816 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -24,29 +24,44 @@ #include "storage/lmgr.h" #include "storage/predicate.h" #include "storage/smgr.h" +#include "utils/datum.h" #include "utils/tqual.h" /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 +#define STACK_SPLIT_POINTS 15 + +typedef enum +{ + /* strategy to use for a call to FindSplitData */ + SPLIT_DEFAULT, /* give some weight to truncation */ + SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ + SPLIT_SINGLE_VALUE /* leave left page almost empty */ +} SplitMode; + +typedef struct +{ + /* FindSplitData candidate split */ + int delta; /* size delta */ + bool newitemonleft; /* new item on left or right of split */ + OffsetNumber firstright; /* split point */ +} SplitPoint; typedef struct { /* context data for _bt_checksplitloc */ Size newitemsz; /* size of new item to be inserted */ - int fillfactor; /* needed when splitting rightmost page */ + int fillfactor; /* needed for weighted splits */ bool is_leaf; /* T if splitting a leaf page */ - bool is_rightmost; /* T if splitting a rightmost page */ + bool is_weighted; /* T if weighted (e.g. rightmost) split */ OffsetNumber newitemoff; /* where the new item is to be inserted */ int leftspace; /* space available for items on left page */ int rightspace; /* space available for items on right page */ int olddataitemstotal; /* space taken by old items */ - bool have_split; /* found a valid split? 
*/ - - /* these fields valid only if have_split is true */ - bool newitemonleft; /* new item on left or right of best split */ - OffsetNumber firstright; /* best split point */ - int best_delta; /* best size delta so far */ + int maxsplit; /* Maximum number of splits */ + int nsplits; /* Current number of splits */ + SplitPoint *splits; /* Sorted by delta */ } FindSplitData; @@ -76,12 +91,18 @@ static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, - OffsetNumber newitemoff, - Size newitemsz, - bool *newitemonleft); -static void _bt_checksplitloc(FindSplitData *state, + SplitMode mode, OffsetNumber newitemoff, + Size newitemsz, IndexTuple newitem, bool *newitemonleft); +static int _bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int dataitemstoleft, Size firstoldonrightsz); +static int _bt_perfect_firstdiff(Relation rel, Page page, + OffsetNumber newitemoff, IndexTuple newitem, + int nsplits, SplitPoint *splits, SplitMode *secondmode); +static int _bt_split_firstdiff(Relation rel, Page page, OffsetNumber newitemoff, + IndexTuple newitem, SplitPoint *split); +static int _bt_tuple_firstdiff(Relation rel, IndexTuple lastleft, + IndexTuple firstright, bool *identical); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, @@ -113,9 +134,12 @@ _bt_doinsert(Relation rel, IndexTuple itup, bool is_unique = false; int indnkeyatts; ScanKey itup_scankey; + ItemPointer itup_scantid; BTStack stack = NULL; Buffer buf; OffsetNumber offset; + Page page; + BTPageOpaque lpageop; bool fastpath; indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); @@ -123,6 +147,8 @@ _bt_doinsert(Relation rel, IndexTuple itup, /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, itup); + /* we use a heap TID with scan key if this isn't unique case */ + itup_scantid = (checkUnique == UNIQUE_CHECK_NO ? &itup->t_tid : NULL); /* * It's very common to have an index on an auto-incremented or @@ -149,8 +175,6 @@ top: if (RelationGetTargetBlock(rel) != InvalidBlockNumber) { Size itemsz; - Page page; - BTPageOpaque lpageop; /* * Conditionally acquire exclusive lock on the buffer before doing any @@ -180,8 +204,8 @@ top: !P_IGNORE(lpageop) && (PageGetFreeSpace(page) > itemsz) && PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && - _bt_compare(rel, indnkeyatts, itup_scankey, page, - P_FIRSTDATAKEY(lpageop)) > 0) + _bt_compare(rel, indnkeyatts, itup_scankey, itup_scantid, + page, P_FIRSTDATAKEY(lpageop)) > 0) { /* * The right-most block should never have an incomplete split. @@ -220,8 +244,8 @@ top: * Find the first page containing this key. Buffer returned by * _bt_search() is locked in exclusive mode. */ - stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, - NULL); + stack = _bt_search(rel, indnkeyatts, itup_scankey, itup_scantid, false, + &buf, BT_WRITE, NULL); } /* @@ -231,12 +255,13 @@ top: * NOTE: obviously, _bt_check_unique can only detect keys that are already * in the index; so it cannot defend against concurrent insertions of the * same key. We protect against that by means of holding a write lock on - * the target page. 
Any other would-be inserter of the same key must - * acquire a write lock on the same target page, so only one would-be - * inserter can be making the check at one time. Furthermore, once we are - * past the check we hold write locks continuously until we have performed - * our insertion, so no later inserter can fail to see our insertion. - * (This requires some care in _bt_findinsertloc.) + * the first page the value could be on, regardless of the value of its + * implicit heap TID tie-breaker attribute. Any other would-be inserter + * of the same key must acquire a write lock on the same page, so only one + * would-be inserter can be making the check at one time. Furthermore, + * once we are past the check we hold write locks continuously until we + * have performed our insertion, so no later inserter can fail to see our + * insertion. (This requires some care in _bt_findinsertloc.) * * If we must wait for another xact, we release the lock while waiting, * and then must start over completely. @@ -250,7 +275,11 @@ top: TransactionId xwait; uint32 speculativeToken; - offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false); + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(itup_scantid == NULL); + offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, NULL, + P_FIRSTDATAKEY(lpageop), false); xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, checkUnique, &is_unique, &speculativeToken); @@ -288,7 +317,7 @@ top: * attributes are not considered part of the key space. */ CheckForSerializableConflictIn(rel, NULL, buf); - /* do the insertion */ + /* do the insertion, possibly on a page to the right in unique case */ _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup, stack, heapRel); _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); @@ -553,11 +582,11 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, offset = OffsetNumberNext(offset); else { - /* If scankey == hikey we gotta check the next page too */ + /* If scankey <= hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; - if (!_bt_isequal(itupdesc, page, P_HIKEY, - indnkeyatts, itup_scankey)) + /* _bt_isequal()'s special NULL semantics not required here */ + if (_bt_compare(rel, indnkeyatts, itup_scankey, NULL, page, P_HIKEY) > 0) break; /* Advance to next non-dead page --- there must be one */ for (;;) @@ -601,31 +630,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* * _bt_findinsertloc() -- Finds an insert location for a tuple * - * If the new key is equal to one or more existing keys, we can - * legitimately place it anywhere in the series of equal keys --- in fact, - * if the new key is equal to the page's "high key" we can place it on - * the next page. If it is equal to the high key, and there's not room - * to insert the new tuple on the current page without splitting, then - * we can move right hoping to find more free space and avoid a split. - * (We should not move right indefinitely, however, since that leads to - * O(N^2) insertion behavior in the presence of many equal keys.) - * Once we have chosen the page to put the key on, we'll insert it before - * any existing equal keys because of the way _bt_binsrch() works. - * - * If there's not enough room in the space, we try to make room by - * removing any LP_DEAD tuples. - * * On entry, *bufptr and *offsetptr point to the first legal position - * where the new tuple could be inserted. 
The caller should hold an - * exclusive lock on *bufptr. *offsetptr can also be set to - * InvalidOffsetNumber, in which case the function will search for the - * right location within the page if needed. On exit, they point to the - * chosen insert location. If _bt_findinsertloc decides to move right, - * the lock and pin on the original page will be released and the new - * page returned to the caller is exclusively locked instead. + * where the new tuple could be inserted if we were to treat it as having + * no implicit heap TID; only callers that just called _bt_check_unique() + * provide this hint (all other callers should set *offsetptr to + * InvalidOffsetNumber). The caller should hold an exclusive lock on + * *bufptr in all cases. On exit, they both point to the chosen insert + * location in all cases. If _bt_findinsertloc decides to move right, the + * lock and pin on the original page will be released, and the new page + * returned to the caller is exclusively locked instead. + * + * This is also where opportunistic microvacuuming of LP_DEAD tuples + * occurs. * * newtup is the new tuple we're inserting, and scankey is an insertion - * type scan key for it. + * type scan key for it. We take a "scantid" heap TID attribute value + * from newtup directly. */ static void _bt_findinsertloc(Relation rel, @@ -641,9 +661,9 @@ _bt_findinsertloc(Relation rel, Page page = BufferGetPage(buf); Size itemsz; BTPageOpaque lpageop; - bool movedright, - vacuumed; + bool hintinvalidated; OffsetNumber newitemoff; + OffsetNumber lowitemoff; OffsetNumber firstlegaloff = *offsetptr; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -673,59 +693,30 @@ _bt_findinsertloc(Relation rel, errtableconstraint(heapRel, RelationGetRelationName(rel)))); - /*---------- * If we will need to split the page to put the item on this page, * check whether we can put the tuple somewhere to the right, * instead. Keep scanning right until we * (a) find a page with enough free space, * (b) reach the last page where the tuple can legally go, or * (c) get tired of searching. * (c) is not flippant; it is important because if there are many * pages' worth of equal keys, it's better to split one of the early * pages than to scan all the way to the end of the run of equal keys * on every insert. We implement "get tired" as a random choice, * since stopping after scanning a fixed number of pages wouldn't work * well (we'd never reach the right-hand side of previously split * pages). Currently the probability of moving right is set at 0.99, * which may seem too high to change the behavior much, but it does an * excellent job of preventing O(N^2) behavior with many equal keys. *---------- + /* firstlegaloff/offsetptr hint (if any) assumed valid initially */ + hintinvalidated = false; + + /* + * TODO: Restore the logic for finding a page to insert on in the event of + * many duplicates for pre-pg_upgrade indexes. The whole search through + * pages of logical duplicates to determine where to insert seems like + * something that has little upside, but that doesn't make it okay to + * ignore the performance characteristics in the window after pg_upgrade + * is run and before a REINDEX can run to bump BTREE_VERSION.
*/ - movedright = false; - vacuumed = false; - while (PageGetFreeSpace(page) < itemsz) + while (true) { Buffer rbuf; BlockNumber rblkno; - /* - * before considering moving right, see if we can obtain enough space - * by erasing LP_DEAD items - */ - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) - { - _bt_vacuum_one_page(rel, buf, heapRel); - - /* - * remember that we vacuumed this page, because that makes the - * hint supplied by the caller invalid - */ - vacuumed = true; - - if (PageGetFreeSpace(page) >= itemsz) - break; /* OK, now we have enough space */ - } - - /* - * nope, so check conditions (b) and (c) enumerated above - */ if (P_RIGHTMOST(lpageop) || - _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || - random() <= (MAX_RANDOM_VALUE / 100)) + _bt_compare(rel, keysz, scankey, &newtup->t_tid, page, P_HIKEY) <= 0) break; /* - * step right to next non-dead page + * step right to next non-dead page. this is only needed for unique + * indexes, and pg_upgrade'd indexes that still use BTREE_VERSION 2 or + * 3, where heap TID isn't considered to be a part of the keyspace. * * must write-lock that page before releasing write lock on current * page; else someone else's _bt_check_unique scan could fail to see @@ -764,24 +755,40 @@ _bt_findinsertloc(Relation rel, } _bt_relbuf(rel, buf); buf = rbuf; - movedright = true; - vacuumed = false; + hintinvalidated = true; + } + + Assert(P_ISLEAF(lpageop)); + + /* + * Perform micro-vacuuming of the page we're about to insert tuple on to + * if it looks like it has LP_DEAD items. + */ + if (P_HAS_GARBAGE(lpageop) && PageGetFreeSpace(page) < itemsz) + { + _bt_vacuum_one_page(rel, buf, heapRel); + + hintinvalidated = true; } /* - * Now we are on the right page, so find the insert position. If we moved - * right at all, we know we should insert at the start of the page. If we - * didn't move right, we can use the firstlegaloff hint if the caller - * supplied one, unless we vacuumed the page which might have moved tuples - * around making the hint invalid. If we didn't move right or can't use - * the hint, find the position by searching. + * Consider using caller's hint to avoid repeated binary search effort. + * + * Note that the hint is only provided by callers that checked uniqueness. + * The hint is used as a lower bound for a new binary search, since + * caller's original binary search won't have specified a scan tid. 
*/ - if (movedright) - newitemoff = P_FIRSTDATAKEY(lpageop); - else if (firstlegaloff != InvalidOffsetNumber && !vacuumed) - newitemoff = firstlegaloff; + if (firstlegaloff == InvalidOffsetNumber || hintinvalidated) + lowitemoff = P_FIRSTDATAKEY(lpageop); else - newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + { + Assert(firstlegaloff == _bt_binsrch(rel, buf, keysz, scankey, NULL, + P_FIRSTDATAKEY(lpageop), false)); + lowitemoff = firstlegaloff; + } + + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, &newtup->t_tid, + lowitemoff, false); *bufptr = buf; *offsetptr = newitemoff; @@ -840,11 +847,12 @@ _bt_insertonpg(Relation rel, /* child buffer must be given iff inserting on an internal page */ Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); /* tuple must have appropriate number of attributes */ + Assert(BTreeTupleGetNAtts(itup, rel) > 0); Assert(!P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) == IndexRelationGetNumberOfAttributes(rel)); Assert(P_ISLEAF(lpageop) || - BTreeTupleGetNAtts(itup, rel) == + BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); /* The caller should've finished any incomplete splits already. */ @@ -889,8 +897,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Choose the split point */ - firstright = _bt_findsplitloc(rel, page, - newitemoff, itemsz, + firstright = _bt_findsplitloc(rel, page, SPLIT_DEFAULT, + newitemoff, itemsz, itup, &newitemonleft); /* split the buffer into left and right halves */ @@ -1132,8 +1140,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber i; bool isleaf; IndexTuple lefthikey; - int indnatts = IndexRelationGetNumberOfAttributes(rel); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); /* Acquire a new page to split into */ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -1203,7 +1209,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); - Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts); + Assert(BTreeTupleGetNAtts(item, rel) > 0); + Assert(BTreeTupleGetNAtts(item, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, false, false) == InvalidOffsetNumber) { @@ -1217,8 +1225,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, /* * The "high key" for the new left page will be the first key that's going - * to go into the new right page. This might be either the existing data - * item at position firstright, or the incoming tuple. + * to go into the new right page, or possibly a truncated version if this + * is a leaf page split. This might be either the existing data item at + * position firstright, or the incoming tuple. */ leftoff = P_HIKEY; if (!newitemonleft && newitemoff == firstright) @@ -1236,25 +1245,55 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, } /* - * Truncate non-key (INCLUDE) attributes of the high key item before - * inserting it on the left page. This only needs to happen at the leaf - * level, since in general all pivot tuple values originate from leaf - * level high keys. This isn't just about avoiding unnecessary work, - * though; truncating unneeded key attributes (more aggressive suffix - * truncation) can only be performed at the leaf level anyway. 
This is - * because a pivot tuple in a grandparent page must guide a search not + * Truncate attributes of the high key item before inserting it on the + * left page. This can only happen at the leaf level, since in general + * all pivot tuple values originate from leaf level high keys. This isn't + * just about avoiding unnecessary work, though; truncating unneeded key + * suffix attributes can only be performed at the leaf level anyway. This + * is because a pivot tuple in a grandparent page must guide a search not * only to the correct parent page, but also to the correct leaf page. + * + * Note that non-key (INCLUDE) attributes are always truncated away here. + * Additional key attributes are truncated away when they're not required + * to correctly separate the key space. */ - if (indnatts != indnkeyatts && isleaf) + if (isleaf) { - lefthikey = _bt_nonkey_truncate(rel, item); + OffsetNumber lastleftoff; + IndexTuple lastleft; + + /* + * Determine which tuple is on the left side of the split point, and + * generate truncated copy of the right tuple. Truncate as + * aggressively as possible without generating a high key for the left + * side of the split (and later downlink for the right side) that + * fails to distinguish each side. The new high key needs to be + * strictly less than all tuples on the right side of the split, but + * can be equal to items on the left side of the split. + * + * Handle the case where the incoming tuple is about to become the + * last item on the left side of the split. + */ + if (newitemonleft && newitemoff == firstright) + lastleft = newitem; + else + { + lastleftoff = OffsetNumberPrev(firstright); + itemid = PageGetItemId(origpage, lastleftoff); + lastleft = (IndexTuple) PageGetItem(origpage, itemid); + } + + Assert(lastleft != item); + lefthikey = _bt_suffix_truncate(rel, lastleft, item); itemsz = IndexTupleSize(lefthikey); itemsz = MAXALIGN(itemsz); } else lefthikey = item; - Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts); + Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); + Assert(BTreeTupleGetNAtts(lefthikey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, false, false) == InvalidOffsetNumber) { @@ -1447,7 +1486,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; - bool loglhikey = false; xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; @@ -1476,22 +1514,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, if (newitemonleft) XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); - /* Log left page */ - if (!isleaf || indnatts != indnkeyatts) - { - /* - * We must also log the left page's high key. There are two - * reasons for that: right page's leftmost key is suppressed on - * non-leaf levels and in covering indexes included columns are - * truncated from high keys. Show it as belonging to the left - * page buffer, so that it is not stored if XLogInsert decides it - * needs a full-page image of the left page. - */ - itemid = PageGetItemId(origpage, P_HIKEY); - item = (IndexTuple) PageGetItem(origpage, itemid); - XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); - loglhikey = true; - } + /* Log left page. We must also log the left page's high key. 
*/ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); /* * Log the contents of the right page in the format understood by @@ -1509,9 +1535,7 @@ (char *) rightpage + ((PageHeader) rightpage)->pd_upper, ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); - xlinfo = newitemonleft ? - (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) : - (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R); + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; recptr = XLogInsert(RM_BTREE_ID, xlinfo); PageSetLSN(origpage, recptr); @@ -1548,6 +1572,39 @@ * for it, we might find ourselves with too little room on the page that * it needs to go into!) * + * We also give some weight to suffix truncation in deciding a split point + * on leaf pages. We try to select a point where a distinguishing attribute + * appears earlier in the new high key for the left side of the split, in + * order to maximize the number of trailing attributes that can be truncated + * away. Generally speaking, only candidate split points that fall within + * an acceptable space utilization range are considered. This is even + * useful with pages that only have a single (non-TID) attribute, since it's + * important to avoid appending an explicit heap TID attribute to the new + * pivot tuple (high key/downlink) when it cannot actually be truncated. + * Avoiding appending a heap TID can be thought of as a "logical" suffix + * truncation that "removes" the final attribute in the new high key for the + * new left page. + * + * We do all we can to avoid having to append a heap TID in the new high + * key. We may have to call ourselves recursively in many duplicates mode. + * This happens when a heap TID would otherwise be appended, but the page + * isn't completely full of logical duplicates (there may be as few as two + * distinct values). Many duplicates mode has no hard requirements for + * space utilization, though it still keeps the use of space balanced as a + * non-binding secondary goal. This significantly improves fan-out in + * practice, at least with most affected workloads. + * + * Many duplicates mode may lead to slightly inferior space utilization when + * values are spaced apart at fixed intervals, even on levels above the leaf + * level. Even when that happens, many duplicates mode will probably still + * beat the generic default strategy. Not having groups of duplicates + * straddle two leaf pages is likely to more than make up for having sparser + * pages, since "false sharing" of leaf blocks by index scans is avoided. A + * point lookup will only visit one leaf page, not two. (This kind of false + * sharing may also have negative implications for page deletion during + * vacuuming, and may artificially increase the number of pages subsequently + * dirtied.) + * * If the page is the rightmost page on its level, we instead try to arrange * to leave the left split page fillfactor% full. In this way, when we are * inserting successively increasing keys (consider sequences, timestamps, @@ -1556,6 +1613,17 @@ * This is the same as nbtsort.c produces for a newly-created tree. Note * that leaf and nonleaf pages use different fillfactors.
* + * If called recursively in single value mode, we also try to arrange to + * leave the left split page fillfactor% full, though we arrange to use a + * fillfactor that leaves the left page mostly empty and the right page + * mostly full, rather than the other way around. This greatly helps with + * space management in cases where tuples with the same attribute values + * span multiple pages. Newly inserted duplicates will tend to have higher + * heap TID values, so we'll end up splitting the same page again and again + * as even more duplicates are inserted. (The heap TID attribute has + * descending sort order, so ascending heap TID values continually split the + * same low page). + * * We are passed the intended insert position of the new tuple, expressed as * the offsetnumber of the tuple it must go in front of. (This could be * maxoff+1 if the tuple is to go at the end.) @@ -1568,8 +1636,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, static OffsetNumber _bt_findsplitloc(Relation rel, Page page, + SplitMode mode, OffsetNumber newitemoff, Size newitemsz, + IndexTuple newitem, bool *newitemonleft) { BTPageOpaque opaque; @@ -1581,19 +1651,32 @@ _bt_findsplitloc(Relation rel, rightspace, goodenough, olddataitemstotal, - olddataitemstoleft; + olddataitemstoleft, + perfectfirstdiff, + bestfirstdiff, + lowsplit; bool goodenoughfound; + SplitPoint splits[STACK_SPLIT_POINTS]; + SplitMode secondmode; + OffsetNumber finalfirstright; opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ - newitemsz += sizeof(ItemIdData); + maxoff = PageGetMaxOffsetNumber(page); /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = PageGetPageSize(page) - SizeOfPageHeaderData - MAXALIGN(sizeof(BTPageOpaqueData)); + /* + * Conservatively assume that suffix truncation cannot avoid adding a heap + * TID to the left half's new high key when splitting at the leaf level. + * Accounting for the size of the rest of the high key comes later, since + * it's considered for every candidate split point. 
+ */ + if (P_ISLEAF(opaque)) + leftspace -= MAXALIGN(sizeof(ItemPointerData)); + /* The right page will have the same high key as the old page */ if (!P_RIGHTMOST(opaque)) { @@ -1605,18 +1688,37 @@ _bt_findsplitloc(Relation rel, /* Count up total space in data items without actually scanning 'em */ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); - state.newitemsz = newitemsz; + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + state.newitemsz = newitemsz + sizeof(ItemIdData); state.is_leaf = P_ISLEAF(opaque); - state.is_rightmost = P_RIGHTMOST(opaque); - state.have_split = false; + state.is_weighted = P_RIGHTMOST(opaque) || mode == SPLIT_SINGLE_VALUE; if (state.is_leaf) - state.fillfactor = RelationGetFillFactor(rel, - BTREE_DEFAULT_FILLFACTOR); + { + if (mode != SPLIT_SINGLE_VALUE) + state.fillfactor = RelationGetFillFactor(rel, + BTREE_DEFAULT_FILLFACTOR); + else + state.fillfactor = BTREE_SINGLEVAL_FILLFACTOR; + + if (mode == SPLIT_DEFAULT) + state.maxsplit = Min(Max(3, maxoff / 16), STACK_SPLIT_POINTS); + else if (mode == SPLIT_MANY_DUPLICATES) + state.maxsplit = maxoff; + else + state.maxsplit = 1; + } else + { + Assert(mode == SPLIT_DEFAULT); + state.fillfactor = BTREE_NONLEAF_FILLFACTOR; - state.newitemonleft = false; /* these just to keep compiler quiet */ - state.firstright = 0; - state.best_delta = 0; + state.maxsplit = 1; + } + state.nsplits = 0; + if (mode != SPLIT_MANY_DUPLICATES) + state.splits = splits; + else + state.splits = palloc(sizeof(SplitPoint) * maxoff); state.leftspace = leftspace; state.rightspace = rightspace; state.olddataitemstotal = olddataitemstotal; @@ -1625,11 +1727,13 @@ _bt_findsplitloc(Relation rel, /* * Finding the best possible split would require checking all the possible * split points, because of the high-key and left-key special cases. - * That's probably more work than it's worth; instead, stop as soon as we - * find a "good-enough" split, where good-enough is defined as an - * imbalance in free space of no more than pagesize/16 (arbitrary...) This - * should let us stop near the middle on most pages, instead of plowing to - * the end. + * That's probably more work than it's worth in default mode; instead, + * stop as soon as we find all "good-enough" splits, where good-enough is + * defined as an imbalance in free space of no more than pagesize/16 + * (arbitrary...) This should let us stop near the middle on most pages, + * instead of plowing to the end. Many duplicates mode does consider + * all choices, while single value mode gives up as soon as it finds a + * good enough split point. */ goodenough = leftspace / 16; @@ -1639,13 +1743,13 @@ _bt_findsplitloc(Relation rel, */ olddataitemstoleft = 0; goodenoughfound = false; - maxoff = PageGetMaxOffsetNumber(page); for (offnum = P_FIRSTDATAKEY(opaque); offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { Size itemsz; + int delta; itemid = PageGetItemId(page, offnum); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); @@ -1654,28 +1758,38 @@ _bt_findsplitloc(Relation rel, * Will the new item go to left or right of split? */ if (offnum > newitemoff) - _bt_checksplitloc(&state, offnum, true, - olddataitemstoleft, itemsz); + delta = _bt_checksplitloc(&state, offnum, true, + olddataitemstoleft, itemsz); else if (offnum < newitemoff) - _bt_checksplitloc(&state, offnum, false, - olddataitemstoleft, itemsz); + delta = _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); else { /* need to try it both ways! 
*/ _bt_checksplitloc(&state, offnum, true, olddataitemstoleft, itemsz); - _bt_checksplitloc(&state, offnum, false, - olddataitemstoleft, itemsz); + delta = _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); } - /* Abort scan once we find a good-enough choice */ - if (state.have_split && state.best_delta <= goodenough) - { + /* + * Abort default mode scan once we've found a good-enough choice and have + * reached the point where we've stopped finding new good-enough choices. + */ + if (state.nsplits > 0 && state.splits[0].delta <= goodenough) goodenoughfound = true; + + if (mode == SPLIT_DEFAULT && goodenoughfound && delta > goodenough) + break; + + /* + * Single value mode does not expect to be able to truncate; might as + * well give up quickly once a good enough split point is found. + */ + if (mode == SPLIT_SINGLE_VALUE && goodenoughfound) break; - } olddataitemstoleft += itemsz; } @@ -1692,12 +1806,80 @@ _bt_findsplitloc(Relation rel, * I believe it is not possible to fail to find a feasible split, but just * in case ... */ - if (!state.have_split) + if (state.nsplits == 0) elog(ERROR, "could not find a feasible split point for index \"%s\"", RelationGetRelationName(rel)); - *newitemonleft = state.newitemonleft; - return state.firstright; + /* + * Search among acceptable split points for the entry whose enclosing pair + * of tuples has the earliest differing attribute. The general idea is to + * maximize the effectiveness of suffix truncation without affecting the + * balance of space on each side of the split very much. + * + * First find the lowest possible first differing attribute among the + * array of acceptable split points -- the "perfect" firstdiff. This + * allows us to return early without wasting cycles on calculating the + * first differing attribute for all candidate splits when that clearly + * cannot improve our choice. This optimization is important for several + * common cases, including insertion into a primary key index on an + * auto-incremented or monotonically increasing integer column. + * + * This is also the point at which we decide to either finish splitting + * the page using the default strategy, or, alternatively, to do a second + * pass over the page using a different strategy. The second pass may be + * in many duplicates mode, or in single value mode. + */ + perfectfirstdiff = 0; + secondmode = SPLIT_DEFAULT; + if (state.is_leaf && mode == SPLIT_DEFAULT) + perfectfirstdiff = _bt_perfect_firstdiff(rel, page, newitemoff, newitem, + state.nsplits, state.splits, + &secondmode); + + /* newitemonleft output parameter is set recursively */ + if (secondmode != SPLIT_DEFAULT) + return _bt_findsplitloc(rel, page, secondmode, newitemoff, newitemsz, + newitem, newitemonleft); + + /* + * Now actually search among acceptable split points for the entry that + * allows suffix truncation to truncate away the maximum possible number + * of attributes.
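The selection step that follows can be condensed into a standalone sketch (choose_split and its precomputed firstdiff array are inventions of this illustration; the patch computes each firstdiff lazily with _bt_split_firstdiff):

	#include <limits.h>

	/*
	 * Candidates arrive ordered by delta (best balance first).  Return the
	 * index of the candidate whose enclosing tuples diverge at the earliest
	 * attribute, stopping early once the known lower bound is matched.
	 */
	static int
	choose_split(int nsplits, const int *firstdiff, int perfectfirstdiff)
	{
		int		lowsplit = 0;
		int		bestfirstdiff = INT_MAX;

		for (int i = 0; i < nsplits; i++)
		{
			if (firstdiff[i] <= perfectfirstdiff)
				return i;		/* no later candidate can do better */
			if (firstdiff[i] < bestfirstdiff)
			{
				bestfirstdiff = firstdiff[i];
				lowsplit = i;
			}
		}
		return lowsplit;		/* lowest-delta candidate among the best */
	}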
+ */ + bestfirstdiff = INT_MAX; + lowsplit = 0; + for (int i = 0; i < state.nsplits; i++) + { + int firstdiff; + + /* Don't waste cycles */ + if (perfectfirstdiff == INT_MAX || state.nsplits == 1) + break; + + firstdiff = _bt_split_firstdiff(rel, page, newitemoff, newitem, + state.splits + i); + + if (firstdiff <= perfectfirstdiff) + { + bestfirstdiff = firstdiff; + lowsplit = i; + break; + } + + if (firstdiff < bestfirstdiff) + { + bestfirstdiff = firstdiff; + lowsplit = i; + } + } + + *newitemonleft = state.splits[lowsplit].newitemonleft; + finalfirstright = state.splits[lowsplit].firstright; + /* Be tidy */ + if (state.splits != splits) + pfree(state.splits); + + return finalfirstright; } /* @@ -1712,8 +1894,11 @@ _bt_findsplitloc(Relation rel, * * olddataitemstoleft is the total size of all old items to the left of * firstoldonright. + * + * Returns delta between space that will be left free on left and right side + * of split. */ -static void +static int _bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, @@ -1745,8 +1930,13 @@ _bt_checksplitloc(FindSplitData *state, * index has included attributes, then those attributes of left page high * key will be truncated leaving that page with slightly more free space. * However, that shouldn't affect our ability to find valid split - * location, because anyway split location should exists even without high - * key truncation. + * location, since we err in the direction of being pessimistic about free + * space on the left half. + * + * Note that we've already conservatively subtracted away the overhead + * required for the left/new high key to have an explicit heap TID, on the + * assumption that that cannot be avoided by suffix truncation. (Leaf + * pages only.) */ leftfree -= firstrightitemsz; @@ -1765,17 +1955,20 @@ _bt_checksplitloc(FindSplitData *state, (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); /* - * If feasible split point, remember best delta. + * If this is a feasible split point, and it either has a lower delta + * than the most marginal split point so far or we haven't yet run out + * of space for split points, remember it. */ if (leftfree >= 0 && rightfree >= 0) { int delta; - if (state->is_rightmost) + if (state->is_weighted) { /* - * If splitting a rightmost page, try to put (100-fillfactor)% of - * free space on left page. See comments for _bt_findsplitloc. + * If splitting a rightmost page, or in single value mode, try to + * put (100-fillfactor)% of free space on left page. See comments + * for _bt_findsplitloc. */ delta = (state->fillfactor * leftfree) - ((100 - state->fillfactor) * rightfree); @@ -1788,14 +1981,288 @@ _bt_checksplitloc(FindSplitData *state, if (delta < 0) delta = -delta; - if (!state->have_split || delta < state->best_delta) + if (state->nsplits < state->maxsplit || + delta < state->splits[state->nsplits - 1].delta) { - state->have_split = true; - state->newitemonleft = newitemonleft; - state->firstright = firstoldonright; - state->best_delta = delta; + SplitPoint newsplit; + int j; + + newsplit.delta = delta; + newsplit.newitemonleft = newitemonleft; + newsplit.firstright = firstoldonright; + + /* + * Make space at the end of the state array for the new candidate + * split point if we haven't already reached the maximum number of + * split points. + */ + if (state->nsplits < state->maxsplit) + state->nsplits++; + + /* + * Replace the final item in the delta-sorted array.
The final + * item is either a still-uninitialized garbage entry, or the most + * marginal real entry when we already have as many split points + * as we're willing to consider. + */ + for (j = state->nsplits - 1; + j > 0 && state->splits[j - 1].delta > newsplit.delta; + j--) + { + state->splits[j] = state->splits[j - 1]; + } + state->splits[j] = newsplit; + } + + return delta; + } + + return INT_MAX; +} + +/* + * Subroutine to find the earliest possible attribute that differs for any + * entry within the array of acceptable candidate split points. + * + * This may be earlier than any real firstdiff for any of the candidate split + * points, in which case the optimization is ineffective. + */ +static int +_bt_perfect_firstdiff(Relation rel, Page page, OffsetNumber newitemoff, + IndexTuple newitem, int nsplits, SplitPoint *splits, + SplitMode *secondmode) +{ + ItemId itemid; + OffsetNumber center; + IndexTuple leftmost, + rightmost; + int perfectfirstdiff; + bool identical; + + /* Assume that a second pass over page won't be required for now */ + *secondmode = SPLIT_DEFAULT; + + /* + * Iterate from the end of the split array to the start, in search of the + * firstright-wise leftmost and rightmost entries among acceptable split + * points. The split point with the lowest delta is at the start of the + * array. It is deemed to be the split point whose firstright offset is + * at the center. Split points with firstright offsets at both the left + * and right extremes among acceptable split points will be found at the + * end of caller's array. + */ + leftmost = NULL; + rightmost = NULL; + center = splits[0].firstright; + + /* + * Split points can be thought of as points _between_ tuples on the + * original unsplit page image, at least if you pretend that the incoming + * tuple is already on the page to be split (imagine that the original + * unsplit page actually had enough space to fit the incoming tuple). The + * rightmost tuple is the tuple that is immediately to the right of a + * split point that is itself rightmost. Likewise, the leftmost tuple is + * the tuple to the left of the leftmost split point. This is slightly + * arbitrary. + * + * When there are very few candidates, no sensible comparison can be made + * here, resulting in caller selecting the lowest-delta (center) split + * point by default. No great care is taken around boundary cases where + * the center split point has the same firstright offset as either the + * leftmost or rightmost split points (i.e. only newitemonleft differs). + * We expect to find leftmost and rightmost tuples almost immediately.
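The bounded candidate array maintained above amounts to a small insertion sort over deltas. A standalone sketch under assumed names (SplitPointSketch and remember_split are illustrative identifiers, not patch code):

	#include <stdbool.h>

	typedef struct
	{
		int		delta;			/* free space imbalance */
		int		firstright;		/* first tuple offset on right half */
		bool	newitemonleft;	/* new item goes on left half? */
	} SplitPointSketch;

	/*
	 * Keep the maxsplit lowest-delta candidates seen so far, sorted by
	 * delta ascending; mirrors the insertion-sort step above.
	 */
	static void
	remember_split(SplitPointSketch *splits, int *nsplits, int maxsplit,
				   SplitPointSketch newsplit)
	{
		int		j;

		if (*nsplits < maxsplit)
			(*nsplits)++;		/* grow into the uninitialized final slot */
		else if (newsplit.delta >= splits[*nsplits - 1].delta)
			return;				/* worse than the most marginal candidate */

		/* shift more marginal entries right, then place the new one */
		for (j = *nsplits - 1;
			 j > 0 && splits[j - 1].delta > newsplit.delta;
			 j--)
			splits[j] = splits[j - 1];
		splits[j] = newsplit;
	}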
+ */ + perfectfirstdiff = INT_MAX; + identical = false; + for (int j = nsplits - 1; j > 1; j--) + { + SplitPoint *split = splits + j; + + if (!leftmost && split->firstright < center) + { + if (split->newitemonleft && newitemoff == split->firstright) + leftmost = newitem; + else + { + itemid = PageGetItemId(page, + OffsetNumberPrev(split->firstright)); + leftmost = (IndexTuple) PageGetItem(page, itemid); + } + } + + if (!rightmost && split->firstright > center) + { + if (!split->newitemonleft && newitemoff == split->firstright) + rightmost = newitem; + else + { + itemid = PageGetItemId(page, split->firstright); + rightmost = (IndexTuple) PageGetItem(page, itemid); + } + } + + if (leftmost && rightmost) + { + Assert(leftmost != rightmost); + perfectfirstdiff = _bt_tuple_firstdiff(rel, leftmost, rightmost, + &identical); + break; } } + + /* Work out which type of second pass will be performed, if any */ + if (identical) + { + BTPageOpaque opaque; + OffsetNumber maxoff; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + if (P_FIRSTDATAKEY(opaque) == newitemoff) + leftmost = newitem; + else + { + itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); + leftmost = (IndexTuple) PageGetItem(page, itemid); + } + + if (newitemoff > maxoff) + rightmost = newitem; + else + { + itemid = PageGetItemId(page, maxoff); + rightmost = (IndexTuple) PageGetItem(page, itemid); + } + + Assert(leftmost != rightmost); + (void) _bt_tuple_firstdiff(rel, leftmost, rightmost, &identical); + + /* + * If page has many duplicates but is not entirely full of + * duplicates, a many duplicates mode pass will be performed. If + * page is entirely full of duplicates, a single value mode pass + * will be performed. + * + * Caller should avoid a single value mode pass when incoming tuple + * doesn't sort lowest among items on the page, though. Instead, we + * instruct caller to continue with original default mode split, + * since an out-of-order new item suggests that newer tuples have + * come from (non-HOT) updates, not inserts. Evenly sharing space + * among each half of the split avoids pathological performance. + */ + if (identical) + { + if (P_FIRSTDATAKEY(opaque) == newitemoff) + *secondmode = SPLIT_SINGLE_VALUE; + else + { + perfectfirstdiff = INT_MAX; + *secondmode = SPLIT_DEFAULT; + } + } + else + *secondmode = SPLIT_MANY_DUPLICATES; + } + + return perfectfirstdiff; +} + +/* + * Subroutine to find first attribute that differs among the two tuples that + * enclose caller's candidate split point. + */ +static int +_bt_split_firstdiff(Relation rel, Page page, OffsetNumber newitemoff, + IndexTuple newitem, SplitPoint *split) +{ + ItemId itemid; + IndexTuple lastleft; + IndexTuple firstright; + + if (split->newitemonleft && newitemoff == split->firstright) + lastleft = newitem; + else + { + itemid = PageGetItemId(page, OffsetNumberPrev(split->firstright)); + lastleft = (IndexTuple) PageGetItem(page, itemid); + } + + if (!split->newitemonleft && newitemoff == split->firstright) + firstright = newitem; + else + { + itemid = PageGetItemId(page, split->firstright); + firstright = (IndexTuple) PageGetItem(page, itemid); + } + + Assert(lastleft != firstright); + return _bt_tuple_firstdiff(rel, lastleft, firstright, NULL); +} + +/* + * Subroutine to find first attribute that differs between two tuples, + * typically two tuples that enclose a candidate split point. 
Caller may also + * be interested in whether or not tuples are completely identical, in which + * case an "identical" parameter is passed. + * + * A naive bitwise approach to datum comparisons is used to save cycles. This + * is inherently approximate, but works just as well as real scan key + * comparisons in most cases, since the vast majority of types in Postgres + * cannot be equal unless they're bitwise equal. + * + * Testing has shown that an approach involving treating the tuple as a + * decomposed binary string would work almost as well as our current approach. + * It would also be faster. It might actually be necessary to go that way in + * the future, if suffix truncation is made sophisticated enough to truncate + * at a finer granularity (i.e. truncate within an attribute, rather than just + * truncating away whole attributes). The current approach isn't markedly + * slower, since it works particularly well with the "perfectfirstdiff" + * optimization (there are fewer, more expensive calls here). It also works + * with INCLUDE indexes (indexes with non-key attributes) without any special + * effort. + */ +static int +_bt_tuple_firstdiff(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *identical) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int keysz = IndexRelationGetNumberOfKeyAttributes(rel); + int result; + + result = 0; + for (int attnum = 1; attnum <= keysz; attnum++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + Form_pg_attribute att; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + att = TupleDescAttr(itupdesc, attnum - 1); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + !datumIsEqual(datum1, datum2, att->attbyval, att->attlen)) + break; + + result++; + } + + /* Report if left and right tuples are identical when requested */ + if (identical) + { + if (result >= keysz) + *identical = true; + else + *identical = false; + } + + return result; } /* @@ -2199,7 +2666,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) /* * insert the right page pointer into the new root page. */ - Assert(BTreeTupleGetNAtts(right_item, rel) == + Assert(BTreeTupleGetNAtts(right_item, rel) > 0); + Assert(BTreeTupleGetNAtts(right_item, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber) @@ -2311,8 +2779,8 @@ _bt_pgaddtup(Page page, /* * _bt_isequal - used in _bt_doinsert in check for duplicates. * - * This is very similar to _bt_compare, except for NULL handling. - * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. + * This is very similar to _bt_compare, except for NULL and negative infinity + * handling. Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. */ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, @@ -2326,12 +2794,6 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - /* - * It's okay that we might perform a comparison against a truncated page - * high key when caller needs to determine if _bt_check_unique scan must - * continue on to the next page. Caller never asks us to compare non-key - * attributes within an INCLUDE index. 
- */ for (i = 1; i <= keysz; i++) { AttrNumber attno; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 4082103fe2..f63615341c 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1421,10 +1421,12 @@ _bt_pagedel(Relation rel, Buffer buf) /* we need an insertion scan key for the search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); - /* find the leftmost leaf page containing this key */ + /* get stack to leaf page by searching index */ stack = _bt_search(rel, - IndexRelationGetNumberOfKeyAttributes(rel), - itup_scankey, false, &lbuf, BT_READ, NULL); + BTreeTupleGetNAtts(targetkey, rel), + itup_scankey, + BTreeTupleGetHeapTID(targetkey), false, + &lbuf, BT_READ, NULL); /* don't need a pin on the page */ _bt_relbuf(rel, lbuf); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d3700bd082..c229b7eed2 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -69,11 +69,13 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) /* - * _bt_search() -- Search the tree for a particular scankey, + * _bt_search() -- Search the tree for a particular scankey + scantid, * or more precisely for the first leaf page it could be on. * * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. + * but it can omit the rightmost column(s) of the index. The scantid + * argument may also be omitted (caller passes NULL), since it's logically + * the "real" rightmost attribute. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first @@ -94,8 +96,8 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * during the search will be finished. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access, Snapshot snapshot) +_bt_search(Relation rel, int keysz, ScanKey scankey, ItemPointer scantid, + bool nextkey, Buffer *bufP, int access, Snapshot snapshot) { BTStack stack_in = NULL; int page_access = BT_READ; @@ -131,7 +133,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * if the leaf page is split and we insert to the parent page). But * this is a good opportunity to finish splits of internal pages too. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, scantid, nextkey, (access == BT_WRITE), stack_in, page_access, snapshot); @@ -145,7 +147,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * Find the appropriate item on the internal page, and get the child * page that it points to. */ - offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, scantid, + P_FIRSTDATAKEY(opaque), nextkey); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); blkno = BTreeInnerTupleGetDownLink(itup); @@ -158,8 +161,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * downlink (block) to uniquely identify the index entry, in case it * moves right while we're working lower in the tree. See the paper * by Lehman and Yao for how this is detected and handled. (We use the - * child link to disambiguate duplicate keys in the index -- Lehman - * and Yao disallow duplicate keys.) 
+ * child link to disambiguate duplicate keys in the index, which is + * faster than comparing the keys themselves.) */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; @@ -199,7 +202,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * need to move right in the tree. See Lehman and Yao for an * excruciatingly precise description. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, scantid, nextkey, true, stack_in, BT_WRITE, snapshot); } @@ -245,6 +248,7 @@ _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, + ItemPointer scantid, bool nextkey, bool forupdate, BTStack stack, @@ -305,7 +309,7 @@ _bt_moveright(Relation rel, continue; } - if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) + if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, scantid, page, P_HIKEY) >= cmpval) { /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); @@ -337,6 +341,12 @@ _bt_moveright(Relation rel, * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * + * Caller passes own low value for binary search. This can be used to + * resume a partial binary search without repeated effort. _bt_check_unique + * callers use this to avoid repeated work. This only works when a buffer + * lock is held throughout, and we're passed a leaf page both times, and + * nextkey is false. + * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true. (Since _bt_compare treats the first data key of such a page as @@ -354,19 +364,19 @@ _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, + ItemPointer scantid, + OffsetNumber low, bool nextkey) { Page page; BTPageOpaque opaque; - OffsetNumber low, - high; + OffsetNumber high; int32 result, cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* @@ -401,7 +411,7 @@ _bt_binsrch(Relation rel, /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, keysz, scankey, page, mid); + result = _bt_compare(rel, keysz, scankey, scantid, page, mid); if (result >= cmpval) low = mid + 1; @@ -431,6 +441,50 @@ _bt_binsrch(Relation rel, /*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * + * Convenience wrapper for _bt_tuple_compare() callers that want to compare + * an offset on a particular page. + * + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored (if any, which there probably isn't) + * does not matter. This convention allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first key. + * See backend/access/nbtree/README for details. + *---------- + */ +int32 +_bt_compare(Relation rel, + int keysz, + ScanKey scankey, + ItemPointer scantid, + Page page, + OffsetNumber offnum) +{ + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + + Assert(_bt_check_natts(rel, page, offnum)); + + /* + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. 
+ * + * A minus infinity key has all attributes truncated away, so this test is + * redundant with the minus infinity attribute tie-breaker. However, the + * number of attributes in minus infinity tuples was not explicitly + * represented as 0 until PostgreSQL v11, so an explicit offnum test is + * still required. + */ + if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + return 1; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + return _bt_tuple_compare(rel, keysz, scankey, scantid, itup); +} + +/*---------- + * _bt_tuple_compare() -- Compare scankey to a particular tuple. + * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * @@ -445,37 +499,23 @@ _bt_binsrch(Relation rel, * NULLs in the keys are treated as sortable values. Therefore * "equality" does not necessarily mean that the item should be * returned to the caller as a matching key! - * - * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be - * "minus infinity": this routine will always claim it is less than the - * scankey. The actual key value stored (if any, which there probably isn't) - * does not matter. This convention allows us to implement the Lehman and - * Yao convention that the first down-link pointer is before the first key. - * See backend/access/nbtree/README for details. *---------- */ int32 -_bt_compare(Relation rel, - int keysz, - ScanKey scankey, - Page page, - OffsetNumber offnum) +_bt_tuple_compare(Relation rel, + int keysz, + ScanKey scankey, + ItemPointer scantid, + IndexTuple itup) { TupleDesc itupdesc = RelationGetDescr(rel); - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); - IndexTuple itup; + ItemPointer heapTid; + int ntupatts; + int ncmpkey; int i; - Assert(_bt_check_natts(rel, page, offnum)); - - /* - * Force result ">" if target item is first data item on an internal page - * --- see NOTE above. - */ - if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) - return 1; - - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + Assert(keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + ntupatts = BTreeTupleGetNAtts(itup, rel); /* * The scan key is set up with the attribute number associated with each @@ -489,7 +529,8 @@ _bt_compare(Relation rel, * _bt_first). */ - for (i = 1; i <= keysz; i++) + ncmpkey = Min(ntupatts, keysz); + for (i = 1; i <= ncmpkey; i++) { Datum datum; bool isNull; @@ -540,8 +581,31 @@ _bt_compare(Relation rel, scankey++; } - /* if we get here, the keys are equal */ - return 0; + /* + * Use the number of attributes as a tie-breaker, in order to treat + * truncated attributes in index as minus infinity. + */ + if (keysz > ntupatts) + return 1; + + /* If caller provided no heap TID tie-breaker for scan, they're equal */ + if (!scantid) + return 0; + + /* + * Although it isn't counted as an attribute by BTreeTupleGetNAtts(), heap + * TID is an implicit final key attribute that ensures that all index + * tuples have a distinct set of key attribute values. + * + * This is often truncated away in pivot tuples, which makes the attribute + * value implicitly negative infinity. 
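Taken together with the heap TID step that follows, the comparison tail implements a three-stage tie-breaker. A condensed standalone model (tuple_compare_sketch is an invention of this illustration: attcmp stands in for the per-attribute opclass comparison result, and heap TIDs are modeled as plain longs rather than ItemPointers):

	/*
	 * Attributes decide first, then attribute count (truncated attributes
	 * read as minus infinity), then heap TID.  The final comparison is
	 * deliberately inverted, because TIDs "sort DESC" in this patch.
	 */
	static int
	tuple_compare_sketch(int attcmp, int keysz, int ntupatts,
						 const long *scantid, const long *heapTid)
	{
		if (attcmp != 0)
			return attcmp;		/* ordinary attributes decide */
		if (keysz > ntupatts)
			return 1;			/* tuple's truncated attrs < scankey */
		if (scantid == NULL)
			return 0;			/* caller supplied no tie-breaker */
		if (heapTid == NULL)
			return 1;			/* truncated heap TID is -infinity too */
		/* inverted argument order relative to the ASC convention */
		return (*heapTid < *scantid) ? -1 : (*heapTid > *scantid) ? 1 : 0;
	}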
+ */ + heapTid = BTreeTupleGetHeapTID(itup); + if (!heapTid) + return 1; + + /* Deliberately invert the order, since TIDs "sort DESC" */ + return ItemPointerCompare(heapTid, scantid); } /* @@ -570,6 +634,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; + BTPageOpaque opaque; BTStack stack; OffsetNumber offnum; StrategyNumber strat; @@ -577,6 +642,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) bool goback; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData scankeys[INDEX_MAX_KEYS]; + ItemPointer scantid; + ItemPointerData minscantid; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; @@ -826,6 +893,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * scankeys[] array, using the keys identified by startKeys[]. */ Assert(keysCount <= INDEX_MAX_KEYS); + scantid = NULL; for (i = 0; i < keysCount; i++) { ScanKey cur = startKeys[i]; @@ -962,6 +1030,34 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* + * When all key attributes will be in insertion scankey, manufacture + * sentinel scan tid that's less than any possible heap TID in the index. + * This is still greater than minus infinity to _bt_compare, allowing + * _bt_search to follow a downlink with scankey-equal attributes, but a + * truncated-away heap TID. + * + * If we didn't do this then affected index scans would have to + * unnecessarily visit an extra page before moving right to the page they + * should have landed on from the parent in the first place. When + * choosing a leaf page split point/new downlink, significant effort goes + * towards avoiding a choice that necessitates appending a heap TID, so + * this is likely to pay off. See _bt_findsplitloc comments on "false + * sharing". + * + * (Note that implementing this by adding hard-coding to _bt_compare is + * unworkable, since some _bt_search callers need to re-find a leaf page + * using the page's high key.) + */ + if (keysCount >= IndexRelationGetNumberOfKeyAttributes(rel)) + { + scantid = &minscantid; + + /* Heap TID attribute uses DESC ordering */ + ItemPointerSetBlockNumber(scantid, InvalidBlockNumber); + ItemPointerSetOffsetNumber(scantid, InvalidOffsetNumber); + } + /*---------- * Examine the selected initial-positioning strategy to determine exactly * where we need to start the scan, and set flag variables to control the @@ -1054,11 +1150,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * Use the manufactured insertion scan key to descend the tree and - * position ourselves on the target leaf page. + * Use the manufactured insertion scan key (and possibly a scantid) to + * descend the tree and position ourselves on the target leaf page. */ - stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ, - scan->xs_snapshot); + stack = _bt_search(rel, keysCount, scankeys, scantid, nextkey, &buf, + BT_READ, scan->xs_snapshot); /* don't need to keep the stack around... 
*/ _bt_freestack(stack); @@ -1087,7 +1183,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); + opaque = (BTPageOpaque) PageGetSpecialPointer(BufferGetPage(buf)); + offnum = _bt_binsrch(rel, buf, keysCount, scankeys, scantid, + P_FIRSTDATAKEY(opaque), nextkey); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 16f5755777..e8f506cc09 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -796,8 +796,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) OffsetNumber last_off; Size pgspc; Size itupsz; - int indnatts = IndexRelationGetNumberOfAttributes(wstate->index); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index); /* * This is a handy place to check for cancel interrupts during the btree @@ -880,19 +878,30 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - if (indnkeyatts != indnatts && P_ISLEAF(opageop)) + if (P_ISLEAF(opageop)) { + IndexTuple lastleft; IndexTuple truncated; Size truncsz; /* - * Truncate any non-key attributes from high key on leaf level - * (i.e. truncate on leaf level if we're building an INCLUDE - * index). This is only done at the leaf level because downlinks + * Truncate away any unneeded attributes from high key on leaf + * level. This is only done at the leaf level because downlinks * in internal pages are either negative infinity items, or get * their contents from copying from one level down. See also: * _bt_split(). * + * We don't try to bias our choice of split point to make it more + * likely that _bt_suffix_truncate() can truncate away more + * attributes, whereas the split point passed to _bt_split() is + * chosen much more delicately. Suffix truncation is mostly + * useful because it can greatly improve space utilization for + * workloads with random insertions, or insertions of + * monotonically increasing values at "local" points in the key + * space. It doesn't seem worthwhile to add complex logic for + * choosing a split point here for a benefit that is bound to be + * much smaller. + * * Since the truncated tuple is probably smaller than the * original, it cannot just be copied in place (besides, we want * to actually save space on the leaf page). We delete the @@ -905,7 +914,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the latter portion of the space occupied by the original tuple. * This is fairly cheap. 
*/ - truncated = _bt_nonkey_truncate(wstate->index, oitup); + ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); + lastleft = (IndexTuple) PageGetItem(opage, ii); + + truncated = _bt_suffix_truncate(wstate->index, lastleft, oitup); truncsz = IndexTupleSize(truncated); PageIndexTupleDelete(opage, P_HIKEY); _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY); @@ -924,8 +936,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); - Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == - IndexRelationGetNumberOfKeyAttributes(wstate->index) || + Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) || P_LEFTMOST(opageop)); Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 || !P_LEFTMOST(opageop)); @@ -970,7 +983,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the first item for a page is copied from the prior page in the code * above. Since the minimum key for an entire level is only used as a * minus infinity downlink, and never as a high key, there is no need to - * truncate away non-key attributes at this point. + * truncate away suffix attributes at this point. */ if (last_off == P_HIKEY) { @@ -1029,8 +1042,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) } else { - Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == - IndexRelationGetNumberOfKeyAttributes(wstate->index) || + Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) || P_LEFTMOST(opaque)); Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 || !P_LEFTMOST(opaque)); @@ -1127,6 +1141,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } else if (itup != NULL) { + int32 compare = 0; + for (i = 1; i <= keysz; i++) { SortSupport entry; @@ -1134,7 +1150,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) attrDatum2; bool isNull1, isNull2; - int32 compare; entry = sortKeys + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); @@ -1151,6 +1166,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) else if (compare < 0) break; } + + /* + * If key values are equal, we sort on ItemPointer. This is + * required for btree indexes, since heap TID is treated as an + * implicit last key attribute in order to ensure that all + * keys in the index are physically unique. 
+ */ + if (compare == 0) + { + /* Deliberately invert the order, since TIDs "sort DESC" */ + compare = ItemPointerCompare(&itup2->t_tid, &itup->t_tid); + Assert(compare != 0); + if (compare > 0) + load1 = false; + } } else load1 = false; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 4528e87c83..f9f3ec7914 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -49,6 +49,8 @@ static void _bt_mark_scankey_required(ScanKey skey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); +static int _bt_leave_natts(Relation rel, IndexTuple lastleft, + IndexTuple firstright); /* @@ -56,27 +58,34 @@ static bool _bt_check_rowcompare(ScanKey skey, * Build an insertion scan key that contains comparison data from itup * as well as comparator routines appropriate to the key datatypes. * - * The result is intended for use with _bt_compare(). + * The result is intended for use with _bt_compare(). If itup has + * undergone suffix truncation of key attributes, caller had better + * pass BTreeTupleGetNAtts(itup, rel) as keysz to routines like + * _bt_search() and _bt_compare() when using returned scan key. This + * allows truncated attributes to participate in comparisons (truncated + * attributes have implicit negative infinity values). Note that + * _bt_compare() never treats a scan key as containing negative + * infinity attributes. */ ScanKey _bt_mkscankey(Relation rel, IndexTuple itup) { ScanKey skey; TupleDesc itupdesc; + int tupnatts; int indnatts PG_USED_FOR_ASSERTS_ONLY; int indnkeyatts; int16 *indoption; int i; itupdesc = RelationGetDescr(rel); + tupnatts = BTreeTupleGetNAtts(itup, rel); indnatts = IndexRelationGetNumberOfAttributes(rel); indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); indoption = rel->rd_indoption; - Assert(indnkeyatts > 0); - Assert(indnkeyatts <= indnatts); - Assert(BTreeTupleGetNAtts(itup, rel) == indnatts || - BTreeTupleGetNAtts(itup, rel) == indnkeyatts); + Assert(tupnatts > 0); + Assert(tupnatts <= indnatts); /* * We'll execute search using scan key constructed on key columns. Non-key @@ -96,7 +105,21 @@ _bt_mkscankey(Relation rel, IndexTuple itup) * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - arg = index_getattr(itup, i + 1, itupdesc, &null); + + /* + * Truncated key attributes may not be represented in index tuple due + * to suffix truncation. Keys built from truncated attributes are + * defensively represented as NULL values, though they should still + * not be allowed to participate in comparisons (caller must be sure + * to pass a sane keysz to _bt_compare()). + */ + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); ScanKeyEntryInitializeWithInfo(&skey[i], flags, @@ -2083,38 +2106,197 @@ btproperty(Oid index_oid, int attno, } /* - * _bt_nonkey_truncate() -- create tuple without non-key suffix attributes. + * _bt_suffix_truncate() -- create tuple without unneeded suffix attributes. * - * Returns truncated index tuple allocated in caller's memory context, with key - * attributes copied from caller's itup argument. Currently, suffix truncation - * is only performed to create pivot tuples in INCLUDE indexes, but some day it - * could be generalized to remove suffix attributes after the first - * distinguishing key attribute. 
+ * Returns truncated pivot index tuple allocated in caller's memory context, + * with key attributes copied from caller's firstright argument. If rel is + * an INCLUDE index, non-key attributes will definitely be truncated away, + * since they're not part of the key space. More aggressive suffix + * truncation can take place when it's clear that the returned tuple does not + * need one or more suffix key attributes. This is possible when some + * attribute in firstright is unequal to the corresponding attribute in + * lastleft (unequal according to an insertion scan key comparison); every + * attribute that follows the first such distinguishing attribute can be + * truncated away. * - * Truncated tuple is guaranteed to be no larger than the original, which is - * important for staying under the 1/3 of a page restriction on tuple size. + * Sometimes this routine will return a new pivot tuple that takes up more + * space than firstright, because a new heap TID attribute had to be added to + * distinguish lastleft from firstright. This should only happen when the + * caller is in the process of splitting a leaf page that has many logical + * duplicates, where it's unavoidable. * - * Note that returned tuple's t_tid offset will hold the number of attributes - * present, so the original item pointer offset is not represented. Caller - * should only change truncated tuple's downlink. + * Note that returned tuple's t_tid offset will hold the number of + * attributes present, so the original item pointer offset is not + * represented. Caller should only change truncated tuple's downlink. Note + * also that truncated key attributes are treated as containing "minus + * infinity" values by _bt_compare()/_bt_tuple_compare(). + * + * Returned tuple is guaranteed to be no larger than the original plus some + * extra space for a possible extra heap TID tie-breaker attribute. This + * guarantee is important for staying under the 1/3 of a page restriction on + * tuple size. */ IndexTuple -_bt_nonkey_truncate(Relation rel, IndexTuple itup) +_bt_suffix_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright) { - int nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel); - IndexTuple truncated; + TupleDesc itupdesc = RelationGetDescr(rel); + int16 natts = IndexRelationGetNumberOfAttributes(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int leavenatts; + IndexTuple pivot; + ItemPointer pivotheaptid; + Size newsize; /* - * We should only ever truncate leaf index tuples, which must have both - * key and non-key attributes. It's never okay to truncate a second time. + * We should only ever truncate leaf index tuples, which must have non-key + * attributes in the case of INCLUDE indexes. It's never okay to truncate + * a second time. */ - Assert(BTreeTupleGetNAtts(itup, rel) == - IndexRelationGetNumberOfAttributes(rel)); + Assert(BTreeTupleGetNAtts(lastleft, rel) == natts); + Assert(BTreeTupleGetNAtts(firstright, rel) == natts); - truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs); - BTreeTupleSetNAtts(truncated, nkeyattrs); + /* Determine how many attributes must be left behind */ + leavenatts = _bt_leave_natts(rel, lastleft, firstright); - return truncated; + if (leavenatts <= natts) + { + IndexTuple tidpivot; + + /* + * Truncate away non-key attributes and/or key attributes. Do a + * straight copy in the case where the only attribute to be "truncated + * away" is the implicit heap TID key attribute (i.e. the case where + * we can at least avoid adding an explicit heap TID attribute to new + * pivot).
We should only call index_truncate_tuple() when non-TID + * attributes need to be truncated. + */ + if (leavenatts < natts) + pivot = index_truncate_tuple(itupdesc, firstright, leavenatts); + else + pivot = CopyIndexTuple(firstright); + + /* + * If there is a distinguishing key attribute within leavenatts, there + * is no need to add an explicit heap TID attribute to new pivot. + */ + if (leavenatts <= nkeyatts) + { + BTreeTupleSetNAtts(pivot, leavenatts); + return pivot; + } + + /* + * Only non-key attributes could be truncated away from an INCLUDE + * index's pivot tuple. They are not considered part of the key + * space, so it's still necessary to add a heap TID attribute to the + * new pivot tuple. Create enlarged copy of our truncated right tuple + * copy, to fit heap TID. + */ + Assert(nkeyatts < natts); + newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); + tidpivot = palloc0(newsize); + memcpy(tidpivot, pivot, IndexTupleSize(pivot)); + pfree(pivot); + pivot = tidpivot; + } + else + { + /* + * No truncation was possible, since attributes are all equal. It's + * necessary to add a heap TID attribute to the new pivot tuple. + */ + Assert(natts == nkeyatts); + newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, IndexTupleSize(firstright)); + } + + /* + * Set the size of the enlarged pivot tuple. We must use heap TID as a + * unique-ifier in the new pivot tuple, since no non-TID attribute + * distinguishes which values belong on each side of the split point. + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + + /* + * Generate a heap TID value to go in the enlarged (not truncated) pivot + * tuple. Simply use the last left heap TID as new pivot's heap TID + * value. This code path is mostly used in cases where the page to be + * split only contains duplicates, since the logic for picking a split + * point tries very hard to avoid that, using all means available to it. + * "Single value" mode was likely to have been used to pick this split + * point. + * + * We could easily manufacture a "median TID" value to use in the new + * pivot, since optimizations like that often help fan-out when applied to + * distinguishing/trailing non-TID attributes (adding opclass + * infrastructure that gets called here to truncate non-TID attributes is + * a possible future enhancement). Using the last left heap TID actually + * results in slightly better space utilization, though, because of the + * specific properties of heap TID attributes. This strategy maximizes + * the number of duplicate tuples that will end up on the mostly-empty + * left side of the split, and minimizes the number that will end up on + * the mostly-full right side. (This assumes that the split point was + * likely chosen using "single value" mode.) + */ + pivotheaptid = (ItemPointer) ((char *) pivot + newsize - + MAXALIGN(sizeof(ItemPointerData))); + ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + + /* + * Lehman and Yao require that the downlink to the right page, which is to + * be inserted into the parent page in the second phase of a page split, + * be a strict lower bound on all current and future items on the right + * page (this will be copied from the new high key for the left side of + * the split).
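The stopping rule in _bt_leave_natts (below) can be checked with a worked example. A standalone sketch, not patch code: plain int inequality stands in for the insertion scan key comparison, which is safe here because firstright never sorts below lastleft:

	#include <stdio.h>

	/* How many leading attributes must the new pivot keep? */
	static int
	leave_natts_sketch(const int *lastleft, const int *firstright, int nkeyatts)
	{
		for (int leavenatts = 1; leavenatts <= nkeyatts; leavenatts++)
		{
			/* do the first leavenatts attributes already distinguish the halves? */
			if (firstright[leavenatts - 1] != lastleft[leavenatts - 1])
				return leavenatts;
		}
		return nkeyatts + 1;	/* all equal: heap TID must disambiguate */
	}

	int
	main(void)
	{
		int			lastleft[] = {1, 10, 7};
		int			firstright[] = {1, 12, 3};

		/* attribute 2 differs, so attribute 3 can be truncated away */
		printf("leavenatts = %d\n",
			   leave_natts_sketch(lastleft, firstright, 3));	/* prints 2 */
		return 0;
	}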
+ */ + + /* Deliberately invert the order, since TIDs "sort DESC" */ + Assert(ItemPointerCompare(&lastleft->t_tid, pivotheaptid) >= 0); + Assert(ItemPointerCompare(&firstright->t_tid, pivotheaptid) < 0); + + BTreeTupleSetNAtts(pivot, nkeyatts); + BTreeTupleSetAltHeapTID(pivot); + + return pivot; +} + +/* + * _bt_leave_natts - how many key attributes to leave when truncating. + * + * This can return a number of attributes that is one greater than the + * number of key attributes for the index relation. This indicates that the + * caller must use a heap TID as a unique-ifier in new pivot tuple. + */ +static int +_bt_leave_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int leavenatts; + ScanKey skey; + + skey = _bt_mkscankey(rel, firstright); + + /* + * We even test the nkeyatts case (no truncated non-TID attributes), + * since caller cares about whether or not it can avoid appending a heap + * TID as a unique-ifier. + */ + leavenatts = 1; + for (;;) + { + if (leavenatts > nkeyatts) + break; + if (_bt_tuple_compare(rel, leavenatts, skey, NULL, lastleft) > 0) + break; + leavenatts++; + } + + /* Can't leak memory here */ + _bt_freeskey(skey); + + return leavenatts; } /* @@ -2137,6 +2319,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; + int tupnatts; /* * We cannot reliably test a deleted or half-deleted page, since they have @@ -2156,6 +2339,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS"); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + tupnatts = BTreeTupleGetNAtts(itup, rel); if (P_ISLEAF(opaque)) { @@ -2165,7 +2349,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated */ - return BTreeTupleGetNAtts(itup, rel) == natts; + return tupnatts == natts; } else { @@ -2176,7 +2360,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) Assert(!P_RIGHTMOST(opaque)); /* Page high key tuple contains only key attributes */ - return BTreeTupleGetNAtts(itup, rel) == nkeyatts; + return tupnatts > 0 && tupnatts <= nkeyatts; } } else /* !P_ISLEAF(opaque) */ @@ -2209,7 +2393,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) * Tuple contains only key attributes despite on is it page high * key or not */ - return BTreeTupleGetNAtts(itup, rel) == nkeyatts; + return tupnatts > 0 && tupnatts <= nkeyatts; } } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 67a94cb80a..7c061e96d2 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -202,7 +202,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) } static void -btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) +btree_xlog_split(bool onleft, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); @@ -213,8 +213,6 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) BTPageOpaque ropaque; char *datapos; Size datalen; - IndexTuple left_hikey = NULL; - Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; @@ -248,20 +246,6 @@ btree_xlog_split(bool onleft, bool lhighkey,
XLogReaderState *record) _bt_restore_page(rpage, datapos, datalen); - /* - * When the high key isn't present is the wal record, then we assume it to - * be equal to the first key on the right page. It must be from the leaf - * level. - */ - if (!lhighkey) - { - ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); - - Assert(isleaf); - left_hikey = (IndexTuple) PageGetItem(rpage, hiItemId); - left_hikeysz = ItemIdGetLength(hiItemId); - } - PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); @@ -284,6 +268,8 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) OffsetNumber off; IndexTuple newitem = NULL; Size newitemsz = 0; + IndexTuple left_hikey = NULL; + Size left_hikeysz = 0; Page newlpage; OffsetNumber leftoff; @@ -298,13 +284,10 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) } /* Extract left hikey and its size (assuming 16-bit alignment) */ - if (lhighkey) - { - left_hikey = (IndexTuple) datapos; - left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); - datapos += left_hikeysz; - datalen -= left_hikeysz; - } + left_hikey = (IndexTuple) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; Assert(datalen == 0); @@ -1003,16 +986,10 @@ btree_redo(XLogReaderState *record) btree_xlog_insert(false, true, record); break; case XLOG_BTREE_SPLIT_L: - btree_xlog_split(true, false, record); - break; - case XLOG_BTREE_SPLIT_L_HIGHKEY: - btree_xlog_split(true, true, record); + btree_xlog_split(true, record); break; case XLOG_BTREE_SPLIT_R: - btree_xlog_split(false, false, record); - break; - case XLOG_BTREE_SPLIT_R_HIGHKEY: - btree_xlog_split(false, true, record); + btree_xlog_split(false, record); break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 5c4457179d..667c906b2e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -35,8 +35,6 @@ btree_desc(StringInfo buf, XLogReaderState *record) } case XLOG_BTREE_SPLIT_L: case XLOG_BTREE_SPLIT_R: - case XLOG_BTREE_SPLIT_L_HIGHKEY: - case XLOG_BTREE_SPLIT_R_HIGHKEY: { xl_btree_split *xlrec = (xl_btree_split *) rec; @@ -130,12 +128,6 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; - case XLOG_BTREE_SPLIT_L_HIGHKEY: - id = "SPLIT_L_HIGHKEY"; - break; - case XLOG_BTREE_SPLIT_R_HIGHKEY: - id = "SPLIT_R_HIGHKEY"; - break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index dfbda5458f..ffeb0624fe 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -854,10 +854,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) * PageIndexTupleDelete is the best way. Delete the items in reverse * order so we don't have to think about adjusting item numbers for * previous deletions. - * - * TODO: tune the magic number here */ - if (nitems <= 2) + if (nitems <= 7) { while (--nitems >= 0) PageIndexTupleDelete(page, itemnos[nitems]); diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 9fb33b9035..5211cf5b98 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -4057,23 +4057,26 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, } /* - * If key values are equal, we sort on ItemPointer. 
This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. */ { BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + /* Deliberately invert the order, since TIDs "sort DESC" */ if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; + return (blk1 < blk2) ? 1 : -1; } { OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + /* Deliberately invert the order, since TIDs "sort DESC" */ if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; + return (pos1 < pos2) ? 1 : -1; } return 0; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 04ecb4cbc0..12f57773e7 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -114,16 +114,26 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ -#define BTREE_VERSION 3 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimal supported version number */ +#define BTREE_VERSION 4 /* current version number */ +#define BTREE_MIN_VERSION 4 /* minimal supported version number */ /* * Maximum size of a btree index entry, including its tuple header. * * We actually need to be able to fit three items on every page, * so restrict any one item to 1/3 the per-page available space. + * + * There are rare cases where _bt_suffix_truncate() will need to + * enlarge a pivot index tuple to make space for a tie-breaker heap + * TID attribute, which we account for here. */ #define BTMaxItemSize(page) \ + MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + \ + 3*sizeof(ItemIdData) + \ + 3*MAXALIGN(sizeof(ItemPointerData))) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +#define BTMaxItemSizeOld(page) \ MAXALIGN_DOWN((PageGetPageSize(page) - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) @@ -133,11 +143,15 @@ typedef struct BTMetaPageData * For pages above the leaf level, we use a fixed 70% fillfactor. * The fillfactor is applied during index build and when splitting * a rightmost page; when splitting non-rightmost pages we try to - * divide the data equally. + * divide the data equally. When splitting a page that's entirely + * filled with a single value (duplicates), the leaf-page + * fillfactor is overridden, and is applied regardless of whether + * the page is a rightmost page. */ #define BTREE_MIN_FILLFACTOR 10 #define BTREE_DEFAULT_FILLFACTOR 90 #define BTREE_NONLEAF_FILLFACTOR 70 +#define BTREE_SINGLEVAL_FILLFACTOR 1 /* * In general, the btree code tries to localize its knowledge about @@ -204,21 +218,23 @@ typedef struct BTMetaPageData * real offset (downlinks only need to store a block number). The offset * field only stores the number of attributes when the INDEX_ALT_TID_MASK * bit is set (we never assume that pivot tuples must explicitly store the - * number of attributes, and currently do not bother storing the number of - * attributes unless indnkeyatts actually differs from indnatts).
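As a sanity check on the tightened limit, the arithmetic for a stock 8192-byte page (a standalone sketch with typical values hard-coded: 8-byte MAXALIGN, 24-byte page header, 4-byte line pointers, 6-byte item pointers, 16-byte special space):

	#include <stdio.h>

	#define MAXALIGN(x)      (((x) + 7) & ~((unsigned long) 7))
	#define MAXALIGN_DOWN(x) ((x) & ~((unsigned long) 7))

	int
	main(void)
	{
		unsigned long page = 8192;
		unsigned long oldmax = MAXALIGN_DOWN((page -
											  MAXALIGN(24 + 3 * 4) -
											  MAXALIGN(16)) / 3);
		unsigned long newmax = MAXALIGN_DOWN((page -
											  MAXALIGN(24 + 3 * 4 + 3 * MAXALIGN(6)) -
											  MAXALIGN(16)) / 3);

		/* reserving three aligned heap TIDs costs 8 bytes of item size */
		printf("old %lu, new %lu\n", oldmax, newmax);	/* old 2712, new 2704 */
		return 0;
	}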
- * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's - * possible that it will be used within non-pivot tuples in the future. Do - * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot - * tuple. + * number of attributes). INDEX_ALT_TID_MASK is only used for pivot tuples + * at present, though it's possible that it will be used within non-pivot + * tuples in the future. Do not assume that a tuple with INDEX_ALT_TID_MASK + * set must be a pivot tuple. A pivot tuple must have INDEX_ALT_TID_MASK set + * as of BTREE_VERSION 4, however. * * The 12 least significant offset bits are used to represent the number of - * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved - * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should - * be large enough to store any number <= INDEX_MAX_KEYS. + * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 status bits + * (BT_RESERVED_OFFSET_MASK bits): BT_HEAP_TID_ATTR, plus 3 bits that are + * reserved for future use. BT_N_KEYS_OFFSET_MASK should be large enough to + * store any number <= INDEX_MAX_KEYS. */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF +/* Reserved to indicate if heap TID is represented at end of tuple */ +#define BT_HEAP_TID_ATTR 0x1000 /* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ @@ -241,14 +257,15 @@ typedef struct BTMetaPageData } while(0) /* - * Get/set number of attributes within B-tree index tuple. Asserts should be - * removed when BT_RESERVED_OFFSET_MASK bits will be used. + * Get/set number of attributes within B-tree index tuple. + * + * Note that this does not include an implicit tie-breaker heap-TID + * attribute, if any. */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ (itup)->t_info & INDEX_ALT_TID_MASK ? \ ( \ - AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ @@ -257,10 +274,42 @@ typedef struct BTMetaPageData #define BTreeTupleSetNAtts(itup, n) \ do { \ (itup)->t_info |= INDEX_ALT_TID_MASK; \ - Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ } while(0) +/* + * Get tie-breaker heap TID attribute, if any. Macro works with both pivot + * and non-pivot tuples. + * + * Assumes that any tuple without INDEX_ALT_TID_MASK set has a t_tid that + * points to the heap, and that all pivot tuples have INDEX_ALT_TID_MASK set + * (since all pivot tuples must have it set as of BTREE_VERSION 4). When + * non-pivot tuples use the INDEX_ALT_TID_MASK representation in the future, + * they'll probably also contain a heap TID at the end of the tuple. We avoid + * assuming that a tuple with INDEX_ALT_TID_MASK set is necessarily a pivot + * tuple. + */ +#define BTreeTupleGetHeapTID(itup) \ + ( \ + (itup)->t_info & INDEX_ALT_TID_MASK && \ + (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ + ( \ + (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ + MAXALIGN(sizeof(ItemPointerData))) \ + ) \ + : (itup)->t_info & INDEX_ALT_TID_MASK ?
+	  NULL : (ItemPointer) &((itup)->t_tid) \
+	)
+/*
+ * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
+ * representation (currently limited to pivot tuples)
+ */
+#define BTreeTupleSetAltHeapTID(itup) \
+	do { \
+		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
+	} while(0)
+
 /*
  * Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  * because many places need to use them in ScanKeyInit() calls.
@@ -560,15 +609,18 @@ extern int _bt_pagedel(Relation rel, Buffer buf);
  * prototypes for functions in nbtsearch.c
  */
 extern BTStack _bt_search(Relation rel,
-		   int keysz, ScanKey scankey, bool nextkey,
+		   int keysz, ScanKey scankey, ItemPointer scantid, bool nextkey,
 		   Buffer *bufP, int access, Snapshot snapshot);
 extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
-			  ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
-			  int access, Snapshot snapshot);
+			  ScanKey scankey, ItemPointer scantid, bool nextkey,
+			  bool forupdate, BTStack stack, int access, Snapshot snapshot);
 extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
-			ScanKey scankey, bool nextkey);
+			ScanKey scankey, ItemPointer scantid, OffsetNumber low,
+			bool nextkey);
 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
-			Page page, OffsetNumber offnum);
+			ItemPointer scantid, Page page, OffsetNumber offnum);
+extern int32 _bt_tuple_compare(Relation rel, int keysz, ScanKey scankey,
+				  ItemPointer scantid, IndexTuple itup);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
@@ -601,7 +653,8 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
-extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
+extern IndexTuple _bt_suffix_truncate(Relation rel, IndexTuple lastleft,
+					 IndexTuple firstright);
 extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
 
 /*
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 819373031c..5f3c4a015a 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,8 +28,7 @@
 #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
 #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
 #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
-#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50	/* as above, include truncated highkey */
-#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60	/* as above, include truncated highkey */
+/* 0x50 and 0x60 are unused */
 #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -82,20 +81,16 @@ typedef struct xl_btree_insert
 *
- * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
+ * Note: the two XLOG_BTREE_SPLIT xl_info codes both use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
- * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not).  The _HIGHKEY variants indicate that we've logged
- * explicitly left page high key value, otherwise redo should use right page
- * leftmost key as a left page high key. _HIGHKEY is specified for internal
- * pages where right page leftmost key is suppressed, and for leaf pages
- * of covering indexes where high key have non-key attributes truncated.
+ * left or right split page (and thus, whether newitemoff and the new item are
+ * stored or not).  We always explicitly log the left page high key value.
 *
 * Backup Blk 0: original page / new left page
 *
 * The left page's data portion contains the new item, if it's the _L variant.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * If level > 0, an IndexTuple representing the HIKEY of the left page
- * follows.  We don't need this on leaf pages, because it's the same as the
- * leftmost key in the new right page.
+ * In the _R variants, the new item is one of the right page's tuples.  An
+ * IndexTuple representing the HIKEY of the left page always follows, since
+ * suffix truncation can make it differ from the leftmost key in the new
+ * right page, even on leaf pages.
 *
 * Backup Blk 1: new right page
 *
diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out
index 0b5a9041b0..f4899f2a38 100644
--- a/src/test/regress/expected/domain.out
+++ b/src/test/regress/expected/domain.out
@@ -643,10 +643,10 @@ update domnotnull set col1 = null; -- fails
 ERROR:  domain dnotnulltest does not allow null values
 alter domain dnotnulltest drop not null;
 update domnotnull set col1 = null;
+\set VERBOSITY terse
 drop domain dnotnulltest cascade;
 NOTICE:  drop cascades to 2 other objects
-DETAIL:  drop cascades to column col1 of table domnotnull
-drop cascades to column col2 of table domnotnull
+\set VERBOSITY default
 -- Test ALTER DOMAIN .. DEFAULT ..
 create table domdeftest (col1 ddef1);
 insert into domdeftest default values;
diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
index fc3bbe4deb..1ec8264dfd 100644
--- a/src/test/regress/expected/foreign_key.out
+++ b/src/test/regress/expected/foreign_key.out
@@ -253,13 +253,13 @@ SELECT * FROM FKTABLE;
 (5 rows)
 
 -- this should fail for lack of CASCADE
+\set VERBOSITY terse
 DROP TABLE PKTABLE;
 ERROR:  cannot drop table pktable because other objects depend on it
-DETAIL:  constraint constrname2 on table fktable depends on table pktable
-HINT:  Use DROP ... CASCADE to drop the dependent objects too.
 DROP TABLE PKTABLE CASCADE;
 NOTICE:  drop cascades to constraint constrname2 on table fktable
 DROP TABLE FKTABLE;
+\set VERBOSITY default
 --
 -- First test, check with no on delete or on update
 --
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index dc6262be43..2c20cea4b9 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -5896,8 +5896,8 @@ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
  id1 | id2 | id1 | id2 
 -----+-----+-----+-----
-   1 |   1 |   1 |   1
    1 |   2 |   1 |   2
+   1 |   1 |   1 |   1
 (2 rows)
 
 reset enable_nestloop;
diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out
index 2e26510522..c8b9a71689 100644
--- a/src/test/regress/expected/truncate.out
+++ b/src/test/regress/expected/truncate.out
@@ -276,11 +276,10 @@ SELECT * FROM trunc_faa;
 (0 rows)
 
 ROLLBACK;
+\set VERBOSITY terse
 DROP TABLE trunc_f CASCADE;
 NOTICE:  drop cascades to 3 other objects
-DETAIL:  drop cascades to table trunc_fa
-drop cascades to table trunc_faa
-drop cascades to table trunc_fb
+\set VERBOSITY default
 -- Test ON TRUNCATE triggers
 CREATE TABLE trunc_trigger_test (f1 int, f2 text, f3 text);
 CREATE TABLE trunc_trigger_log (tgop text, tglevel text, tgwhen text,
diff --git a/src/test/regress/expected/typed_table.out b/src/test/regress/expected/typed_table.out
index 2e47ecbcf5..c76efee358 100644
--- a/src/test/regress/expected/typed_table.out
+++ b/src/test/regress/expected/typed_table.out
@@ -75,19 +75,12 @@ CREATE TABLE persons4 OF person_type (
     name WITH OPTIONS DEFAULT ''  -- error, specified more than once
 );
 ERROR:  column "name" specified more than once
+\set VERBOSITY terse
 DROP TYPE person_type RESTRICT;
 ERROR:  cannot drop type person_type because other objects depend on it
-DETAIL:  table persons depends on type person_type
-function get_all_persons() depends on type person_type
-table persons2 depends on type person_type
-table persons3 depends on type person_type
-HINT:  Use DROP ... CASCADE to drop the dependent objects too.
 DROP TYPE person_type CASCADE;
 NOTICE:  drop cascades to 4 other objects
-DETAIL:  drop cascades to table persons
-drop cascades to function get_all_persons()
-drop cascades to table persons2
-drop cascades to table persons3
+\set VERBOSITY default
 CREATE TABLE persons5 OF stuff; -- only CREATE TYPE AS types may be used
 ERROR:  type stuff is not a composite type
 DROP TABLE stuff;
diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out
index e64d693e9c..1ea90181d8 100644
--- a/src/test/regress/expected/updatable_views.out
+++ b/src/test/regress/expected/updatable_views.out
@@ -328,24 +328,10 @@ UPDATE ro_view20 SET b=upper(b);
 ERROR:  cannot update view "ro_view20"
 DETAIL:  Views that return set-returning functions are not automatically updatable.
 HINT:  To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule.
+\set VERBOSITY terse
 DROP TABLE base_tbl CASCADE;
 NOTICE:  drop cascades to 16 other objects
-DETAIL:  drop cascades to view ro_view1
-drop cascades to view ro_view17
-drop cascades to view ro_view2
-drop cascades to view ro_view3
-drop cascades to view ro_view5
-drop cascades to view ro_view6
-drop cascades to view ro_view7
-drop cascades to view ro_view8
-drop cascades to view ro_view9
-drop cascades to view ro_view11
-drop cascades to view ro_view13
-drop cascades to view rw_view15
-drop cascades to view rw_view16
-drop cascades to view ro_view20
-drop cascades to view ro_view4
-drop cascades to view rw_view14
+\set VERBOSITY default
 DROP VIEW ro_view10, ro_view12, ro_view18;
 DROP SEQUENCE uv_seq CASCADE;
 NOTICE:  drop cascades to view ro_view19
diff --git a/src/test/regress/sql/domain.sql b/src/test/regress/sql/domain.sql
index 68da27de22..d19e2c9d28 100644
--- a/src/test/regress/sql/domain.sql
+++ b/src/test/regress/sql/domain.sql
@@ -381,7 +381,9 @@ alter domain dnotnulltest drop not null;
 
 update domnotnull set col1 = null;
 
+\set VERBOSITY terse
 drop domain dnotnulltest cascade;
+\set VERBOSITY default
 
 -- Test ALTER DOMAIN .. DEFAULT ..
 create table domdeftest (col1 ddef1);
diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
index d2cecdf4eb..2c26191980 100644
--- a/src/test/regress/sql/foreign_key.sql
+++ b/src/test/regress/sql/foreign_key.sql
@@ -159,9 +159,11 @@ UPDATE PKTABLE SET ptest1=1 WHERE ptest1=2;
 SELECT * FROM FKTABLE;
 
 -- this should fail for lack of CASCADE
+\set VERBOSITY terse
 DROP TABLE PKTABLE;
 DROP TABLE PKTABLE CASCADE;
 DROP TABLE FKTABLE;
+\set VERBOSITY default
 
 --
diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql
index 6ddfb6dd1d..fee7e76ec3 100644
--- a/src/test/regress/sql/truncate.sql
+++ b/src/test/regress/sql/truncate.sql
@@ -125,7 +125,9 @@ SELECT * FROM trunc_fa;
 SELECT * FROM trunc_faa;
 ROLLBACK;
 
+\set VERBOSITY terse
 DROP TABLE trunc_f CASCADE;
+\set VERBOSITY default
 
 -- Test ON TRUNCATE triggers
diff --git a/src/test/regress/sql/typed_table.sql b/src/test/regress/sql/typed_table.sql
index 9ef0cdfcc7..953cd1f14b 100644
--- a/src/test/regress/sql/typed_table.sql
+++ b/src/test/regress/sql/typed_table.sql
@@ -43,8 +43,10 @@ CREATE TABLE persons4 OF person_type (
     name WITH OPTIONS DEFAULT ''  -- error, specified more than once
 );
 
+\set VERBOSITY terse
 DROP TYPE person_type RESTRICT;
 DROP TYPE person_type CASCADE;
+\set VERBOSITY default
 
 CREATE TABLE persons5 OF stuff; -- only CREATE TYPE AS types may be used
diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql
index dc6d5cbe35..6eaa81b540 100644
--- a/src/test/regress/sql/updatable_views.sql
+++ b/src/test/regress/sql/updatable_views.sql
@@ -98,7 +98,9 @@ DELETE FROM ro_view18;
 UPDATE ro_view19 SET last_value=1000;
 UPDATE ro_view20 SET b=upper(b);
 
+\set VERBOSITY terse
 DROP TABLE base_tbl CASCADE;
+\set VERBOSITY default
 
 DROP VIEW ro_view10, ro_view12, ro_view18;
 DROP SEQUENCE uv_seq CASCADE;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9fe950b29d..f46a0e745d 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2207,6 +2207,8 @@ SpecialJoinInfo
 SpinDelayStatus
 SplitInterval
 SplitLR
+SplitMode
+SplitPoint
 SplitVar
 SplitedPageLayout
 StackElem
-- 
2.17.1
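
A note on the tuplesort.c hunk near the top of this patch: the new comparison
only runs after all key attributes have compared equal, and it deliberately
inverts the natural TID order (TIDs "sort DESC").  The following is a minimal
standalone sketch, not part of the patch, of that tie-breaker in isolation;
the function name is invented for illustration, and only the stock
ItemPointer accessors from storage/itemptr.h are assumed.

#include "postgres.h"
#include "storage/itemptr.h"

/*
 * Compare two heap TIDs the way the patched comparator does once all key
 * attributes are equal: block number first, then offset number, with the
 * result inverted so that higher TIDs sort as "smaller".
 */
static int
btree_tid_cmp_desc(ItemPointer tid1, ItemPointer tid2)
{
	BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
	BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
	OffsetNumber pos1;
	OffsetNumber pos2;

	/* Deliberately inverted, since TIDs "sort DESC" */
	if (blk1 != blk2)
		return (blk1 < blk2) ? 1 : -1;

	pos1 = ItemPointerGetOffsetNumber(tid1);
	pos2 = ItemPointerGetOffsetNumber(tid2);

	/* Same inversion for the offset tie-breaker */
	if (pos1 != pos2)
		return (pos1 < pos2) ? 1 : -1;

	return 0;
}

As the comment in that hunk says, treating heap TID as an implicit last key
attribute is what makes every key in the index physically unique.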
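
Similarly, the nbtree.h hunks pack two things into t_tid's 16-bit offset
field whenever INDEX_ALT_TID_MASK is set: the key attribute count in the low
12 bits (BT_N_KEYS_OFFSET_MASK) and a "heap TID present" flag in bit 0x1000
(BT_HEAP_TID_ATTR), with the remaining three high bits reserved.  Here is a
standalone sketch of the encode/decode arithmetic; the three mask constants
are copied from the patch, while all surrounding scaffolding is invented for
illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mask values copied from the nbtree.h hunk above */
#define BT_RESERVED_OFFSET_MASK	0xF000
#define BT_N_KEYS_OFFSET_MASK	0x0FFF
#define BT_HEAP_TID_ATTR		0x1000

int
main(void)
{
	/* Encode a pivot tuple with 2 key attributes and a heap TID suffix */
	uint16_t	offset = (2 & BT_N_KEYS_OFFSET_MASK) | BT_HEAP_TID_ATTR;

	/* Decode, as BTreeTupleGetNAtts and BTreeTupleGetHeapTID would */
	assert((offset & BT_N_KEYS_OFFSET_MASK) == 2);
	assert((offset & BT_HEAP_TID_ATTR) != 0);

	printf("natts = %u, has heap TID = %d\n",
		   (unsigned) (offset & BT_N_KEYS_OFFSET_MASK),
		   (offset & BT_HEAP_TID_ATTR) != 0);
	return 0;
}

When the flag is set, the TID itself lives in the last
MAXALIGN(sizeof(ItemPointerData)) bytes of the tuple, which is exactly the
address that BTreeTupleGetHeapTID computes.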