From 065b2861c0acec3125fbb67336a6461e64dca285 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 27 Apr 2018 12:47:39 -0700 Subject: [PATCH v7 1/6] Make nbtree indexes have unique keys in tuples. Make nbtree treat all index tuples as having a heap TID trailing attribute. Heap TID becomes a first class part of the key space on all levels of the tree. Index searches can distinguish duplicates by heap TID. This general approach has numerous benefits for performance, and is prerequisite to teaching VACUUM to perform "retail index tuple deletion". Naively adding a new attribute to every pivot tuple has unacceptable overhead (it bloats internal pages), so suffix truncation of pivot tuples is also introduced. This will generally truncate away the "extra" heap TID attribute from pivot tuples during a leaf page split, and may also truncate away additional user attributes. This can increase fan-out, especially when there are several attributes in an index. Truncation can only occur at the attribute granularity, which isn't particularly effective, but works well enough for now. Only new indexes (BTREE_VERSION 4 indexes) will have insertions that treat heap TID as a tie-breaker attribute, or will have pivot tuples undergo suffix truncation during a leaf page split (on-disk compatibility with versions 2 and 3 is preserved). Upgrades to version 4 cannot be performed on-the-fly, unlike upgrades from version 2 to version 3. contrib/amcheck continues to work with BTREE_VERSIONs 2 and 3, while also enforcing the newer/more strict invariants with BTREE_VERSION 4 indexes. We no longer allow a search for free space among multiple pages full of duplicates to "get tired", except when needed to preserve compatibility with earlier versions. This has significant benefits for free space management in secondary indexes on low cardinality attributes. 
However, without the next commit in the patch series (without having "single value" mode and "many duplicates" mode within _bt_findsplitloc()), these cases will be significantly regressed, since they'll naively perform 50:50 splits without there being any hope of reusing space left free on the left half of the split. --- contrib/amcheck/verify_nbtree.c | 266 +++++++-- contrib/file_fdw/output/file_fdw.source | 10 +- contrib/pageinspect/btreefuncs.c | 2 +- contrib/pageinspect/expected/btree.out | 2 +- contrib/pgstattuple/expected/pgstattuple.out | 10 +- src/backend/access/nbtree/README | 162 ++++-- src/backend/access/nbtree/nbtinsert.c | 525 +++++++++++------- src/backend/access/nbtree/nbtpage.c | 199 +++++-- src/backend/access/nbtree/nbtree.c | 2 +- src/backend/access/nbtree/nbtsearch.c | 363 +++++++++--- src/backend/access/nbtree/nbtsort.c | 80 +-- src/backend/access/nbtree/nbtutils.c | 365 ++++++++++-- src/backend/access/nbtree/nbtxlog.c | 43 +- src/backend/access/rmgrdesc/nbtdesc.c | 8 - src/backend/utils/sort/tuplesort.c | 11 +- src/include/access/nbtree.h | 158 +++++- src/include/access/nbtxlog.h | 20 +- .../expected/test_extensions.out | 6 +- src/test/regress/expected/aggregates.out | 4 +- src/test/regress/expected/alter_table.out | 20 +- src/test/regress/expected/collate.out | 3 +- src/test/regress/expected/create_type.out | 8 +- src/test/regress/expected/dependency.out | 4 +- src/test/regress/expected/domain.out | 4 +- src/test/regress/expected/event_trigger.out | 69 ++- src/test/regress/expected/foreign_data.out | 8 +- src/test/regress/expected/foreign_key.out | 4 +- src/test/regress/expected/inherit.out | 16 +- src/test/regress/expected/matview.out | 18 +- src/test/regress/expected/rowsecurity.out | 4 +- src/test/regress/expected/select_into.out | 4 +- src/test/regress/expected/triggers.out | 4 +- src/test/regress/expected/truncate.out | 5 +- src/test/regress/expected/typed_table.out | 11 +- src/test/regress/expected/updatable_views.out | 56 +- 
src/test/regress/output/tablespace.source | 8 +- src/test/regress/sql/collate.sql | 2 + src/test/regress/sql/domain.sql | 2 + src/test/regress/sql/foreign_key.sql | 2 + src/test/regress/sql/truncate.sql | 2 + src/test/regress/sql/typed_table.sql | 2 + src/test/regress/sql/updatable_views.sql | 20 + src/tools/pgindent/typedefs.list | 4 + 43 files changed, 1736 insertions(+), 780 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index a1438a2855..09bcd4442d 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -45,6 +45,13 @@ PG_MODULE_MAGIC; */ #define InvalidBtreeLevel ((uint32) InvalidBlockNumber) +/* + * Convenience macro to get number of key attributes in tuple in low-context + * fashion + */ +#define BTreeTupleGetNKeyAtts(itup, rel) \ + Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel)) + /* * State associated with verifying a B-Tree index * @@ -125,26 +132,28 @@ static void bt_check_every_level(Relation rel, Relation heaprel, static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); -static ScanKey bt_right_page_check_scankey(BtreeCheckState *state); -static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey); +static IndexTuple bt_right_page_check_tuple(BtreeCheckState *state); +static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, + BlockNumber childblock); static void bt_downlink_missing_check(BtreeCheckState *state); static void bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); -static inline bool invariant_leq_offset(BtreeCheckState *state, - ScanKey key, +static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + 
OffsetNumber upperbound); +static inline bool invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound); -static inline bool invariant_geq_offset(BtreeCheckState *state, - ScanKey key, - OffsetNumber lowerbound); -static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state, - Page other, - ScanKey key, - OffsetNumber upperbound); +static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound); +static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, + BTScanInsert key, + Page nontarget, + OffsetNumber upperbound); static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); +static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, + IndexTuple itup, bool isleaf); /* * bt_index_check(index regclass, heapallindexed boolean) @@ -834,8 +843,8 @@ bt_target_page_check(BtreeCheckState *state) { ItemId itemid; IndexTuple itup; - ScanKey skey; size_t tupsize; + BTScanInsert skey; CHECK_FOR_INTERRUPTS(); @@ -902,8 +911,15 @@ bt_target_page_check(BtreeCheckState *state) if (offset_is_negative_infinity(topaque, offset)) continue; - /* Build insertion scankey for current page offset */ - skey = _bt_mkscankey(state->rel, itup); + /* + * Build insertion scankey for current page offset/tuple. + * + * As required by _bt_mkscankey(), track number of key attributes, + * which is needed so that _bt_compare() calls handle truncated + * attributes correctly. Never count non-key attributes in + * non-truncated tuples as key attributes, though. + */ + skey = _bt_mkscankey(state->rel, itup, false); /* Fingerprint leaf page tuples (those that point to the heap) */ if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid)) @@ -956,11 +972,10 @@ bt_target_page_check(BtreeCheckState *state) * * Item order check * * * Check that items are stored on page in logical order, by checking - * current item is less than or equal to next item (if any). 
+ * current item is strictly less than next item (if any). */ if (OffsetNumberNext(offset) <= max && - !invariant_leq_offset(state, skey, - OffsetNumberNext(offset))) + !invariant_l_offset(state, skey, OffsetNumberNext(offset))) { char *itid, *htid, @@ -1017,16 +1032,20 @@ bt_target_page_check(BtreeCheckState *state) */ else if (offset == max) { - ScanKey rightkey; + IndexTuple righttup; + BTScanInsert rightkey; /* Get item in next/right page */ - rightkey = bt_right_page_check_scankey(state); + righttup = bt_right_page_check_tuple(state); - if (rightkey && - !invariant_geq_offset(state, rightkey, max)) + /* Set up right item scankey */ + if (righttup) + rightkey = _bt_mkscankey(state->rel, righttup, false); + + if (righttup && !invariant_g_offset(state, rightkey, max)) { /* - * As explained at length in bt_right_page_check_scankey(), + * As explained at length in bt_right_page_check_tuple(), * there is a known !readonly race that could account for * apparent violation of invariant, which we must check for * before actually proceeding with raising error. Our canary @@ -1069,7 +1088,7 @@ bt_target_page_check(BtreeCheckState *state) { BlockNumber childblock = BTreeInnerTupleGetDownLink(itup); - bt_downlink_check(state, childblock, skey); + bt_downlink_check(state, skey, childblock); } } @@ -1083,9 +1102,9 @@ bt_target_page_check(BtreeCheckState *state) } /* - * Return a scankey for an item on page to right of current target (or the + * Return an index tuple for an item on page to right of current target (or the * first non-ignorable page), sufficient to check ordering invariant on last - * item in current target page. Returned scankey relies on local memory + * item in current target page. Returned tuple relies on local memory * allocated for the child page, which caller cannot pfree(). Caller's memory * context should be reset between calls here. 
* @@ -1098,8 +1117,8 @@ bt_target_page_check(BtreeCheckState *state) * Note that !readonly callers must reverify that target page has not * been concurrently deleted. */ -static ScanKey -bt_right_page_check_scankey(BtreeCheckState *state) +static IndexTuple +bt_right_page_check_tuple(BtreeCheckState *state) { BTPageOpaque opaque; ItemId rightitem; @@ -1287,11 +1306,10 @@ bt_right_page_check_tuple(BtreeCheckState *state) } /* - * Return first real item scankey. Note that this relies on right page - * memory remaining allocated. + * Return first real item. Note that this relies on right page memory + * remaining allocated. */ - return _bt_mkscankey(state->rel, - (IndexTuple) PageGetItem(rightpage, rightitem)); + return (IndexTuple) PageGetItem(rightpage, rightitem); } /* @@ -1304,8 +1322,8 @@ bt_right_page_check_tuple(BtreeCheckState *state) * verification this way around is much more practical. */ static void -bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, - ScanKey targetkey) +bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, + BlockNumber childblock) { OffsetNumber offset; OffsetNumber maxoffset; @@ -1354,7 +1372,8 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, /* * Verify child page has the downlink key from target page (its parent) as - * a lower bound. + * a lower bound; downlink must be strictly less than all keys on the + * page. * * Check all items, rather than checking just the first and trusting that * the operator class obeys the transitive law. @@ -1404,14 +1423,13 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, /* * Skip comparison of target page key against "negative infinity" * item, if any. Checking it would indicate that it's not an upper - * bound, but that's only because of the hard-coding within - * _bt_compare(). + * bound, but that's only because of the hard-coding for negative + * infinity items within _bt_compare(). 
*/ if (offset_is_negative_infinity(copaque, offset)) continue; - if (!invariant_leq_nontarget_offset(state, child, - targetkey, offset)) + if (!invariant_l_nontarget_offset(state, targetkey, child, offset)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("down-link lower bound invariant violated for index \"%s\"", @@ -1751,6 +1769,60 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque); } +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item? + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound) +{ + int32 cmp; + + /* + * pg_upgrade'd indexes may legally have equal sibling tuples. Their + * pivot tuples can never have key attributes truncated away. + */ + if (!key->uniquekeys) + return invariant_leq_offset(state, key, upperbound); + + cmp = _bt_compare(state->rel, key, state->target, upperbound); + + /* + * _bt_compare interprets the absence of attributes in scan keys as + * meaning that they're not participating in a search, not as negative + * infinity (only tuples within the index are treated as negative + * infinity). Compensate for that here. 
+ */ + if (cmp == 0) + { + BTPageOpaque topaque; + ItemId itemid; + IndexTuple ritup; + int uppnkeyatts; + ItemPointer rheaptid; + + itemid = PageGetItemId(state->target, upperbound); + ritup = (IndexTuple) PageGetItem(state->target, itemid); + uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel); + + /* Get heap TID for item to the right */ + topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, + P_ISLEAF(topaque)); + + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && rheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; +} + /* * Does the invariant hold that the key is less than or equal to a given upper * bound offset item? @@ -1759,57 +1831,104 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) * to corruption. */ static inline bool -invariant_leq_offset(BtreeCheckState *state, ScanKey key, +invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound); return cmp <= 0; } /* - * Does the invariant hold that the key is greater than or equal to a given - * lower bound offset item? + * Does the invariant hold that the key is strictly greater than a given lower + * bound offset item? * * If this function returns false, convention is that caller throws error due * to corruption. 
*/ static inline bool -invariant_geq_offset(BtreeCheckState *state, ScanKey key, - OffsetNumber lowerbound) +invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound); + cmp = _bt_compare(state->rel, key, state->target, lowerbound); - return cmp >= 0; + /* + * pg_upgrade'd indexes may legally have equal sibling tuples. Their + * pivot tuples can never have key attributes truncated away. + */ + if (!key->uniquekeys) + return cmp >= 0; + + /* + * No need to consider the possibility that scankey has attributes that we + * need to force to be interpreted as negative infinity. That could cause + * us to miss the fact that the scankey is less than rather than equal to + * its lower bound, but the index is corrupt either way. + */ + return cmp > 0; } /* - * Does the invariant hold that the key is less than or equal to a given upper + * Does the invariant hold that the key is strictly less than a given upper * bound offset item, with the offset relating to a caller-supplied page that - * is not the current target page? Caller's non-target page is typically a - * child page of the target, checked as part of checking a property of the - * target page (i.e. the key comes from the target). + * is not the current target page? + * + * Caller's non-target page is a child page of the target, checked as part of + * checking a property of the target page (i.e. the key comes from the + * target). * * If this function returns false, convention is that caller throws error due * to corruption. 
*/ static inline bool -invariant_leq_nontarget_offset(BtreeCheckState *state, - Page nontarget, ScanKey key, - OffsetNumber upperbound) +invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, + Page nontarget, OffsetNumber upperbound) { - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); int32 cmp; - cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound); - return cmp <= 0; + /* + * pg_upgrade'd indexes may legally have equal sibling tuples. Their + * pivot tuples can never have key attributes truncated away. + */ + if (!key->uniquekeys) + return cmp <= 0; + + /* + * _bt_compare interprets the absence of attributes in scan keys as + * meaning that they're not participating in a search, not as negative + * infinity (only tuples within the index are treated as negative + * infinity). Compensate for that here. + */ + if (cmp == 0) + { + ItemId itemid; + IndexTuple child; + int uppnkeyatts; + ItemPointer childheaptid; + BTPageOpaque copaque; + + copaque = (BTPageOpaque) PageGetSpecialPointer(nontarget); + itemid = PageGetItemId(nontarget, upperbound); + child = (IndexTuple) PageGetItem(nontarget, itemid); + uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel); + + /* Get heap TID for item from child/non-target */ + childheaptid = + BTreeTupleGetHeapTIDCareful(state, child, P_ISLEAF(copaque)); + + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && childheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; } /* @@ -1965,3 +2084,32 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) return page; } + +/* + * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must + * be present in cases where that is mandatory. + * + * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK + * bit is effectively a proxy for whether or not the tuple is a pivot tuple. 
+ * It may become more useful in the future, when non-pivot tuples support their + * own alternative INDEX_ALT_TID_MASK representation. + * + * Note that it is incorrect to specify the tuple as a non-pivot when passing a + * leaf tuple that came from the high key offset, since that is actually a + * pivot tuple. + */ +static inline ItemPointer +BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, + bool nonpivot) +{ + ItemPointer result = BTreeTupleGetHeapTID(itup); + BlockNumber targetblock = state->targetblock; + + if (result == NULL && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", + targetblock, RelationGetRelationName(state->rel)))); + + return result; +} diff --git a/contrib/file_fdw/output/file_fdw.source b/contrib/file_fdw/output/file_fdw.source index 853c9f9b28..42bf16ba70 100644 --- a/contrib/file_fdw/output/file_fdw.source +++ b/contrib/file_fdw/output/file_fdw.source @@ -432,10 +432,10 @@ RESET ROLE; DROP EXTENSION file_fdw CASCADE; NOTICE: drop cascades to 7 other objects DETAIL: drop cascades to server file_server -drop cascades to user mapping for regress_file_fdw_superuser on server file_server -drop cascades to user mapping for regress_no_priv_user on server file_server -drop cascades to foreign table agg_text -drop cascades to foreign table agg_csv -drop cascades to foreign table agg_bad drop cascades to foreign table text_csv +drop cascades to foreign table agg_bad +drop cascades to foreign table agg_csv +drop cascades to foreign table agg_text +drop cascades to user mapping for regress_no_priv_user on server file_server +drop cascades to user mapping for regress_file_fdw_superuser on server file_server DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 
184ac62255..bee1f1c9d9 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -560,7 +560,7 @@ bt_metap(PG_FUNCTION_ARGS) * Get values of extended metadata if available, use default values * otherwise. */ - if (metad->btm_version == BTREE_VERSION) + if (metad->btm_version >= BTREE_META_VERSION) { values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples); diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 2aaa4df53b..07c2dcd771 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -5,7 +5,7 @@ CREATE INDEX test1_a_idx ON test1 USING btree (a); SELECT * FROM bt_metap('test1_a_idx'); -[ RECORD 1 ]-----------+------- magic | 340322 -version | 3 +version | 4 root | 1 level | 0 fastroot | 1 diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 9858ea69d4..9920dbfd40 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -48,7 +48,7 @@ select version, tree_level, from pgstatindex('test_pkey'); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -58,7 +58,7 @@ select version, tree_level, from pgstatindex('test_pkey'::text); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -68,7 +68,7 @@ select version, tree_level, from pgstatindex('test_pkey'::name); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -78,7 +78,7 @@ select version, tree_level, from pgstatindex('test_pkey'::regclass); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select pg_relpages('test'); @@ -232,7 +232,7 @@ create index test_partition_hash_idx on test_partition using hash (a); select pgstatindex('test_partition_idx'); pgstatindex ------------------------------ - (3,0,8192,0,0,0,0,0,NaN,NaN) + (4,0,8192,0,0,0,0,0,NaN,NaN) (1 row) select pgstathashindex('test_partition_hash_idx'); diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 3680e69b89..43545311da 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -34,30 +34,47 @@ Differences to the Lehman & Yao algorithm We have made the following changes in order to incorporate the L&Y algorithm into Postgres: -The requirement that all btree keys be unique is too 
onerous, -but the algorithm won't work correctly without it. Fortunately, it is -only necessary that keys be unique on a single tree level, because L&Y -only use the assumption of key uniqueness when re-finding a key in a -parent page (to determine where to insert the key for a split page). -Therefore, we can use the link field to disambiguate multiple -occurrences of the same user key: only one entry in the parent level -will be pointing at the page we had split. (Indeed we need not look at -the real "key" at all, just at the link field.) We can distinguish -items at the leaf level in the same way, by examining their links to -heap tuples; we'd never have two items for the same heap tuple. +The requirement that all btree keys be unique is satisfied by treating +heap TID as a tie-breaker attribute. Logical duplicates are sorted in +heap item pointer order. We don't use btree keys to disambiguate +downlinks from the internal pages during a page split, though: only +one entry in the parent level will be pointing at the page we just +split, so the link fields can be used to re-find downlinks in the +parent via a linear search. -Lehman and Yao assume that the key range for a subtree S is described -by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent -page. This does not work for nonunique keys (for example, if we have -enough equal keys to spread across several leaf pages, there *must* be -some equal bounding keys in the first level up). Therefore we assume -Ki <= v <= Ki+1 instead. A search that finds exact equality to a -bounding key in an upper tree level must descend to the left of that -key to ensure it finds any equal keys in the preceding page. An -insertion that sees the high key of its target page is equal to the key -to be inserted has a choice whether or not to move right, since the new -key could go on either page. (Currently, we try to find a page where -there is room for the new key without a split.) 
+Lehman and Yao require that the key range for a subtree S is described +by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the +parent page, but do not account for the need to search the tree based +only on leading index attributes in a composite index. Since heap TID +is always used to make btree keys unique (even in unique indexes), +every btree index is treated as a composite index internally. A +search that finds exact equality to a pivot tuple in an upper tree +level must descend to the left of that key to ensure it finds any +equal keys, even when scan values were provided for all attributes. +An insertion that sees that the high key of its target page is equal +to the key to be inserted cannot move right, since the downlink for +the right sibling in the parent must always be strictly less than +right sibling keys (this is always possible because the leftmost +downlink on any non-leaf level is always a negative infinity +downlink). + +We might be able to avoid moving left in the event of a full match on +all attributes up to and including the heap TID attribute, but that +would be a very narrow win, since it's rather unlikely that heap TID +will be an exact match. We can avoid moving left unnecessarily when +all user-visible keys are equal by avoiding exact equality; a +sentinel value that's less than any possible heap TID is used by most +index scans. This is effective because of suffix truncation. An +"extra" heap TID attribute in pivot tuples is almost always avoided. +All truncated attributes compare as minus infinity, even against a +sentinel value, and the sentinel value is less than any real TID +value, so an unnecessary move to the left is avoided regardless of +whether or not a heap TID is present in the otherwise-equal pivot +tuple. Consistently moving left on full equality is also needed by +page deletion, which re-finds a leaf page by descending the tree while +searching on the leaf page's high key. 
If we wanted to avoid moving +left without breaking page deletion, we'd have to avoid suffix +truncation, which could never be worth it. Lehman and Yao don't require read locks, but assume that in-memory copies of tree pages are unshared. Postgres shares in-memory buffers @@ -598,33 +615,60 @@ the order of multiple keys for a given column is unspecified.) An insertion scankey uses the same array-of-ScanKey data structure, but the sk_func pointers point to btree comparison support functions (ie, 3-way comparators that return int4 values interpreted as <0, =0, >0). In an -insertion scankey there is exactly one entry per index column. Insertion -scankeys are built within the btree code (eg, by _bt_mkscankey()) and are -used to locate the starting point of a scan, as well as for locating the -place to insert a new index tuple. (Note: in the case of an insertion -scankey built from a search scankey, there might be fewer keys than -index columns, indicating that we have no constraints for the remaining -index columns.) After we have located the starting point of a scan, the -original search scankey is consulted as each index entry is sequentially -scanned to decide whether to return the entry and whether the scan can -stop (see _bt_checkkeys()). +insertion scankey there is exactly one entry per index column. There is +also other data about the rules used to locate where to begin the scan, +such as whether or not the scan is a "nextkey" scan. Insertion scankeys +are built within the btree code (eg, by _bt_mkscankey()) and are used to +locate the starting point of a scan, as well as for locating the place to +insert a new index tuple. (Note: in the case of an insertion scankey built +from a search scankey, there might be fewer keys than index columns, +indicating that we have no constraints for the remaining index columns.) 
+After we have located the starting point of a scan, the original search +scankey is consulted as each index entry is sequentially scanned to decide +whether to return the entry and whether the scan can stop (see +_bt_checkkeys()). We use term "pivot" index tuples to distinguish tuples which don't point -to heap tuples, but rather used for tree navigation. Pivot tuples includes -all tuples on non-leaf pages and high keys on leaf pages. Note that pivot -index tuples are only used to represent which part of the key space belongs -on each page, and can have attribute values copied from non-pivot tuples -that were deleted and killed by VACUUM some time ago. In principle, we could -truncate away attributes that are not needed for a page high key during a leaf -page split, provided that the remaining attributes distinguish the last index -tuple on the post-split left page as belonging on the left page, and the first -index tuple on the post-split right page as belonging on the right page. This -optimization is sometimes called suffix truncation, and may appear in a future -release. Since the high key is subsequently reused as the downlink in the -parent page for the new right page, suffix truncation can increase index -fan-out considerably by keeping pivot tuples short. INCLUDE indexes similarly -truncate away non-key attributes at the time of a leaf page split, -increasing fan-out. +to heap tuples, that are used only for tree navigation. Pivot tuples +include all tuples on non-leaf pages and high keys on leaf pages. Note +that pivot index tuples are only used to represent which part of the key +space belongs on each page, and can have attribute values copied from +non-pivot tuples that were deleted and killed by VACUUM some time ago. 
+ +Notes about suffix truncation +----------------------------- + +We truncate away suffix key attributes that are not needed for a page high +key during a leaf page split when the remaining attributes distinguish the +last index tuple on the post-split left page as belonging on the left page, +and the first index tuple on the post-split right page as belonging on the +right page. A truncated tuple logically retains all key attributes, though +they implicitly have "negative infinity" as their value, and have no +storage overhead. Since the high key is subsequently reused as the +downlink in the parent page for the new right page, suffix truncation makes +pivot tuples short. INCLUDE indexes are guaranteed to have non-key +attributes truncated at the time of a leaf page split, but may also have +some key attributes truncated away, based on the usual criteria for key +attributes. They are not a special case, since non-key attributes are +merely payload to B-Tree searches. + +The goal of suffix truncation of key attributes is to improve index +fan-out. The technique was first described by Bayer and Unterauer (R.Bayer +and K.Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol +2, No. 1, March 1977, pp 11-26). The Postgres implementation is loosely +based on their paper. Note that Postgres only implements what the paper +refers to as simple prefix B-Trees. Note also that the paper assumes that +the tree has keys that consist of single strings that maintain the "prefix +property", much like strings that are stored in a suffix tree (comparisons +of earlier bytes must always be more significant than comparisons of later +bytes, and, in general, the strings must compare in a way that doesn't +break transitive consistency as they're split into pieces). 
Suffix +truncation in Postgres currently only works at the whole-attribute +granularity, but it would be straightforward to invent opclass +infrastructure that manufactures a smaller attribute value in the case of +variable-length types, such as text. An opclass support function could +manufacture the shortest possible key value that still correctly separates +each half of a leaf page split. Notes About Data Representation ------------------------------- @@ -642,9 +686,10 @@ so we have to play some games. On a page that is not rightmost in its tree level, the "high key" is kept in the page's first item, and real data items start at item 2. The link portion of the "high key" item goes unused. A page that is -rightmost has no "high key", so data items start with the first item. -Putting the high key at the left, rather than the right, may seem odd, -but it avoids moving the high key as we add data items. +rightmost has no "high key" (it's implicitly positive infinity), so data +items start with the first item. Putting the high key at the left, rather +than the right, may seem odd, but it avoids moving the high key as we add +data items. On a leaf page, the data items are simply links to (TIDs of) tuples in the relation being indexed, with the associated key values. @@ -658,4 +703,17 @@ downlink. The first data item on each such page has no lower bound routines must treat it accordingly. The actual key stored in the item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer -than key. +than key. Suffix truncation's negative infinity attributes behave in +the same way. + +Non-leaf pages only truly need to truncate their first item to zero +attributes at the leftmost level, since that truly is negative infinity. +All other negative infinity items are only really negative infinity +within the subtree that the page is at the root of (or is a leftmost +page within). 
We truncate away all attributes of the first item on +non-leaf pages just the same, to save a little space. If we ever +avoided zero-truncating items on pages where that doesn't accurately +represent the absolute separation of the keyspace, we'd be left with +"low key" items on internal pages -- a key value that can be used as a +lower bound on items on the page, much like the high key is an upper +bound. diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 582e5b0652..77bc6ee9b3 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -52,19 +52,19 @@ typedef struct static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); -static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, - Relation heapRel, Buffer buf, OffsetNumber offset, - ScanKey itup_scankey, - IndexUniqueCheck checkUnique, bool *is_unique, - uint32 *speculativeToken); -static void _bt_findinsertloc(Relation rel, +static TransactionId _bt_check_unique(Relation rel, BTScanInsert itup_scankey, + IndexTuple itup, Relation heapRel, Buffer buf, + OffsetNumber offset, IndexUniqueCheck checkUnique, + bool *is_unique, uint32 *speculativeToken); +static OffsetNumber _bt_findinsertloc(Relation rel, + BTScanInsert itup_scankey, Buffer *bufptr, - OffsetNumber *offsetptr, - int keysz, - ScanKey scankey, + bool restorebinsrch, IndexTuple newtup, BTStack stack, Relation heapRel); +static bool _bt_findinsertrandom(Relation rel, Relation heapRel, Buffer buf, + bool *restorebinsrch, Size itemsz); static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, @@ -72,7 +72,7 @@ static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem, bool newitemonleft); + IndexTuple newitem, bool newitemonleft, bool 
truncate); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, @@ -84,8 +84,8 @@ static void _bt_checksplitloc(FindSplitData *state, int dataitemstoleft, Size firstoldonrightsz); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); -static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, - int keysz, ScanKey scankey); +static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_scankey, + Page page, OffsetNumber offnum); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); /* @@ -111,18 +111,21 @@ _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel) { bool is_unique = false; - int indnkeyatts; - ScanKey itup_scankey; + BTScanInsert itup_scankey; BTStack stack = NULL; Buffer buf; - OffsetNumber offset; + Page page; + BTPageOpaque lpageop; bool fastpath; - indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - Assert(indnkeyatts != 0); + Assert(IndexRelationGetNumberOfKeyAttributes(rel) != 0); /* we need an insertion scan key to do our search, so build one */ - itup_scankey = _bt_mkscankey(rel, itup); + itup_scankey = _bt_mkscankey(rel, itup, false); +top: + /* Cannot use real heap TID in unique case -- it'll be restored later */ + if (itup_scankey->uniquekeys && checkUnique != UNIQUE_CHECK_NO) + itup_scankey->scantid = _bt_lowest_scantid(); /* * It's very common to have an index on an auto-incremented or @@ -143,14 +146,10 @@ _bt_doinsert(Relation rel, IndexTuple itup, * other backend might be concurrently inserting into the page, thus * reducing our chances to finding an insertion place in this page. 
*/ -top: fastpath = false; - offset = InvalidOffsetNumber; if (RelationGetTargetBlock(rel) != InvalidBlockNumber) { Size itemsz; - Page page; - BTPageOpaque lpageop; /* * Conditionally acquire exclusive lock on the buffer before doing any @@ -174,14 +173,17 @@ top: /* * Check if the page is still the rightmost leaf page, has enough * free space to accommodate the new tuple, and the insertion scan - * key is strictly greater than the first key on the page. + * key is strictly greater than the first key on the page. The + * itup_scankey scantid value may prevent the optimization from + * being applied despite being safe when it was temporarily set to + * a sentinel low value, though only when the page is full of + * duplicates. */ if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && !P_IGNORE(lpageop) && (PageGetFreeSpace(page) > itemsz) && PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && - _bt_compare(rel, indnkeyatts, itup_scankey, page, - P_FIRSTDATAKEY(lpageop)) > 0) + _bt_compare(rel, itup_scankey, page, P_FIRSTDATAKEY(lpageop)) > 0) { /* * The right-most block should never have an incomplete split. @@ -220,8 +222,7 @@ top: * Find the first page containing this key. Buffer returned by * _bt_search() is locked in exclusive mode. */ - stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, - NULL); + stack = _bt_search(rel, itup_scankey, &buf, BT_WRITE, NULL); } /* @@ -231,12 +232,13 @@ top: * NOTE: obviously, _bt_check_unique can only detect keys that are already * in the index; so it cannot defend against concurrent insertions of the * same key. We protect against that by means of holding a write lock on - * the target page. Any other would-be inserter of the same key must - * acquire a write lock on the same target page, so only one would-be - * inserter can be making the check at one time. 
Furthermore, once we are - * past the check we hold write locks continuously until we have performed - * our insertion, so no later inserter can fail to see our insertion. - * (This requires some care in _bt_findinsertloc.) + * the first page the value could be on, regardless of the value of its + * implicit heap TID tie-breaker attribute. Any other would-be inserter + * of the same key must acquire a write lock on the same page, so only one + * would-be inserter can be making the check at one time. Furthermore, + * once we are past the check we hold write locks continuously until we + * have performed our insertion, so no later inserter can fail to see our + * insertion. (This requires some care in _bt_findinsertloc.) * * If we must wait for another xact, we release the lock while waiting, * and then must start over completely. @@ -249,9 +251,24 @@ top: { TransactionId xwait; uint32 speculativeToken; + OffsetNumber offset; - offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false); - xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Arrange for the later _bt_findinsertloc call to _bt_binsrch to + * avoid repeating the work done during this initial _bt_binsrch call. + * Clear the _bt_lowest_scantid-supplied scantid value first, though, + * so that the itup_scankey-cached low and high bounds will enclose a + * range of offsets in the event of multiple duplicates. (Our + * _bt_binsrch call cannot be allowed to incorrectly enclose a single + * offset: the offset of the first duplicate among many on the page.) 
+ */ + itup_scankey->scantid = NULL; + itup_scankey->savebinsrch = true; + offset = _bt_binsrch(rel, itup_scankey, buf); + xwait = _bt_check_unique(rel, itup_scankey, itup, heapRel, buf, offset, checkUnique, &is_unique, &speculativeToken); if (TransactionIdIsValid(xwait)) @@ -274,10 +291,16 @@ top: _bt_freestack(stack); goto top; } + + /* Uniqueness is established -- restore heap tid as scantid */ + if (itup_scankey->uniquekeys) + itup_scankey->scantid = &itup->t_tid; } if (checkUnique != UNIQUE_CHECK_EXISTING) { + OffsetNumber insertoff; + /* * The only conflict predicate locking cares about for indexes is when * an index tuple insert conflicts with an existing lock. Since the @@ -288,10 +311,11 @@ top: * attributes are not considered part of the key space. */ CheckForSerializableConflictIn(rel, NULL, buf); - /* do the insertion */ - _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup, - stack, heapRel); - _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); + /* do the insertion, possibly on a page to the right in unique case */ + insertoff = _bt_findinsertloc(rel, itup_scankey, &buf, + checkUnique != UNIQUE_CHECK_NO, itup, + stack, heapRel); + _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, insertoff, false); } else { @@ -302,7 +326,7 @@ top: /* be tidy */ if (stack) _bt_freestack(stack); - _bt_freeskey(itup_scankey); + pfree(itup_scankey); return is_unique; } @@ -327,13 +351,12 @@ top: * core code must redo the uniqueness check later. 
*/ static TransactionId -_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, - Buffer buf, OffsetNumber offset, ScanKey itup_scankey, - IndexUniqueCheck checkUnique, bool *is_unique, - uint32 *speculativeToken) +_bt_check_unique(Relation rel, BTScanInsert itup_scankey, + IndexTuple itup, Relation heapRel, Buffer buf, + OffsetNumber offset, IndexUniqueCheck checkUnique, + bool *is_unique, uint32 *speculativeToken) { TupleDesc itupdesc = RelationGetDescr(rel); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); SnapshotData SnapshotDirty; OffsetNumber maxoff; Page page; @@ -344,6 +367,10 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* Assume unique until we find a duplicate */ *is_unique = true; + /* Fast path for case where there are clearly no duplicates */ + if (itup_scankey->low == itup_scankey->high) + return InvalidTransactionId; + InitDirtySnapshot(SnapshotDirty); page = BufferGetPage(buf); @@ -392,7 +419,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * in real comparison, but only for ordering/finding items on * pages. - vadim 03/24/97 */ - if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey)) + if (!_bt_isequal(itupdesc, itup_scankey, page, offset)) break; /* we're past all the equal tuples */ /* okay, we gotta fetch the heap tuple ... */ @@ -553,11 +580,29 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, offset = OffsetNumberNext(offset); else { - /* If scankey == hikey we gotta check the next page too */ + /* + * If scankey <= hikey (leaving out the heap TID attribute), we + * gotta check the next page too. + * + * We cannot get away with giving up without going to the next + * page when true key values are all == hikey, because heap TID is + * ignored when considering duplicates (caller is sure to not + * provide a scantid in scankey). 
We could get away with this in + * a hypothetical world where unique indexes certainly never + * contain physical duplicates, since heap TID would never be + * treated as part of the keyspace --- not here, and not at any + * other point. + * + * If we end up moving right when scankey == hikey, then in + * practice there is a very strong chance that visiting the next + * page will find duplicates that need to be checked. Caller's + * _bt_lowest_scantid() optimization already eliminates all cases + * where visiting an extra leaf page is truly unnecessary. + */ + Assert(itup_scankey->scantid == NULL); if (P_RIGHTMOST(opaque)) break; - if (!_bt_isequal(itupdesc, page, P_HIKEY, - indnkeyatts, itup_scankey)) + if (_bt_compare(rel, itup_scankey, page, P_HIKEY) > 0) break; /* Advance to next non-dead page --- there must be one */ for (;;) @@ -599,40 +644,40 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* - * _bt_findinsertloc() -- Finds an insert location for a tuple + * _bt_findinsertloc() -- Finds an insert location for a new tuple * - * If the new key is equal to one or more existing keys, we can - * legitimately place it anywhere in the series of equal keys --- in fact, - * if the new key is equal to the page's "high key" we can place it on - * the next page. If it is equal to the high key, and there's not room - * to insert the new tuple on the current page without splitting, then - * we can move right hoping to find more free space and avoid a split. - * (We should not move right indefinitely, however, since that leads to - * O(N^2) insertion behavior in the presence of many equal keys.) - * Once we have chosen the page to put the key on, we'll insert it before - * any existing equal keys because of the way _bt_binsrch() works. + * On entry, *bufptr contains the page that the new tuple unambiguously + * belongs on. 
This may not be quite right for callers that just called + _bt_check_unique(), though, since they won't have initially searched + using a scantid. They'll have to insert into a page somewhere to the + right in rare cases where there are many physical duplicates in a + unique index. * - * If there's not enough room in the space, we try to make room by - * removing any LP_DEAD tuples. + * _bt_check_unique() callers arrange for their insertion scan key to + save the progress of the last binary search performed. No additional + binary search comparisons occur in the common case where there was no + existing duplicate tuple, though we may occasionally still not be able + to reuse their work for our own reasons. Even when there are garbage + duplicates, very few binary search comparisons will be performed + without being strictly necessary. (Doesn't seem worthwhile to + optimize this further.) * - * On entry, *bufptr and *offsetptr point to the first legal position - * where the new tuple could be inserted. The caller should hold an - * exclusive lock on *bufptr. *offsetptr can also be set to - * InvalidOffsetNumber, in which case the function will search for the - * right location within the page if needed. On exit, they point to the - * chosen insert location. If _bt_findinsertloc decides to move right, - * the lock and pin on the original page will be released and the new - * page returned to the caller is exclusively locked instead. + * The caller should hold an exclusive lock on *bufptr in all cases. On + * exit, bufptr points to the chosen insert location in all cases. If + * we have to move right, the lock and pin on the original page will be + * released, and the new page returned to the caller is exclusively + * locked instead. In any case, we return the offset that caller should + * use to insert into the buffer pointed to by bufptr on return. 
* - * newtup is the new tuple we're inserting, and scankey is an insertion - * type scan key for it. + * This is also where opportunistic microvacuuming of LP_DEAD tuples + * occurs. It has to happen here, since it may invalidate caller's + * restorebinsrch hint. */ -static void +static OffsetNumber _bt_findinsertloc(Relation rel, + BTScanInsert itup_scankey, Buffer *bufptr, - OffsetNumber *offsetptr, - int keysz, - ScanKey scankey, + bool restorebinsrch, IndexTuple newtup, BTStack stack, Relation heapRel) @@ -641,91 +686,55 @@ _bt_findinsertloc(Relation rel, Page page = BufferGetPage(buf); Size itemsz; BTPageOpaque lpageop; - bool movedright, - vacuumed; OffsetNumber newitemoff; - OffsetNumber firstlegaloff = *offsetptr; + + Assert(!itup_scankey->uniquekeys || itup_scankey->scantid != NULL); + Assert(itup_scankey->uniquekeys || itup_scankey->scantid == NULL); + Assert(itup_scankey->scantid == NULL || + ItemPointerCompare(itup_scankey->scantid, _bt_lowest_scantid()) > 0); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); - itemsz = IndexTupleSize(newtup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ - /* - * Check whether the item can fit on a btree page at all. (Eventually, we - * ought to try to apply TOAST methods if not.) We actually need to be - * able to fit three items on every page, so restrict any one item to 1/3 - * the per-page available space. Note that at this point, itemsz doesn't - * include the ItemId. - * - * NOTE: if you change this, see also the similar code in _bt_buildadd(). 
- */ if (itemsz > BTMaxItemSize(page)) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", - itemsz, BTMaxItemSize(page), - RelationGetRelationName(rel)), - errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" - "Consider a function index of an MD5 hash of the value, " - "or use full text indexing."), - errtableconstraint(heapRel, - RelationGetRelationName(rel)))); + _bt_check_third_page(rel, heapRel, page, newtup); - /*---------- - * If we will need to split the page to put the item on this page, - * check whether we can put the tuple somewhere to the right, - * instead. Keep scanning right until we - * (a) find a page with enough free space, - * (b) reach the last page where the tuple can legally go, or - * (c) get tired of searching. - * (c) is not flippant; it is important because if there are many - * pages' worth of equal keys, it's better to split one of the early - * pages than to scan all the way to the end of the run of equal keys - * on every insert. We implement "get tired" as a random choice, - * since stopping after scanning a fixed number of pages wouldn't work - * well (we'd never reach the right-hand side of previously split - * pages). Currently the probability of moving right is set at 0.99, - * which may seem too high to change the behavior much, but it does an - * excellent job of preventing O(N^2) behavior with many equal keys. 
- *---------- - */ - movedright = false; - vacuumed = false; - while (PageGetFreeSpace(page) < itemsz) + for (;;) { Buffer rbuf; BlockNumber rblkno; + int cmpval; - /* - * before considering moving right, see if we can obtain enough space - * by erasing LP_DEAD items - */ - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) + Assert(P_ISLEAF(lpageop)); + + if (P_RIGHTMOST(lpageop)) + break; + + cmpval = _bt_compare(rel, itup_scankey, page, P_HIKEY); + if (itup_scankey->uniquekeys) + { + /* Version 4 -- handle possible concurrent page splits */ + if (cmpval <= 0) + break; + } + else { - _bt_vacuum_one_page(rel, buf, heapRel); - /* - * remember that we vacuumed this page, because that makes the - * hint supplied by the caller invalid + * Version 2 or 3 -- handle possible concurrent page splits, and + * case when there are many duplicates, and there is a choice of + * which page to place new tuple on */ - vacuumed = true; - - if (PageGetFreeSpace(page) >= itemsz) - break; /* OK, now we have enough space */ + if (cmpval != 0 || _bt_findinsertrandom(rel, heapRel, buf, + &restorebinsrch, itemsz)) + break; } /* - * nope, so check conditions (b) and (c) enumerated above - */ - if (P_RIGHTMOST(lpageop) || - _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || - random() <= (MAX_RANDOM_VALUE / 100)) - break; - - /* - * step right to next non-dead page + * step right to next non-dead page. this is only needed for unique + * indexes, and pg_upgrade'd indexes that still use BTREE_VERSION 2 or + * 3, where heap TID isn't considered to be a part of the keyspace. * * must write-lock that page before releasing write lock on current * page; else someone else's _bt_check_unique scan could fail to see @@ -764,27 +773,79 @@ _bt_findinsertloc(Relation rel, } _bt_relbuf(rel, buf); buf = rbuf; - movedright = true; - vacuumed = false; + restorebinsrch = false; } /* - * Now we are on the right page, so find the insert position. 
If we moved - * right at all, we know we should insert at the start of the page. If we - * didn't move right, we can use the firstlegaloff hint if the caller - * supplied one, unless we vacuumed the page which might have moved tuples - * around making the hint invalid. If we didn't move right or can't use - * the hint, find the position by searching. + * Perform micro-vacuuming of the page we're about to insert tuple on if + * it looks like it has LP_DEAD items. Only micro-vacuum when it might + * forestall a page split, though. (Micro-vacuuming occasionally fails to + * prevent a split, since we're not guaranteed to free more space than + * what will be needed for our single new tuple.) */ - if (movedright) - newitemoff = P_FIRSTDATAKEY(lpageop); - else if (firstlegaloff != InvalidOffsetNumber && !vacuumed) - newitemoff = firstlegaloff; - else - newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + if (P_HAS_GARBAGE(lpageop) && PageGetFreeSpace(page) < itemsz) + { + _bt_vacuum_one_page(rel, buf, heapRel); + + restorebinsrch = false; + } + + /* _bt_check_unique() callers often avoid binary search effort */ + itup_scankey->restorebinsrch = restorebinsrch; + newitemoff = _bt_binsrch(rel, itup_scankey, buf); + Assert(!itup_scankey->restorebinsrch); + Assert(!restorebinsrch || + newitemoff == _bt_binsrch(rel, itup_scankey, buf)); *bufptr = buf; - *offsetptr = newitemoff; + return newitemoff; +} + +/* + * If we will need to split the page to put the item on this page, check + * whether we can put the tuple somewhere to the right, instead. Keep + * scanning right until we (a) find a page with enough free space, (b) reach + * the last page where the tuple can legally go, or (c) get tired of + * searching. (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early pages + * than to scan all the way to the end of the run of equal keys on every + * insert. 
We implement "get tired" as a random choice, since stopping after + * scanning a fixed number of pages wouldn't work well (we'd never reach the + * right-hand side of previously split pages). Currently the probability of + * moving right is set at 0.99, which may seem too high to change the behavior + * much, but it does an excellent job of preventing O(N^2) behavior with many + * equal keys. + * + * Returns value indicating if caller should insert on candidate leaf page. + */ +static bool +_bt_findinsertrandom(Relation rel, Relation heapRel, Buffer buf, + bool *restorebinsrch, Size itemsz) +{ + Page page = BufferGetPage(buf); + BTPageOpaque lpageop; + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(lpageop)); + + if (PageGetFreeSpace(page) >= itemsz) + return true; + + /* + * before considering moving right, see if we can obtain enough space + * by erasing LP_DEAD items + */ + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, buf, heapRel); + + *restorebinsrch = false; + if (PageGetFreeSpace(page) >= itemsz) + return true; /* OK, now we have enough space */ + } + + /* "Get tired" at random */ + return random() <= (MAX_RANDOM_VALUE / 100); } /*---------- @@ -833,6 +894,8 @@ _bt_insertonpg(Relation rel, BTPageOpaque lpageop; OffsetNumber firstright = InvalidOffsetNumber; Size itemsz; + int indnatts = IndexRelationGetNumberOfAttributes(rel); + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -840,12 +903,9 @@ _bt_insertonpg(Relation rel, /* child buffer must be given iff inserting on an internal page */ Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); /* tuple must have appropriate number of attributes */ - Assert(!P_ISLEAF(lpageop) || - BTreeTupleGetNAtts(itup, rel) == - IndexRelationGetNumberOfAttributes(rel)); - Assert(P_ISLEAF(lpageop) || - BTreeTupleGetNAtts(itup, rel) == - IndexRelationGetNumberOfKeyAttributes(rel)); + 
Assert(BTreeTupleGetNAtts(itup, rel) > 0); + Assert(!P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) == indnatts); + Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= indnkeyatts); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -867,6 +927,7 @@ _bt_insertonpg(Relation rel, { bool is_root = P_ISROOT(lpageop); bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); + bool truncate; bool newitemonleft; Buffer rbuf; @@ -893,9 +954,16 @@ _bt_insertonpg(Relation rel, newitemoff, itemsz, &newitemonleft); + /* + * Perform truncation of the new high key for the left half of the + * split when splitting a leaf page. Don't do so with version 3 + * indexes unless the index has non-key attributes. + */ + truncate = P_ISLEAF(lpageop) && + (_bt_hasuniquekeys(rel) || indnatts != indnkeyatts); /* split the buffer into left and right halves */ rbuf = _bt_split(rel, buf, cbuf, firstright, - newitemoff, itemsz, itup, newitemonleft); + newitemoff, itemsz, itup, newitemonleft, truncate); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -977,7 +1045,7 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(metabuf)) { /* upgrade meta-page if needed */ - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = itup_blkno; metad->btm_fastlevel = lpageop->btpo.level; @@ -1032,6 +1100,9 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(metabuf)) { + Assert(metad->btm_version == BTREE_META_VERSION || + metad->btm_version == BTREE_VERSION); + xlmeta.version = metad->btm_version; xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; @@ -1097,7 +1168,10 @@ _bt_insertonpg(Relation rel, * On entry, buf is the page to split, and is pinned and write-locked. * firstright is the item index of the first item to be moved to the * new right page. newitemoff etc. 
tell us about the new item that - * must be inserted along with the data from the old page. + * must be inserted along with the data from the old page. truncate + * tells us if the new high key should undergo suffix truncation. + * (Version 4 pivot tuples always have an explicit representation of + * the number of non-truncated attributes that remain.) * * When splitting a non-leaf page, 'cbuf' is the left-sibling of the * page we're inserting the downlink for. This function will clear the @@ -1109,7 +1183,7 @@ _bt_insertonpg(Relation rel, static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, - bool newitemonleft) + bool newitemonleft, bool truncate) { Buffer rbuf; Page origpage; @@ -1132,8 +1206,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber i; bool isleaf; IndexTuple lefthikey; - int indnatts = IndexRelationGetNumberOfAttributes(rel); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); /* Acquire a new page to split into */ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -1203,7 +1275,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); - Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts); + Assert(BTreeTupleGetNAtts(item, rel) > 0); + Assert(BTreeTupleGetNAtts(item, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, false, false) == InvalidOffsetNumber) { @@ -1217,8 +1291,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, /* * The "high key" for the new left page will be the first key that's going - * to go into the new right page. This might be either the existing data - * item at position firstright, or the incoming tuple. 
+ * to go into the new right page, or possibly a truncated version if this + * is a leaf page split. This might be either the existing data item at + * position firstright, or the incoming tuple. */ leftoff = P_HIKEY; if (!newitemonleft && newitemoff == firstright) @@ -1236,25 +1311,58 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, } /* - * Truncate non-key (INCLUDE) attributes of the high key item before - * inserting it on the left page. This only needs to happen at the leaf + * Truncate nondistinguishing key attributes of the high key item before + * inserting it on the left page. This can only happen at the leaf * level, since in general all pivot tuple values originate from leaf * level high keys. This isn't just about avoiding unnecessary work, - * though; truncating unneeded key attributes (more aggressive suffix - * truncation) can only be performed at the leaf level anyway. This is - * because a pivot tuple in a grandparent page must guide a search not - * only to the correct parent page, but also to the correct leaf page. + * though; truncating unneeded key suffix attributes can only be + * performed at the leaf level anyway. This is because a pivot tuple in + * a grandparent page must guide a search not only to the correct parent + * page, but also to the correct leaf page. */ - if (indnatts != indnkeyatts && isleaf) + if (truncate) { - lefthikey = _bt_nonkey_truncate(rel, item); + OffsetNumber lastleftoff; + IndexTuple lastleft; + + /* + * Determine which tuple will become the last on the left page. The + * last left tuple and the first right tuple enclose the split point, + * and are needed to determine how far truncation can go while still + * leaving us with a high key that distinguishes the left side from + * the right side. 
+ */ + Assert(isleaf); + if (newitemonleft && newitemoff == firstright) + { + /* incoming tuple will become last on left page */ + lastleft = newitem; + } + else + { + /* item just before firstright will become last on left page */ + lastleftoff = OffsetNumberPrev(firstright); + itemid = PageGetItemId(origpage, lastleftoff); + lastleft = (IndexTuple) PageGetItem(origpage, itemid); + } + + /* + * Truncate first item on the right side to create a new high key for + * the left side. The high key must be strictly less than all tuples + * on the right side of the split, but can be equal to the last item + * on the left side of the split. + */ + Assert(lastleft != item); + lefthikey = _bt_truncate(rel, lastleft, item, false); itemsz = IndexTupleSize(lefthikey); itemsz = MAXALIGN(itemsz); } else lefthikey = item; - Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts); + Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); + Assert(BTreeTupleGetNAtts(lefthikey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, false, false) == InvalidOffsetNumber) { @@ -1447,7 +1555,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; - bool loglhikey = false; xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; @@ -1476,22 +1583,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, if (newitemonleft) XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); - /* Log left page */ - if (!isleaf || indnatts != indnkeyatts) - { - /* - * We must also log the left page's high key. There are two - * reasons for that: right page's leftmost key is suppressed on - * non-leaf levels and in covering indexes included columns are - * truncated from high keys. Show it as belonging to the left - * page buffer, so that it is not stored if XLogInsert decides it - * needs a full-page image of the left page. 
- */ - itemid = PageGetItemId(origpage, P_HIKEY); - item = (IndexTuple) PageGetItem(origpage, itemid); - XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); - loglhikey = true; - } + /* Log left page. We must also log the left page's high key. */ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); /* * Log the contents of the right page in the format understood by @@ -1509,9 +1604,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, (char *) rightpage + ((PageHeader) rightpage)->pd_upper, ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); - xlinfo = newitemonleft ? - (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) : - (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R); + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; recptr = XLogInsert(RM_BTREE_ID, xlinfo); PageSetLSN(origpage, recptr); @@ -2164,7 +2257,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) START_CRIT_SECTION(); /* upgrade metapage if needed */ - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) _bt_upgrademetapage(metapg); /* set btree special data */ @@ -2199,7 +2292,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) /* * insert the right page pointer into the new root page. 
*/ - Assert(BTreeTupleGetNAtts(right_item, rel) == + Assert(BTreeTupleGetNAtts(right_item, rel) > 0); + Assert(BTreeTupleGetNAtts(right_item, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber) @@ -2232,6 +2326,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + Assert(metad->btm_version == BTREE_META_VERSION || + metad->btm_version == BTREE_VERSION); + md.version = metad->btm_version; md.root = rootblknum; md.level = metad->btm_level; md.fastroot = rootblknum; @@ -2296,6 +2393,7 @@ _bt_pgaddtup(Page page, { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); + /* Deliberately zero INDEX_ALT_TID_MASK bits */ BTreeTupleSetNAtts(&trunctuple, 0); itup = &trunctuple; itemsize = sizeof(IndexTupleData); @@ -2311,28 +2409,25 @@ _bt_pgaddtup(Page page, /* * _bt_isequal - used in _bt_doinsert in check for duplicates. * - * This is very similar to _bt_compare, except for NULL handling. - * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. + * This is very similar to _bt_compare, except for NULL and negative infinity + * handling. Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. 
*/ static bool -_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, - int keysz, ScanKey scankey) +_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_scankey, Page page, + OffsetNumber offnum) { IndexTuple itup; + ScanKey scankey; int i; - /* Better be comparing to a leaf item */ + /* Better be comparing to a non-pivot item */ Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page))); + scankey = itup_scankey->scankeys; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - /* - * It's okay that we might perform a comparison against a truncated page - * high key when caller needs to determine if _bt_check_unique scan must - * continue on to the next page. Caller never asks us to compare non-key - * attributes within an INCLUDE index. - */ - for (i = 1; i <= keysz; i++) + for (i = 1; i <= itup_scankey->keysz; i++) { AttrNumber attno; Datum datum; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 4082103fe2..716f7c1f40 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -34,6 +34,7 @@ #include "utils/snapmgr.h" static void _bt_cachemetadata(Relation rel, BTMetaPageData *metad); +static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty); @@ -77,7 +78,9 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) } /* - * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new. + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version + * 3, the last version that can be updated without broadly affecting on-disk + * compatibility. (A REINDEX is required to upgrade to version 4.) * * This routine does purely in-memory image upgrade. 
Caller is * responsible for locking, WAL-logging etc. @@ -93,11 +96,11 @@ _bt_upgrademetapage(Page page) /* It must be really a meta page of upgradable version */ Assert(metaopaque->btpo_flags & BTP_META); - Assert(metad->btm_version < BTREE_VERSION); + Assert(metad->btm_version < BTREE_META_VERSION); Assert(metad->btm_version >= BTREE_MIN_VERSION); /* Set version number and fill extra fields added into version 3 */ - metad->btm_version = BTREE_VERSION; + metad->btm_version = BTREE_META_VERSION; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; @@ -107,43 +110,79 @@ _bt_upgrademetapage(Page page) } /* - * Cache metadata from meta page to rel->rd_amcache. + * Cache metadata from input meta page to rel->rd_amcache. */ static void -_bt_cachemetadata(Relation rel, BTMetaPageData *metad) +_bt_cachemetadata(Relation rel, BTMetaPageData *input) { + BTMetaPageData *cached_metad; + /* We assume rel->rd_amcache was already freed by caller */ Assert(rel->rd_amcache == NULL); rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, sizeof(BTMetaPageData)); - /* - * Meta page should be of supported version (should be already checked by - * caller). 
- */ - Assert(metad->btm_version >= BTREE_MIN_VERSION && - metad->btm_version <= BTREE_VERSION); + /* Meta page should be of supported version */ + Assert(input->btm_version >= BTREE_MIN_VERSION && + input->btm_version <= BTREE_VERSION); - if (metad->btm_version == BTREE_VERSION) + cached_metad = (BTMetaPageData *) rel->rd_amcache; + if (input->btm_version >= BTREE_META_VERSION) { - /* Last version of meta-data, no need to upgrade */ - memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + /* Version with compatible meta-data, no need to upgrade */ + memcpy(cached_metad, input, sizeof(BTMetaPageData)); } else { - BTMetaPageData *cached_metad = (BTMetaPageData *) rel->rd_amcache; - /* * Upgrade meta-data: copy available information from meta-page and * fill new fields with default values. + * + * Note that we cannot upgrade to BTREE_VERSION/version 4 without a + * REINDEX, since extensive on-disk changes are required. */ - memcpy(rel->rd_amcache, metad, offsetof(BTMetaPageData, btm_oldest_btpo_xact)); - cached_metad->btm_version = BTREE_VERSION; + memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact)); + cached_metad->btm_version = BTREE_META_VERSION; cached_metad->btm_oldest_btpo_xact = InvalidTransactionId; cached_metad->btm_last_cleanup_num_heap_tuples = -1.0; } } +/* + * Get metadata from share-locked buffer containing metapage, while performing + * standard sanity checks. Sanity checks here must match _bt_getroot(). 
+ */ +static BTMetaPageData * +_bt_getmeta(Relation rel, Buffer metabuf) +{ + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* sanity-check the metapage */ + if (!P_ISMETA(metaopaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a btree", + RelationGetRelationName(rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + + return metad; +} + /* * _bt_update_meta_cleanup_info() -- Update cleanup-related information in * the metapage. @@ -186,7 +225,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, START_CRIT_SECTION(); /* upgrade meta-page if needed */ - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) _bt_upgrademetapage(metapg); /* update cleanup-related information */ @@ -202,6 +241,9 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, XLogBeginInsert(); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + Assert(metad->btm_version == BTREE_META_VERSION || + metad->btm_version == BTREE_VERSION); + md.version = metad->btm_version; md.root = metad->btm_root; md.level = metad->btm_level; md.fastroot = metad->btm_fastroot; @@ -376,7 +418,7 @@ _bt_getroot(Relation rel, int access) START_CRIT_SECTION(); /* upgrade metapage if needed */ - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) _bt_upgrademetapage(metapg); metad->btm_root = rootblkno; @@ -400,6 +442,9 @@ _bt_getroot(Relation rel, int access) 
XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + Assert(metad->btm_version == BTREE_META_VERSION || + metad->btm_version == BTREE_VERSION); + md.version = metad->btm_version; md.root = rootblkno; md.level = 0; md.fastroot = rootblkno; @@ -595,37 +640,12 @@ _bt_getrootheight(Relation rel) { BTMetaPageData *metad; - /* - * We can get what we need from the cached metapage data. If it's not - * cached yet, load it. Sanity checks here must match _bt_getroot(). - */ if (rel->rd_amcache == NULL) { Buffer metabuf; - Page metapg; - BTPageOpaque metaopaque; metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); - metapg = BufferGetPage(metabuf); - metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); - metad = BTPageGetMeta(metapg); - - /* sanity-check the metapage */ - if (!P_ISMETA(metaopaque) || - metad->btm_magic != BTREE_MAGIC) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" is not a btree", - RelationGetRelationName(rel)))); - - if (metad->btm_version < BTREE_MIN_VERSION || - metad->btm_version > BTREE_VERSION) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, " - "current version %d, minimal supported version %d", - RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + metad = _bt_getmeta(rel, metabuf); /* * If there's no root page yet, _bt_getroot() doesn't expect a cache @@ -642,19 +662,70 @@ _bt_getrootheight(Relation rel) * Cache the metapage data for next time */ _bt_cachemetadata(rel, metad); - + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_META_VERSION); + Assert(metad->btm_fastroot != P_NONE); _bt_relbuf(rel, metabuf); } + /* Get cached page */ metad = (BTMetaPageData *) rel->rd_amcache; - /* We shouldn't have cached it if any of these fail */ - Assert(metad->btm_magic == 
BTREE_MAGIC); - Assert(metad->btm_version == BTREE_VERSION); - Assert(metad->btm_fastroot != P_NONE); return metad->btm_fastlevel; } +/* + * _bt_hasuniquekeys() -- Determine if heap TID should be treated as a key. + * + * This is used to determine the rules that must be used to descend a + * btree. Version 4 indexes treat heap TID as a tie-breaker attribute. + * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable + * performance when inserting a new BTScanInsert-wise duplicate tuple + * among many leaf pages already full of such duplicates. + */ +bool +_bt_hasuniquekeys(Relation rel) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here. (XXX perhaps _bt_getroot() should + * be changed to allow this case.) + */ + if (metad->btm_root == P_NONE) + { + uint32 btm_version = metad->btm_version; + + _bt_relbuf(rel, metabuf); + return btm_version > BTREE_META_VERSION; + } + + /* + * Cache the metapage data for next time + */ + _bt_cachemetadata(rel, metad); + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_META_VERSION); + Assert(metad->btm_fastroot != P_NONE); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + + return metad->btm_version > BTREE_META_VERSION; +} + /* * _bt_checkpage() -- Verify that a freshly-read page looks sane. 
*/ @@ -1370,7 +1441,7 @@ _bt_pagedel(Relation rel, Buffer buf) */ if (!stack) { - ScanKey itup_scankey; + BTScanInsert itup_scankey; ItemId itemid; IndexTuple targetkey; Buffer lbuf; @@ -1420,12 +1491,19 @@ _bt_pagedel(Relation rel, Buffer buf) } /* we need an insertion scan key for the search, so build one */ - itup_scankey = _bt_mkscankey(rel, targetkey); - /* find the leftmost leaf page containing this key */ - stack = _bt_search(rel, - IndexRelationGetNumberOfKeyAttributes(rel), - itup_scankey, false, &lbuf, BT_READ, NULL); - /* don't need a pin on the page */ + itup_scankey = _bt_mkscankey(rel, targetkey, false); + /* get stack to leaf page by searching index */ + stack = _bt_search(rel, itup_scankey, &lbuf, BT_READ, NULL); + + /* + * Prior to version 4, search is for the leftmost leaf page + * containing this key, which is okay because we have to match + * on block number to deal with concurrent splits anyway. + * Otherwise, search will reliably relocate same leaf page. + */ + Assert(!itup_scankey->uniquekeys || + BufferGetBlockNumber(buf) == BufferGetBlockNumber(lbuf)); + /* don't need a lock or second pin on the page */ _bt_relbuf(rel, lbuf); /* @@ -1970,7 +2048,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) if (BufferIsValid(metabuf)) { /* upgrade metapage if needed */ - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; @@ -2018,6 +2096,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) { XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + Assert(metad->btm_version == BTREE_META_VERSION || + metad->btm_version == BTREE_VERSION); + xlmeta.version = metad->btm_version; xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; diff --git a/src/backend/access/nbtree/nbtree.c 
b/src/backend/access/nbtree/nbtree.c index e8725fbbe1..9cf760ffa0 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -794,7 +794,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); - if (metad->btm_version < BTREE_VERSION) + if (metad->btm_version < BTREE_META_VERSION) { /* * Do cleanup if metapage needs upgrade, because we don't have diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 16223d01ec..7305e647b2 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,6 +25,10 @@ #include "utils/tqual.h" +static inline int32 _bt_nonpivot_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, @@ -38,6 +42,7 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +static ItemPointerData lowest; /* * _bt_drop_lock_and_maybe_pin() @@ -72,12 +77,9 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. - * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. + * The passed scankey is an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. If key was built + * using a leaf high key, leaf page will be relocated. 
* * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. @@ -94,8 +96,8 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * during the search will be finished. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access, Snapshot snapshot) +_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, + Snapshot snapshot) { BTStack stack_in = NULL; int page_access = BT_READ; @@ -131,7 +133,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * if the leaf page is split and we insert to the parent page). But * this is a good opportunity to finish splits of internal pages too. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, + *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, page_access, snapshot); @@ -145,7 +147,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * Find the appropriate item on the internal page, and get the child * page that it points to. */ - offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); + offnum = _bt_binsrch(rel, key, *bufP); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); blkno = BTreeInnerTupleGetDownLink(itup); @@ -158,8 +160,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * downlink (block) to uniquely identify the index entry, in case it * moves right while we're working lower in the tree. See the paper * by Lehman and Yao for how this is detected and handled. (We use the - * child link to disambiguate duplicate keys in the index -- Lehman - * and Yao disallow duplicate keys.) + * child link to disambiguate duplicate keys in the index, which is + * faster than comparing the keys themselves.) 
*/ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; @@ -199,8 +201,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * need to move right in the tree. See Lehman and Yao for an * excruciatingly precise description. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, - true, stack_in, BT_WRITE, snapshot); + *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, + snapshot); } return stack_in; @@ -216,16 +218,16 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * or strictly to the right of it. * * This routine decides whether or not we need to move right in the - * tree by examining the high key entry on the page. If that entry - * is strictly less than the scankey, or <= the scankey in the nextkey=true + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the key.nextkey=true * case, then we followed the wrong link and we need to move right. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. * * If forupdate is true, we will attempt to finish any incomplete splits * that we encounter. 
This is required when locking a target page for an @@ -242,10 +244,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, */ Buffer _bt_moveright(Relation rel, + BTScanInsert key, Buffer buf, - int keysz, - ScanKey scankey, - bool nextkey, bool forupdate, BTStack stack, int access, @@ -270,7 +270,7 @@ _bt_moveright(Relation rel, * We also have to move right if we followed a link that brought us to a * dead page. */ - cmpval = nextkey ? 0 : 1; + cmpval = key->nextkey ? 0 : 1; for (;;) { @@ -284,7 +284,7 @@ _bt_moveright(Relation rel, /* * Finish any incomplete splits we encounter along the way. */ - if (forupdate && P_INCOMPLETE_SPLIT(opaque)) + if (unlikely(forupdate && P_INCOMPLETE_SPLIT(opaque))) { BlockNumber blkno = BufferGetBlockNumber(buf); @@ -305,7 +305,8 @@ _bt_moveright(Relation rel, continue; } - if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) + if (unlikely(P_IGNORE(opaque) || + _bt_compare(rel, key, page, P_HIKEY) >= cmpval)) { /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); @@ -328,10 +329,6 @@ _bt_moveright(Relation rel, * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. - * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the @@ -347,37 +344,76 @@ _bt_moveright(Relation rel, * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects - * on the buffer. + * on the buffer. 
When itup_scankey.savebinsrch is set, modifies + * mutable fields of insertion scan key, so that a subsequent call where + * caller sets itup_scankey.savebinsrch can reuse the low and high bound + * of original binary search. This makes the second binary search + * performed on the first leaf page landed on by inserters that do + * unique enforcement avoid doing any real comparisons in most cases. + * See _bt_findinsertloc() for further details. */ OffsetNumber _bt_binsrch(Relation rel, - Buffer buf, - int keysz, - ScanKey scankey, - bool nextkey) + BTScanInsert key, + Buffer buf) { Page page; BTPageOpaque opaque; OffsetNumber low, - high; + high, + savehigh; int32 result, cmpval; + bool isleaf; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + isleaf = P_ISLEAF(opaque); - low = P_FIRSTDATAKEY(opaque); - high = PageGetMaxOffsetNumber(page); + Assert(!(key->restorebinsrch && key->savebinsrch)); + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* Restore binary search state when scantid is available */ + Assert(!key->savebinsrch || key->scantid == NULL); + Assert(!key->uniquekeys || !key->restorebinsrch || key->scantid != NULL); + Assert(P_ISLEAF(opaque) || (!key->restorebinsrch && !key->savebinsrch)); - /* - * If there are no keys on the page, return the first available slot. Note - * this covers two cases: the page is really empty (no keys), or it - * contains only a high key. The latter case is possible after vacuuming. - * This can never happen on an internal page, however, since they are - * never empty (an internal page must have children). - */ - if (high < low) - return low; + if (!key->restorebinsrch) + { + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + /* + * If there are no keys on the page, return the first available + * slot. 
Note this covers two cases: the page is really empty (no + * keys), or it contains only a high key. The latter case is + * possible after vacuuming. This can never happen on an internal + * page, however, since they are never empty (an internal page must + * have children). + */ + if (unlikely(high < low)) + { + if (key->savebinsrch) + { + key->low = low; + key->high = high; + key->savebinsrch = false; + } + return low; + } + high++; /* establish the loop invariant for high */ + } + else + { + /* Restore result of previous binary search against same page */ + low = key->low; + high = key->high; + key->restorebinsrch = false; + + /* Return the first slot, in line with original binary search */ + if (unlikely(high < low)) + return low; + } /* * Binary search to find the first key on the page >= scan key, or first @@ -391,22 +427,40 @@ _bt_binsrch(Relation rel, * * We can fall out when high == low. */ - high++; /* establish the loop invariant for high */ - - cmpval = nextkey ? 0 : 1; /* select comparison value */ + cmpval = key->nextkey ? 0 : 1; /* select comparison value */ + savehigh = high; while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, keysz, scankey, page, mid); + if (!isleaf) + result = _bt_compare(rel, key, page, mid); + else + result = _bt_nonpivot_compare(rel, key, page, mid); if (result >= cmpval) low = mid + 1; else + { high = mid; + + /* + * high can only be reused by more restrictive binary search when + * it's known to be strictly greater than the original scankey + */ + if (result != 0) + savehigh = high; + } + } + + if (key->savebinsrch) + { + key->low = low; + key->high = savehigh; + key->savebinsrch = false; } /* @@ -421,7 +475,8 @@ _bt_binsrch(Relation rel, /* * On a non-leaf page, return the last key < scan key (resp. <= scan key). - * There must be one if _bt_compare() is playing by the rules. 
+ * There must be one if _bt_compare()/_bt_tuple_compare() is playing by + * the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); @@ -431,21 +486,11 @@ _bt_binsrch(Relation rel, /*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * - * The passed scankey must be an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. + * Convenience wrapper for _bt_tuple_compare() callers that want to compare + * an offset on a particular page. * - * keysz: number of key conditions to be checked (might be less than the - * number of index columns!) * page/offnum: location of btree item to be compared to. * - * This routine returns: - * <0 if scankey < tuple at offnum; - * 0 if scankey == tuple at offnum; - * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! - * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. The actual key value stored (if any, which there probably isn't) @@ -456,26 +501,82 @@ _bt_binsrch(Relation rel, */ int32 _bt_compare(Relation rel, - int keysz, - ScanKey scankey, + BTScanInsert key, Page page, OffsetNumber offnum) { - TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; - int i; + int ntupatts; Assert(_bt_check_natts(rel, page, offnum)); /* * Force result ">" if target item is first data item on an internal page * --- see NOTE above. + * + * A minus infinity key has all attributes truncated away, so this test is + * redundant with the minus infinity attribute tie-breaker. However, the + * number of attributes in minus infinity tuples was not explicitly + * represented as 0 until PostgreSQL v11, so an explicit offnum test is + * still required. 
*/ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + ntupatts = BTreeTupleGetNAtts(itup, rel); + return _bt_tuple_compare(rel, key, itup, ntupatts); +} + +/* + * Optimized version of _bt_compare(). Only works on non-pivot tuples. + */ +static inline int32 +_bt_nonpivot_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum) +{ + IndexTuple itup; + + Assert(_bt_check_natts(rel, page, offnum)); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + Assert(BTreeTupleGetNAtts(itup, rel) == + IndexRelationGetNumberOfAttributes(rel)); + return _bt_tuple_compare(rel, key, itup, key->keysz); +} + +/*---------- + * _bt_tuple_compare() -- Compare scankey to a particular tuple. + * + * The passed scankey must be an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * This routine returns: + * <0 if scankey < tuple; + * 0 if scankey == tuple; + * >0 if scankey > tuple. + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be + * returned to the caller as a matching key! + *---------- + */ +int32 +_bt_tuple_compare(Relation rel, + BTScanInsert key, + IndexTuple itup, + int ntupatts) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + ItemPointer heapTid; + int ncmpkey; + int i; + ScanKey scankey; + + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(key->uniquekeys || key->scantid == NULL); /* * The scan key is set up with the attribute number associated with each @@ -489,7 +590,9 @@ _bt_compare(Relation rel, * _bt_first). 
*/ - for (i = 1; i <= keysz; i++) + ncmpkey = Min(ntupatts, key->keysz); + scankey = key->scankeys; + for (i = 1; i <= ncmpkey; i++) { Datum datum; bool isNull; @@ -540,8 +643,83 @@ _bt_compare(Relation rel, scankey++; } - /* if we get here, the keys are equal */ - return 0; + /* + * Use the number of attributes as a tie-breaker, in order to treat + * truncated attributes in index as minus infinity. + */ + if (key->keysz > ntupatts) + return 1; + + /* If caller provided no heap TID tie-breaker for scan, they're equal */ + if (key->scantid == NULL) + return 0; + + /* + * Although it isn't counted as an attribute by BTreeTupleGetNAtts(), heap + * TID is an implicit final key attribute that ensures that all index + * tuples have a distinct set of key attribute values. + * + * This is often truncated away in pivot tuples, which makes the attribute + * value implicitly negative infinity. + */ + heapTid = BTreeTupleGetHeapTID(itup); + if (heapTid == NULL) + return 1; + + return ItemPointerCompare(key->scantid, heapTid); +} + +/* + * _bt_lowest_scantid() -- Manufacture low heap TID. + * + * Create a heap TID that's strictly less than any possible real heap + * TID to _bt_tuple_compare. This is still treated as greater than + * minus infinity. The overall effect is that _bt_search follows + * downlinks with scankey equal non-TID attribute(s), but a + * truncated-away TID attribute, as the scankey is greater than the + * downlink/pivot tuple as a whole. (Obviously this can only be of use + * when a scankey has values for all key attributes other than the heap + * TID tie-breaker attribute/scantid.) + * + * If we didn't do this then affected index scans would have to + * unnecessarily visit an extra page before moving right to the page they + * should have landed on from the parent in the first place. 
There would + * even be a useless binary search on the left/first page, since a high key + * check won't have the search move right immediately (the high key will be + * identical to the downlink we should have followed in the parent, barring + * a concurrent page split). + * + * This is particularly important with unique index insertions, since "the + * first page the value could be on" has an exclusive buffer lock held while + * a subsequent page (usually the actual first page the value could be on) + * has a shared buffer lock held. (There may also be heap buffer locks + * acquired during this process.) + * + * Note that implementing this by adding hard-coding to _bt_compare is + * unworkable, since that would break nextkey semantics in the common case + * where all non-TID key attributes have been provided. + */ +ItemPointer +_bt_lowest_scantid(void) +{ + static ItemPointer low = NULL; + + /* + * A heap TID that's less than or equal to any possible real heap TID + * would also work. Generating an impossibly-low TID value seems + * slightly simpler. + */ + if (!low) + { + low = &lowest; + + /* Lowest possible block is 0 */ + ItemPointerSetBlockNumber(low, 0); + /* InvalidOffsetNumber less than any real offset */ + ItemPointerSetOffsetNumber(low, InvalidOffsetNumber); + } + + return low; } /* @@ -575,8 +753,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat; bool nextkey; bool goback; + BTScanInsertData key; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData scankeys[INDEX_MAX_KEYS]; + ScanKey scankeys; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; @@ -822,10 +1001,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * We want to start the scan somewhere within the index. Set up an * insertion scankey we can use to search for the boundary point we - * identified above. The insertion scankey is built in the local - * scankeys[] array, using the keys identified by startKeys[]. + * identified above. 
The insertion scankey is built in the scankeys[] + * array, using the keys identified by startKeys[]. */ Assert(keysCount <= INDEX_MAX_KEYS); + scankeys = key.scankeys; + for (i = 0; i < keysCount; i++) { ScanKey cur = startKeys[i]; @@ -1053,12 +1234,38 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return false; } + /* + * Initialize insertion scankey. + * + * Manufacture sentinel scan tid that's less than any possible heap TID + * in the index when that might allow us to avoid unnecessary moves + * right while descending the tree. + * + * Never do this for any nextkey case, since that would make + * _bt_search() incorrectly land on the leaf page with the second + * user-attribute-wise duplicate tuple, rather than landing on the leaf + * page with the next user-attribute-distinct key > scankey, which is + * the intended behavior. We could invent a _bt_highest_scantid() to + * use in nextkey cases, but that would never actually save any cycles + * during the descent of the tree; "_bt_binsrch() + nextkey = true" + * already behaves as if all tuples <= scankey (in terms of the + * attributes/keys actually supplied in the scankey) are < scankey. + */ + key.uniquekeys = _bt_hasuniquekeys(rel); + key.savebinsrch = key.restorebinsrch = false; + key.low = key.high = 0; + key.nextkey = nextkey; + key.keysz = keysCount; + key.scantid = NULL; + if (key.keysz >= IndexRelationGetNumberOfKeyAttributes(rel) && + !key.nextkey && key.uniquekeys) + key.scantid = _bt_lowest_scantid(); + /* * Use the manufactured insertion scan key to descend the tree and * position ourselves on the target leaf page. */ - stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ, - scan->xs_snapshot); + stack = _bt_search(rel, &key, &buf, BT_READ, scan->xs_snapshot); /* don't need to keep the stack around... 
*/ _bt_freestack(stack); @@ -1087,7 +1294,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); + offnum = _bt_binsrch(rel, &key, buf); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 16f5755777..3922d9252b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -743,6 +743,7 @@ _bt_sortaddtup(Page page, { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); + /* Deliberately zero INDEX_ALT_TID_MASK bits */ BTreeTupleSetNAtts(&trunctuple, 0); itup = &trunctuple; itemsize = sizeof(IndexTupleData); @@ -796,8 +797,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) OffsetNumber last_off; Size pgspc; Size itupsz; - int indnatts = IndexRelationGetNumberOfAttributes(wstate->index); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index); /* * This is a handy place to check for cancel interrupts during the btree @@ -813,28 +812,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) itupsz = IndexTupleSize(itup); itupsz = MAXALIGN(itupsz); - /* - * Check whether the item can fit on a btree page at all. (Eventually, we - * ought to try to apply TOAST methods if not.) We actually need to be - * able to fit three items on every page, so restrict any one item to 1/3 - * the per-page available space. Note that at this point, itupsz doesn't - * include the ItemId. - * - * NOTE: similar code appears in _bt_insertonpg() to defend against - * oversize items being inserted into an already-existing index. But - * during creation of an index, we don't go through there. 
- */ if (itupsz > BTMaxItemSize(npage)) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", - itupsz, BTMaxItemSize(npage), - RelationGetRelationName(wstate->index)), - errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" - "Consider a function index of an MD5 hash of the value, " - "or use full text indexing."), - errtableconstraint(wstate->heap, - RelationGetRelationName(wstate->index)))); + _bt_check_third_page(wstate->index, wstate->heap, npage, itup); /* * Check to see if page is "full". It's definitely full if the item won't @@ -880,19 +859,30 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - if (indnkeyatts != indnatts && P_ISLEAF(opageop)) + if (P_ISLEAF(opageop)) { + IndexTuple lastleft; IndexTuple truncated; Size truncsz; /* - * Truncate any non-key attributes from high key on leaf level - * (i.e. truncate on leaf level if we're building an INCLUDE - * index). This is only done at the leaf level because downlinks + * Truncate away any unneeded attributes from high key on leaf + * level. This is only done at the leaf level because downlinks * in internal pages are either negative infinity items, or get * their contents from copying from one level down. See also: * _bt_split(). * + * We don't try to bias our choice of split point to make it + * more likely that _bt_truncate() can truncate away more + * attributes, whereas the split point passed by _bt_split() is + * chosen much more delicately. Suffix truncation is mostly + * useful because it can greatly improve space utilization for + * workloads with random insertions, or insertions of + * monotonically increasing values at "local" points in the key + * space. It doesn't seem worthwhile to add complex logic for + * choosing a split point here for a benefit that is bound to be + * much smaller. 
+ * * Since the truncated tuple is probably smaller than the * original, it cannot just be copied in place (besides, we want * to actually save space on the leaf page). We delete the @@ -905,7 +895,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the latter portion of the space occupied by the original tuple. * This is fairly cheap. */ - truncated = _bt_nonkey_truncate(wstate->index, oitup); + ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); + lastleft = (IndexTuple) PageGetItem(opage, ii); + + truncated = _bt_truncate(wstate->index, lastleft, oitup, true); truncsz = IndexTupleSize(truncated); PageIndexTupleDelete(opage, P_HIKEY); _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY); @@ -924,8 +917,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); - Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == - IndexRelationGetNumberOfKeyAttributes(wstate->index) || + Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) || P_LEFTMOST(opageop)); Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 || !P_LEFTMOST(opageop)); @@ -970,7 +964,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the first item for a page is copied from the prior page in the code * above. Since the minimum key for an entire level is only used as a * minus infinity downlink, and never as a high key, there is no need to - * truncate away non-key attributes at this point. + * truncate away suffix attributes at this point. 
*/ if (last_off == P_HIKEY) { @@ -1029,8 +1023,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) } else { - Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == - IndexRelationGetNumberOfKeyAttributes(wstate->index) || + Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) || P_LEFTMOST(opaque)); Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 || !P_LEFTMOST(opaque)); @@ -1115,7 +1110,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); } - _bt_freeskey(indexScanKey); + pfree(indexScanKey); for (;;) { @@ -1127,6 +1122,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } else if (itup != NULL) { + int32 compare = 0; + for (i = 1; i <= keysz; i++) { SortSupport entry; @@ -1134,7 +1131,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) attrDatum2; bool isNull1, isNull2; - int32 compare; entry = sortKeys + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); @@ -1151,6 +1147,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) else if (compare < 0) break; } + + /* + * If key values are equal, we sort on ItemPointer. This is + * required for btree indexes, since heap TID is treated as an + * implicit last key attribute in order to ensure that all + * keys in the index are physically unique. 
+ */ + if (compare == 0) + { + compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); + Assert(compare != 0); + if (compare > 0) + load1 = false; + } } else load1 = false; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 205457ef99..83298ff257 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -49,6 +49,8 @@ static void _bt_mark_scankey_required(ScanKey skey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); +static int _bt_leave_natts(Relation rel, IndexTuple lastleft, + IndexTuple firstright, bool build); /* @@ -56,34 +58,56 @@ static bool _bt_check_rowcompare(ScanKey skey, * Build an insertion scan key that contains comparison data from itup * as well as comparator routines appropriate to the key datatypes. * + * When itup is a non-pivot tuple, the returned insertion scan key is + * suitable for finding a place for it to go on the leaf level. When + * itup is a pivot tuple, the returned insertion scankey is suitable + * for locating the leaf page with the pivot as its high key (there + * must have been one at some point if the pivot tuple actually came + * from the tree, barring the minus infinity special case). + * + * Note that we may occasionally have to share lock the metapage, in + * order to determine whether or not the keys in the index are expected + * to be unique (i.e. whether or not heap TID is treated as a tie-breaker + * attribute). Callers that cannot tolerate this can request that we + * assume that all entries in the index are unique. + * * The result is intended for use with _bt_compare(). 
*/ -ScanKey -_bt_mkscankey(Relation rel, IndexTuple itup) +BTScanInsert +_bt_mkscankey(Relation rel, IndexTuple itup, bool assumeunique) { + BTScanInsert res; ScanKey skey; TupleDesc itupdesc; + int tupnatts; int indnatts PG_USED_FOR_ASSERTS_ONLY; int indnkeyatts; int16 *indoption; int i; itupdesc = RelationGetDescr(rel); + tupnatts = BTreeTupleGetNAtts(itup, rel); indnatts = IndexRelationGetNumberOfAttributes(rel); indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); indoption = rel->rd_indoption; - Assert(indnkeyatts > 0); - Assert(indnkeyatts <= indnatts); - Assert(BTreeTupleGetNAtts(itup, rel) == indnatts || - BTreeTupleGetNAtts(itup, rel) == indnkeyatts); + Assert(tupnatts > 0); + Assert(tupnatts <= indnatts); /* - * We'll execute search using scan key constructed on key columns. Non-key - * (INCLUDE index) columns are always omitted from scan keys. + * We'll execute search using scan key constructed on key columns. + * Truncated attributes and non-key attributes are omitted from the final + * scan key. */ - skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData)); - + res = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + res->uniquekeys = assumeunique || _bt_hasuniquekeys(rel); + res->savebinsrch = res->restorebinsrch = false; + res->low = res->high = 0; + res->nextkey = false; + res->keysz = Min(indnkeyatts, tupnatts); + res->scantid = res->uniquekeys ? BTreeTupleGetHeapTID(itup) : NULL; + skey = res->scankeys; for (i = 0; i < indnkeyatts; i++) { FmgrInfo *procinfo; @@ -96,7 +120,20 @@ _bt_mkscankey(Relation rel, IndexTuple itup) * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - arg = index_getattr(itup, i + 1, itupdesc, &null); + + /* + * Truncated key attributes may not be represented in index tuple due + * to suffix truncation. Keys built from truncated attributes are + * defensively represented as NULL values, though they should still + * not participate in comparisons. 
+ */ + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); ScanKeyEntryInitializeWithInfo(&skey[i], flags, @@ -108,7 +145,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup) arg); } - return skey; + return res; } /* @@ -159,15 +196,6 @@ _bt_mkscankey_nodata(Relation rel) return skey; } -/* - * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata. - */ -void -_bt_freeskey(ScanKey skey) -{ - pfree(skey); -} - /* * free a retracement stack made by _bt_search. */ @@ -2083,38 +2111,218 @@ btproperty(Oid index_oid, int attno, } /* - * _bt_nonkey_truncate() -- create tuple without non-key suffix attributes. + * _bt_truncate() -- create tuple without unneeded suffix attributes. * - * Returns truncated index tuple allocated in caller's memory context, with key - * attributes copied from caller's itup argument. Currently, suffix truncation - * is only performed to create pivot tuples in INCLUDE indexes, but some day it - * could be generalized to remove suffix attributes after the first - * distinguishing key attribute. + * Returns truncated pivot index tuple allocated in caller's memory context, + * with key attributes copied from caller's firstright argument. If rel is + * an INCLUDE index, non-key attributes will definitely be truncated away, + * since they're not part of the key space. More aggressive suffix + * truncation can take place when it's clear that the returned tuple does not + * need one or more suffix key attributes. This is possible when there are + * attributes that follow an attribute in firstright that is not equal to the + * corresponding attribute in lastleft (equal according to insertion scan key + * semantics). * - * Truncated tuple is guaranteed to be no larger than the original, which is - * important for staying under the 1/3 of a page restriction on tuple size. 
+ * Sometimes this routine will return a new pivot tuple that takes up more + * space than firstright, because a new heap TID attribute had to be added to + * distinguish lastleft from firstright. This should only happen when the + * caller is in the process of splitting a leaf page that has many logical + * duplicates, where it's unavoidable. * - * Note that returned tuple's t_tid offset will hold the number of attributes - * present, so the original item pointer offset is not represented. Caller - * should only change truncated tuple's downlink. + * Note that returned tuple's t_tid offset will hold the number of + * attributes present, so the original item pointer offset is not + * represented. Caller should only change truncated tuple's downlink. Note + * also that truncated key attributes are treated as containing "minus + * infinity" values by _bt_compare()/_bt_tuple_compare(). + * + * Returned tuple is guaranteed to be no larger than the original plus some + * extra space for a possible extra heap TID tie-breaker attribute. This + * guarantee is important for staying under the 1/3 of a page restriction on + * tuple size. + * + * CREATE INDEX callers must pass build = true, in order to avoid metapage + * access. */ IndexTuple -_bt_nonkey_truncate(Relation rel, IndexTuple itup) +_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool build) { - int nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel); - IndexTuple truncated; + TupleDesc itupdesc = RelationGetDescr(rel); + int16 natts = IndexRelationGetNumberOfAttributes(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int leavenatts; + IndexTuple pivot; + ItemPointer pivotheaptid; + Size newsize; /* - * We should only ever truncate leaf index tuples, which must have both - * key and non-key attributes. It's never okay to truncate a second time. + * We should only ever truncate leaf index tuples, which must have non-key + * attributes in the case of INCLUDE indexes. 
It's never okay to truncate + * a second time. */ - Assert(BTreeTupleGetNAtts(itup, rel) == - IndexRelationGetNumberOfAttributes(rel)); + Assert(BTreeTupleGetNAtts(lastleft, rel) == natts); + Assert(BTreeTupleGetNAtts(firstright, rel) == natts); - truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs); - BTreeTupleSetNAtts(truncated, nkeyattrs); + /* Determine how many attributes must be left behind */ + leavenatts = _bt_leave_natts(rel, lastleft, firstright, build); - return truncated; + if (leavenatts <= natts) + { + IndexTuple tidpivot; + + /* + * Truncate away non-key attributes and/or key attributes. Do a + * straight copy in the case where the only attribute to be "truncated + * away" is the implicit heap TID key attribute (i.e. the case where + * we can at least avoid adding an explicit heap TID attribute to new + * pivot). We should only call index_truncate_tuple() when non-TID + * attributes need to be truncated. + */ + if (leavenatts < natts) + pivot = index_truncate_tuple(itupdesc, firstright, leavenatts); + else + pivot = CopyIndexTuple(firstright); + + /* + * If there is a distinguishing key attribute within leavenatts, there + * is no need to add an explicit heap TID attribute to new pivot. + */ + if (leavenatts <= nkeyatts) + { + BTreeTupleSetNAtts(pivot, leavenatts); + return pivot; + } + + /* + * Only non-key attributes could be truncated away from an INCLUDE + * index's pivot tuple. They are not considered part of the key + * space, so it's still necessary to add a heap TID attribute to the + * new pivot tuple. Create enlarged copy of our truncated right tuple + * copy, to fit heap TID. + */ + Assert(natts < nkeyatts); + newsize = MAXALIGN(IndexTupleSize(pivot) + sizeof(ItemPointerData)); + tidpivot = palloc0(newsize); + memcpy(tidpivot, pivot, IndexTupleSize(pivot)); + pfree(pivot); + pivot = tidpivot; + } + else + { + /* + * No truncation was possible, since attributes are all equal. 
It's + * necessary to add a heap TID attribute to the new pivot tuple. + */ + Assert(natts == nkeyatts); + newsize = MAXALIGN(IndexTupleSize(firstright) + sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, IndexTupleSize(firstright)); + } + + /* + * We have to use heap TID as a unique-ifier in the new pivot tuple, since + * no non-TID attribute in the right item readily distinguishes the right + * side of the split from the left side. Use enlarged space that holds a + * copy of first right tuple; place a heap TID value within the extra + * space that remains at the end. + * + * nbtree conceptualizes this case as an inability to truncate away any + * attribute. We must use an alternative representation of heap TID + * within pivots because heap TID is only treated as an attribute within + * nbtree (e.g., there is no explicit pg_attribute entry). + * + * Callers generally try to avoid choosing a split point that necessitates + * that we do this. Splits of pages that only involve a single distinct + * value (or set of values) must end up here, though. + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + + /* + * Generate an artificial heap TID value that's immediately before the + * first right item's heap TID. The goal is to maximize the number of + * future duplicates that will end up on the mostly-empty right side of + * the split, while minimizing the number inserted on the mostly-full left + * side. (We expect a continual right-heavy split pattern.) 
+ */ + pivotheaptid = (ItemPointer) ((char *) pivot + newsize - + sizeof(ItemPointerData)); + ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerSetOffsetNumber(pivotheaptid, + OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); + + /* + * Lehman and Yao require that the downlink to the right page, which is to + * be inserted into the parent page in the second phase of a page split, be + * a strict lower bound on all current and future items on the right page. + * That's why we didn't just directly copy the first right item's heap + * TID. + */ + Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); + Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + + BTreeTupleSetNAtts(pivot, nkeyatts); + BTreeTupleSetAltHeapTID(pivot); + + return pivot; +} + +/* + * _bt_leave_natts - how many key attributes to leave when truncating. + * + * Caller provides two tuples that enclose a split point. CREATE INDEX + * callers must pass build = true so that we may avoid metapage access. (This + * is okay because CREATE INDEX always creates an index on the latest btree + * version, where all keys are unique.) + * + * This can return a number of attributes that is one greater than the + * number of key attributes for the index relation. This indicates that the + * caller must use a heap TID as a unique-ifier in new pivot tuple. + */ +static int +_bt_leave_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool build) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int leavenatts; + BTScanInsert key; + + key = _bt_mkscankey(rel, firstright, build); + + /* + * Be consistent about the representation of BTREE_VERSION 3 tuples across + * Postgres versions; don't allow new pivot tuples to have truncated key + * attributes there. This keeps things consistent and simple for + * verification tools that have to handle multiple versions.
+ */ + if (!key->uniquekeys) + { + Assert(nkeyatts != IndexRelationGetNumberOfAttributes(rel)); + return nkeyatts; + } + + key->scantid = NULL; + + /* + * Even test nkeyatts (no truncated non-TID attributes) case, since caller + * cares about whether or not it can avoid appending a heap TID as a + * unique-ifier + */ + leavenatts = 1; + for (;;) + { + if (leavenatts > nkeyatts) + break; + key->keysz = leavenatts; + if (_bt_tuple_compare(rel, key, lastleft, nkeyatts) > 0) + break; + leavenatts++; + } + + /* Can't leak memory here */ + pfree(key); + + return leavenatts; } /* @@ -2137,6 +2345,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; + int tupnatts; /* * We cannot reliably test a deleted or half-deleted page, since they have @@ -2156,6 +2365,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS"); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + tupnatts = BTreeTupleGetNAtts(itup, rel); if (P_ISLEAF(opaque)) { @@ -2165,7 +2375,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated */ - return BTreeTupleGetNAtts(itup, rel) == natts; + return tupnatts == natts; } else { @@ -2176,7 +2386,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) Assert(!P_RIGHTMOST(opaque)); /* Page high key tuple contains only key attributes */ - return BTreeTupleGetNAtts(itup, rel) == nkeyatts; + return tupnatts > 0 && tupnatts <= nkeyatts; } } else /* !P_ISLEAF(opaque) */ @@ -2209,8 +2419,73 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) * Tuple contains only key attributes despite on is it page high * key or not */ - return BTreeTupleGetNAtts(itup, rel) == nkeyatts; + return tupnatts > 0 && tupnatts <= 
nkeyatts; } } } + +/* + * + * _bt_check_third_page() -- check whether tuple fits on a btree page at all. + * + * We actually need to be able to fit three items on every page, so restrict + * any one item to 1/3 the per-page available space. Note that itemsz should + * not include the ItemId overhead. + * + * It might be useful to apply TOAST methods rather than throw an error here. + * Using out of line storage would break assumptions made by suffix truncation + * and by contrib/amcheck, though. + */ +void +_bt_check_third_page(Relation rel, Relation heap, Page page, IndexTuple newtup) +{ + bool needheaptidspace; + Size itemsz; + + itemsz = MAXALIGN(IndexTupleSize(newtup)); + + /* Double check item size against limit */ + if (itemsz <= BTMaxItemSize(page)) + return; + + /* + * Tuple is probably too large to fit on page, but it's possible that the + * index uses version 2 or version 3, in which case a slightly higher + * limit applies. + */ + needheaptidspace = _bt_hasuniquekeys(rel); + if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page)) + return; + + if (needheaptidspace) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"", + itemsz, BTREE_VERSION, BTMaxItemSize(page), + RelationGetRelationName(rel)), + errdetail("Index row references tuple (%u,%u) in relation \"%s\".", + ItemPointerGetBlockNumber(&newtup->t_tid), + ItemPointerGetOffsetNumber(&newtup->t_tid), + RelationGetRelationName(heap)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heap, + RelationGetRelationName(rel)))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds btree version 3 maximum %zu for index \"%s\"", + itemsz, BTMaxItemSizeNoHeapTid(page), + RelationGetRelationName(rel)), + errdetail("Index row 
references tuple (%u,%u) in relation \"%s\".", + ItemPointerGetBlockNumber(&newtup->t_tid), + ItemPointerGetOffsetNumber(&newtup->t_tid), + RelationGetRelationName(heap)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heap, + RelationGetRelationName(rel)))); +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 67a94cb80a..fe8f4fe2a7 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -103,7 +103,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md = BTPageGetMeta(metapg); md->btm_magic = BTREE_MAGIC; - md->btm_version = BTREE_VERSION; + md->btm_version = xlrec->version; md->btm_root = xlrec->root; md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; @@ -202,7 +202,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) } static void -btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) +btree_xlog_split(bool onleft, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); @@ -213,8 +213,6 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) BTPageOpaque ropaque; char *datapos; Size datalen; - IndexTuple left_hikey = NULL; - Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; @@ -248,20 +246,6 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) _bt_restore_page(rpage, datapos, datalen); - /* - * When the high key isn't present is the wal record, then we assume it to - * be equal to the first key on the right page. It must be from the leaf - * level. 
- */ - if (!lhighkey) - { - ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); - - Assert(isleaf); - left_hikey = (IndexTuple) PageGetItem(rpage, hiItemId); - left_hikeysz = ItemIdGetLength(hiItemId); - } - PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); @@ -284,6 +268,8 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) OffsetNumber off; IndexTuple newitem = NULL; Size newitemsz = 0; + IndexTuple left_hikey = NULL; + Size left_hikeysz = 0; Page newlpage; OffsetNumber leftoff; @@ -298,13 +284,10 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record) } /* Extract left hikey and its size (assuming 16-bit alignment) */ - if (lhighkey) - { - left_hikey = (IndexTuple) datapos; - left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); - datapos += left_hikeysz; - datalen -= left_hikeysz; - } + left_hikey = (IndexTuple) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; Assert(datalen == 0); @@ -1003,16 +986,10 @@ btree_redo(XLogReaderState *record) btree_xlog_insert(false, true, record); break; case XLOG_BTREE_SPLIT_L: - btree_xlog_split(true, false, record); - break; - case XLOG_BTREE_SPLIT_L_HIGHKEY: - btree_xlog_split(true, true, record); + btree_xlog_split(true, record); break; case XLOG_BTREE_SPLIT_R: - btree_xlog_split(false, false, record); - break; - case XLOG_BTREE_SPLIT_R_HIGHKEY: - btree_xlog_split(false, true, record); + btree_xlog_split(false, record); break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 5c4457179d..667c906b2e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -35,8 +35,6 @@ btree_desc(StringInfo buf, XLogReaderState *record) } case XLOG_BTREE_SPLIT_L: case XLOG_BTREE_SPLIT_R: - case XLOG_BTREE_SPLIT_L_HIGHKEY: - case XLOG_BTREE_SPLIT_R_HIGHKEY: { xl_btree_split *xlrec = 
(xl_btree_split *) rec; @@ -130,12 +128,6 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; - case XLOG_BTREE_SPLIT_L_HIGHKEY: - id = "SPLIT_L_HIGHKEY"; - break; - case XLOG_BTREE_SPLIT_R_HIGHKEY: - id = "SPLIT_R_HIGHKEY"; - break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 5909404e1e..3932d22b62 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -964,7 +964,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); } - _bt_freeskey(indexScanKey); + pfree(indexScanKey); MemoryContextSwitchTo(oldcontext); @@ -1042,7 +1042,7 @@ tuplesort_begin_index_btree(Relation heapRel, PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); } - _bt_freeskey(indexScanKey); + pfree(indexScanKey); MemoryContextSwitchTo(oldcontext); @@ -4057,9 +4057,10 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, } /* - * If key values are equal, we sort on ItemPointer. This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. 
*/ { BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ea495f1724..7baef7d685 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -97,7 +97,7 @@ typedef BTPageOpaqueData *BTPageOpaque; typedef struct BTMetaPageData { uint32 btm_magic; /* should contain BTREE_MAGIC */ - uint32 btm_version; /* should contain BTREE_VERSION */ + uint32 btm_version; /* should be >= BTREE_META_VERSION */ BlockNumber btm_root; /* current root location */ uint32 btm_level; /* tree level of the root page */ BlockNumber btm_fastroot; /* current "fast" root location */ @@ -114,16 +114,27 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ -#define BTREE_VERSION 3 /* current version number */ +#define BTREE_VERSION 4 /* current version number */ #define BTREE_MIN_VERSION 2 /* minimal supported version number */ +#define BTREE_META_VERSION 3 /* minimal version with all meta fields */ /* * Maximum size of a btree index entry, including its tuple header. * * We actually need to be able to fit three items on every page, * so restrict any one item to 1/3 the per-page available space. + * + * There are rare cases where _bt_truncate() will need to enlarge + * a heap index tuple to make space for a tie-breaker heap TID + * attribute, which we account for here. */ #define BTMaxItemSize(page) \ + MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + \ + 3*sizeof(ItemIdData) + \ + 3*sizeof(ItemPointerData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +#define BTMaxItemSizeNoHeapTid(page) \ MAXALIGN_DOWN((PageGetPageSize(page) - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) @@ -204,21 +215,23 @@ typedef struct BTMetaPageData * real offset (downlinks only need to store a block number). 
The offset * field only stores the number of attributes when the INDEX_ALT_TID_MASK * bit is set (we never assume that pivot tuples must explicitly store the - * number of attributes, and currently do not bother storing the number of - * attributes unless indnkeyatts actually differs from indnatts). - * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's - * possible that it will be used within non-pivot tuples in the future. Do - * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot - * tuple. + * number of attributes). INDEX_ALT_TID_MASK is only used for pivot tuples + * at present, though it's possible that it will be used within non-pivot + * tuples in the future. Do not assume that a tuple with INDEX_ALT_TID_MASK + * set must be a pivot tuple. A pivot tuple must have INDEX_ALT_TID_MASK set + * as of BTREE_VERSION 4, however. * * The 12 least significant offset bits are used to represent the number of - * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved - * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should - * be large enough to store any number <= INDEX_MAX_KEYS. + * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 status bits + * (BT_RESERVED_OFFSET_MASK bits): BT_HEAP_TID_ATTR, plus 3 bits that are + * reserved for future use. BT_N_KEYS_OFFSET_MASK should be large enough to + * store any number <= INDEX_MAX_KEYS. */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF +/* Reserved to indicate if heap TID is represented at end of tuple */ +#define BT_HEAP_TID_ATTR 0x1000 /* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ @@ -241,14 +254,15 @@ typedef struct BTMetaPageData } while(0) /* - * Get/set number of attributes within B-tree index tuple. Asserts should be - * removed when BT_RESERVED_OFFSET_MASK bits will be used. 
+ * Get/set number of attributes within B-tree index tuple. + * + * Note that this does not include an implicit tie-breaker heap-TID + * attribute, if any. */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ (itup)->t_info & INDEX_ALT_TID_MASK ? \ ( \ - AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ @@ -257,10 +271,42 @@ typedef struct BTMetaPageData #define BTreeTupleSetNAtts(itup, n) \ do { \ (itup)->t_info |= INDEX_ALT_TID_MASK; \ - Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ } while(0) +/* + * Get tie-breaker heap TID attribute, if any. Macro works with both pivot + * and non-pivot tuples. + * + * Assumes that any tuple without INDEX_ALT_TID_MASK set has a t_tid that + * points to the heap, and that all pivot tuples have INDEX_ALT_TID_MASK set + * (since all pivot tuples must as of BTREE_VERSION 4). When non-pivot + * tuples use the INDEX_ALT_TID_MASK representation in the future, they'll + * probably also contain a heap TID at the end of the tuple. We avoid + * assuming that a tuple with INDEX_ALT_TID_MASK set is necessarily a pivot + * tuple. + */ +#define BTreeTupleGetHeapTID(itup) \ + ( \ + (itup)->t_info & INDEX_ALT_TID_MASK && \ + (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ + ( \ + (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ + sizeof(ItemPointerData)) \ + ) \ + : (itup)->t_info & INDEX_ALT_TID_MASK ? 
NULL : (ItemPointer) &((itup)->t_tid) \ + ) +/* + * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK + * representation (currently limited to pivot tuples) + */ +#define BTreeTupleSetAltHeapTID(itup) \ + do { \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ + } while(0) + /* * Operator strategy numbers for B-tree have been moved to access/stratnum.h, * because many places need to use them in ScanKeyInit() calls. @@ -319,6 +365,62 @@ typedef struct BTStackData typedef BTStackData *BTStack; +/* + * BTScanInsert is the btree-private state needed to find an initial position + * for an indexscan, or to insert new tuples. For details on its mutable + * state, see _bt_binsrch and _bt_findinsertloc. + * + * uniquekeys indicates if we expect all keys in the index to be unique by + * treating heap TID as a tie-breaker attribute (i.e. the index is + * BTREE_VERSION 4+). scantid should never be set when index is not a + * uniquekeys index. + * + * When nextkey is false (the usual case), _bt_search and _bt_binsrch will + * locate the first item >= scankey. When nextkey is true, they will locate + * the first item > scan key. + * + * scantid is the heap TID that is used as a final tie-breaker attribute, + * which may be set to NULL to indicate its absence. When inserting new + * tuples, it must be set, since every tuple in the tree unambiguously belongs + * in one exact position, even when there are entries in the tree that are + * considered duplicates by external code. Unique insertions set scantid only + * after unique checking indicates that it's safe to insert. Despite the + * representational difference, scantid is just another insertion scankey to + * routines like _bt_search(). + * + * keysz is the number of insertion scankeys present (scantid is counted + * separately). 
+ * + * scankeys is an array of scan key entries for attributes that are compared + * before scantid (user-visible attributes). Every attribute should have an + * entry during insertion, though not necessarily when a regular index scan + * uses an insertion scankey to find an initial leaf page. The array is + * used as a flexible array member, though it's sized in a way that makes it + * possible to use stack allocations. See nbtree/README for full details. + */ + +typedef struct BTScanInsertData +{ + /* + * Mutable state. Used by _bt_binsrch() to inexpensively repeat a binary + * search when only scantid has changed. + */ + bool savebinsrch; + bool restorebinsrch; + OffsetNumber low; + OffsetNumber high; + + /* State used to find tuples on the leaf level */ + bool uniquekeys; + bool nextkey; + ItemPointer scantid; /* Not used in !uniquekeys case */ + int keysz; /* Size of scankeys */ + ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ +} BTScanInsertData; + +typedef BTScanInsertData *BTScanInsert; + + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. 
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -541,6 +643,7 @@ extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); +extern bool _bt_hasuniquekeys(Relation rel); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -559,15 +662,15 @@ extern int _bt_pagedel(Relation rel, Buffer buf); * prototypes for functions in nbtsearch.c */ extern BTStack _bt_search(Relation rel, - int keysz, ScanKey scankey, bool nextkey, + BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot); -extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, - ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, - int access, Snapshot snapshot); -extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, - ScanKey scankey, bool nextkey); -extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, - Page page, OffsetNumber offnum); +extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, + bool forupdate, BTStack stack, int access, Snapshot snapshot); +extern OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); +extern int32 _bt_tuple_compare(Relation rel, BTScanInsert key, IndexTuple itup, + int ntupatts); +extern ItemPointer _bt_lowest_scantid(void); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, @@ -576,9 +679,9 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, /* * prototypes for functions in nbtutils.c */ -extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup); 
+extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup, + bool assumeunique); extern ScanKey _bt_mkscankey_nodata(Relation rel); -extern void _bt_freeskey(ScanKey skey); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); @@ -600,8 +703,11 @@ extern bytea *btoptions(Datum reloptions, bool validate); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); -extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup); +extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, + IndexTuple firstright, bool build); extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum); +extern void _bt_check_third_page(Relation rel, Relation heap, Page page, + IndexTuple newtup); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 819373031c..06da0965f7 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,8 +28,7 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */ -#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */ +/* 0x50 and 0x60 are unused */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -47,6 +46,7 @@ */ typedef struct xl_btree_metadata { + uint32 version; BlockNumber root; uint32 level; BlockNumber fastroot; @@ -82,20 +82,16 @@ typedef struct xl_btree_insert * * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data 
record. * The _L and _R variants indicate whether the inserted tuple went into the - * left or right split page (and thus, whether newitemoff and the new item - * are stored or not). The _HIGHKEY variants indicate that we've logged - * explicitly left page high key value, otherwise redo should use right page - * leftmost key as a left page high key. _HIGHKEY is specified for internal - * pages where right page leftmost key is suppressed, and for leaf pages - * of covering indexes where high key have non-key attributes truncated. + * left or right split page (and thus, whether newitemoff and the new item are + * stored or not). We always explicitly log the left page high key value. * * Backup Blk 0: original page / new left page * * The left page's data portion contains the new item, if it's the _L variant. - * (In the _R variants, the new item is one of the right page's tuples.) - * If level > 0, an IndexTuple representing the HIKEY of the left page - * follows. We don't need this on leaf pages, because it's the same as the - * leftmost key in the new right page. + * In the _R variants, the new item is one of the right page's tuples. An + * IndexTuple representing the HIKEY of the left page follows. We don't need + * this on leaf pages, because it's the same as the leftmost key in the new + * right page. 
* * Backup Blk 1: new right page * diff --git a/src/test/modules/test_extensions/expected/test_extensions.out b/src/test/modules/test_extensions/expected/test_extensions.out index 28d86c4b87..29b4ec95c1 100644 --- a/src/test/modules/test_extensions/expected/test_extensions.out +++ b/src/test/modules/test_extensions/expected/test_extensions.out @@ -30,10 +30,10 @@ NOTICE: installing required extension "test_ext_cyclic2" ERROR: cyclic dependency detected between extensions "test_ext_cyclic1" and "test_ext_cyclic2" DROP SCHEMA test_ext CASCADE; NOTICE: drop cascades to 5 other objects -DETAIL: drop cascades to extension test_ext3 -drop cascades to extension test_ext5 -drop cascades to extension test_ext2 +DETAIL: drop cascades to extension test_ext5 drop cascades to extension test_ext4 +drop cascades to extension test_ext3 +drop cascades to extension test_ext2 drop cascades to extension test_ext1 CREATE EXTENSION test_ext6; DROP EXTENSION test_ext6; diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 717e965f30..e815a657b1 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1070,9 +1070,9 @@ select distinct min(f1), max(f1) from minmaxtest; drop table minmaxtest cascade; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to table minmaxtest1 +DETAIL: drop cascades to table minmaxtest3 drop cascades to table minmaxtest2 -drop cascades to table minmaxtest3 +drop cascades to table minmaxtest1 -- check for correct detection of nested-aggregate errors select max(min(unique1)) from tenk1; ERROR: aggregate function calls cannot be nested diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 68cd3e5676..d2ece1355a 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2684,19 +2684,19 @@ select alter2.plus1(41); -- clean up drop schema alter2 cascade; NOTICE: 
drop cascades to 13 other objects -DETAIL: drop cascades to table alter2.t1 -drop cascades to view alter2.v1 -drop cascades to function alter2.plus1(integer) -drop cascades to type alter2.posint -drop cascades to operator family alter2.ctype_hash_ops for access method hash +DETAIL: drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to conversion alter2.ascii_to_utf8 drop cascades to type alter2.ctype drop cascades to function alter2.same(alter2.ctype,alter2.ctype) drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) -drop cascades to conversion alter2.ascii_to_utf8 -drop cascades to text search parser alter2.prs -drop cascades to text search configuration alter2.cfg -drop cascades to text search template alter2.tmpl -drop cascades to text search dictionary alter2.dict +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to type alter2.posint +drop cascades to function alter2.plus1(integer) +drop cascades to table alter2.t1 +drop cascades to view alter2.v1 -- -- composite types -- diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index fcbe3a5cc8..6a58a6ae8a 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -667,5 +667,6 @@ SELECT collation for ((SELECT b FROM collate_test1 LIMIT 1)); -- must get rid of them. 
-- \set VERBOSITY terse +SET client_min_messages TO 'warning'; DROP SCHEMA collate_tests CASCADE; -NOTICE: drop cascades to 17 other objects +RESET client_min_messages; diff --git a/src/test/regress/expected/create_type.out b/src/test/regress/expected/create_type.out index 2f7d5f94d7..8309756030 100644 --- a/src/test/regress/expected/create_type.out +++ b/src/test/regress/expected/create_type.out @@ -161,13 +161,13 @@ DROP FUNCTION base_fn_out(opaque); -- error ERROR: function base_fn_out(opaque) does not exist DROP TYPE base_type; -- error ERROR: cannot drop type base_type because other objects depend on it -DETAIL: function base_fn_out(base_type) depends on type base_type -function base_fn_in(cstring) depends on type base_type +DETAIL: function base_fn_in(cstring) depends on type base_type +function base_fn_out(base_type) depends on type base_type HINT: Use DROP ... CASCADE to drop the dependent objects too. DROP TYPE base_type CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to function base_fn_out(base_type) -drop cascades to function base_fn_in(cstring) +DETAIL: drop cascades to function base_fn_in(cstring) +drop cascades to function base_fn_out(base_type) -- Check usage of typmod with a user-defined type -- (we have borrowed numeric's typmod functions) CREATE TEMP TABLE mytab (foo widget(42,13,7)); -- should fail diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out index 8e50f8ffbb..8d31110b87 100644 --- a/src/test/regress/expected/dependency.out +++ b/src/test/regress/expected/dependency.out @@ -128,9 +128,9 @@ FROM pg_type JOIN pg_class c ON typrelid = c.oid WHERE typname = 'deptest_t'; -- doesn't work: grant still exists DROP USER regress_dep_user1; ERROR: role "regress_dep_user1" cannot be dropped because some objects depend on it -DETAIL: owner of default privileges on new relations belonging to role regress_dep_user1 in schema deptest +DETAIL: privileges for table deptest1 privileges 
for database regression -privileges for table deptest1 +owner of default privileges on new relations belonging to role regress_dep_user1 in schema deptest DROP OWNED BY regress_dep_user1; DROP USER regress_dep_user1; \set VERBOSITY terse diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out index 0b5a9041b0..f4899f2a38 100644 --- a/src/test/regress/expected/domain.out +++ b/src/test/regress/expected/domain.out @@ -643,10 +643,10 @@ update domnotnull set col1 = null; -- fails ERROR: domain dnotnulltest does not allow null values alter domain dnotnulltest drop not null; update domnotnull set col1 = null; +\set VERBOSITY terse drop domain dnotnulltest cascade; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to column col1 of table domnotnull -drop cascades to column col2 of table domnotnull +\set VERBOSITY default -- Test ALTER DOMAIN .. DEFAULT .. create table domdeftest (col1 ddef1); insert into domdeftest default values; diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 0e32d5c427..0755931db8 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -187,9 +187,9 @@ ERROR: event trigger "regress_event_trigger" does not exist -- should fail, regress_evt_user owns some objects drop role regress_evt_user; ERROR: role "regress_evt_user" cannot be dropped because some objects depend on it -DETAIL: owner of event trigger regress_event_trigger3 +DETAIL: owner of user mapping for regress_evt_user on server useless_server owner of default privileges on new relations belonging to role regress_evt_user -owner of user mapping for regress_evt_user on server useless_server +owner of event trigger regress_event_trigger3 -- cleanup before next test -- these are all OK; the second one should emit a NOTICE drop event trigger if exists regress_event_trigger2; @@ -276,14 +276,13 @@ CREATE EVENT TRIGGER 
regress_event_trigger_drop_objects ON sql_drop ALTER TABLE schema_one.table_one DROP COLUMN a; DROP SCHEMA schema_one, schema_two CASCADE; NOTICE: drop cascades to 7 other objects -DETAIL: drop cascades to table schema_two.table_two -drop cascades to table schema_two.table_three -drop cascades to function schema_two.add(integer,integer) +DETAIL: drop cascades to function schema_two.add(integer,integer) drop cascades to function schema_two.newton(integer) -drop cascades to table schema_one.table_one -drop cascades to table schema_one."table two" +drop cascades to table schema_two.table_three +drop cascades to table schema_two.table_two drop cascades to table schema_one.table_three -NOTICE: table "schema_two_table_two" does not exist, skipping +drop cascades to table schema_one."table two" +drop cascades to table schema_one.table_one NOTICE: table "audit_tbls_schema_two_table_three" does not exist, skipping ERROR: object audit_tbls.schema_two_table_three of type table cannot be dropped CONTEXT: PL/pgSQL function undroppable() line 14 at RAISE @@ -292,61 +291,61 @@ PL/pgSQL function test_evtrig_dropped_objects() line 8 at EXECUTE DELETE FROM undroppable_objs WHERE object_identity = 'audit_tbls.schema_two_table_three'; DROP SCHEMA schema_one, schema_two CASCADE; NOTICE: drop cascades to 7 other objects -DETAIL: drop cascades to table schema_two.table_two -drop cascades to table schema_two.table_three -drop cascades to function schema_two.add(integer,integer) +DETAIL: drop cascades to function schema_two.add(integer,integer) drop cascades to function schema_two.newton(integer) -drop cascades to table schema_one.table_one -drop cascades to table schema_one."table two" +drop cascades to table schema_two.table_three +drop cascades to table schema_two.table_two drop cascades to table schema_one.table_three -NOTICE: table "schema_two_table_two" does not exist, skipping +drop cascades to table schema_one."table two" +drop cascades to table schema_one.table_one NOTICE: table 
"audit_tbls_schema_two_table_three" does not exist, skipping -NOTICE: table "schema_one_table_one" does not exist, skipping -NOTICE: table "schema_one_table two" does not exist, skipping +NOTICE: table "schema_two_table_two" does not exist, skipping NOTICE: table "schema_one_table_three" does not exist, skipping +NOTICE: table "schema_one_table two" does not exist, skipping +NOTICE: table "schema_one_table_one" does not exist, skipping ERROR: object schema_one.table_three of type table cannot be dropped CONTEXT: PL/pgSQL function undroppable() line 14 at RAISE DELETE FROM undroppable_objs WHERE object_identity = 'schema_one.table_three'; DROP SCHEMA schema_one, schema_two CASCADE; NOTICE: drop cascades to 7 other objects -DETAIL: drop cascades to table schema_two.table_two -drop cascades to table schema_two.table_three -drop cascades to function schema_two.add(integer,integer) +DETAIL: drop cascades to function schema_two.add(integer,integer) drop cascades to function schema_two.newton(integer) -drop cascades to table schema_one.table_one -drop cascades to table schema_one."table two" +drop cascades to table schema_two.table_three +drop cascades to table schema_two.table_two drop cascades to table schema_one.table_three -NOTICE: table "schema_two_table_two" does not exist, skipping +drop cascades to table schema_one."table two" +drop cascades to table schema_one.table_one NOTICE: table "audit_tbls_schema_two_table_three" does not exist, skipping -NOTICE: table "schema_one_table_one" does not exist, skipping -NOTICE: table "schema_one_table two" does not exist, skipping +NOTICE: table "schema_two_table_two" does not exist, skipping NOTICE: table "schema_one_table_three" does not exist, skipping +NOTICE: table "schema_one_table two" does not exist, skipping +NOTICE: table "schema_one_table_one" does not exist, skipping SELECT * FROM dropped_objects WHERE schema IS NULL OR schema <> 'pg_toast'; type | schema | object 
--------------+------------+------------------------------------- table column | schema_one | schema_one.table_one.a schema | | schema_two - table | schema_two | schema_two.table_two - type | schema_two | schema_two.table_two - type | schema_two | schema_two.table_two[] + function | schema_two | schema_two.add(integer,integer) + aggregate | schema_two | schema_two.newton(integer) table | audit_tbls | audit_tbls.schema_two_table_three type | audit_tbls | audit_tbls.schema_two_table_three type | audit_tbls | audit_tbls.schema_two_table_three[] table | schema_two | schema_two.table_three type | schema_two | schema_two.table_three type | schema_two | schema_two.table_three[] - function | schema_two | schema_two.add(integer,integer) - aggregate | schema_two | schema_two.newton(integer) + table | schema_two | schema_two.table_two + type | schema_two | schema_two.table_two + type | schema_two | schema_two.table_two[] schema | | schema_one - table | schema_one | schema_one.table_one - type | schema_one | schema_one.table_one - type | schema_one | schema_one.table_one[] - table | schema_one | schema_one."table two" - type | schema_one | schema_one."table two" - type | schema_one | schema_one."table two"[] table | schema_one | schema_one.table_three type | schema_one | schema_one.table_three type | schema_one | schema_one.table_three[] + table | schema_one | schema_one."table two" + type | schema_one | schema_one."table two" + type | schema_one | schema_one."table two"[] + table | schema_one | schema_one.table_one + type | schema_one | schema_one.table_one + type | schema_one | schema_one.table_one[] (23 rows) DROP OWNED BY regress_evt_user; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 75365501d4..1171a8865f 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -441,8 +441,8 @@ ALTER SERVER s1 OWNER TO regress_test_indirect; RESET ROLE; DROP ROLE 
regress_test_indirect; -- ERROR ERROR: role "regress_test_indirect" cannot be dropped because some objects depend on it -DETAIL: owner of server s1 -privileges for foreign-data wrapper foo +DETAIL: privileges for foreign-data wrapper foo +owner of server s1 \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description @@ -2060,9 +2060,9 @@ DROP TABLE temp_parted; DROP SCHEMA foreign_schema CASCADE; DROP ROLE regress_test_role; -- ERROR ERROR: role "regress_test_role" cannot be dropped because some objects depend on it -DETAIL: privileges for server s4 +DETAIL: owner of user mapping for regress_test_role on server s6 privileges for foreign-data wrapper foo -owner of user mapping for regress_test_role on server s6 +privileges for server s4 DROP SERVER t1 CASCADE; NOTICE: drop cascades to user mapping for public on server t1 DROP USER MAPPING FOR regress_test_role SERVER s6; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index fee594531d..0e1f7ad7ea 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -253,13 +253,13 @@ SELECT * FROM FKTABLE; (5 rows) -- this should fail for lack of CASCADE +\set VERBOSITY terse DROP TABLE PKTABLE; ERROR: cannot drop table pktable because other objects depend on it -DETAIL: constraint constrname2 on table fktable depends on table pktable -HINT: Use DROP ... CASCADE to drop the dependent objects too. 
DROP TABLE PKTABLE CASCADE; NOTICE: drop cascades to constraint constrname2 on table fktable DROP TABLE FKTABLE; +\set VERBOSITY default -- -- First test, check with no on delete or on update -- diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index d768e5df2c..007ad3738d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -952,8 +952,8 @@ Inherits: c1, drop table p1 cascade; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to table c1 -drop cascades to table c2 +DETAIL: drop cascades to table c2 +drop cascades to table c1 drop cascades to table c3 drop table p2 cascade; create table pp1 (f1 int); @@ -1098,9 +1098,9 @@ SELECT a.attrelid::regclass, a.attname, a.attinhcount, e.expected DROP TABLE inht1, inhs1 CASCADE; NOTICE: drop cascades to 4 other objects -DETAIL: drop cascades to table inht2 +DETAIL: drop cascades to table inht3 +drop cascades to table inht2 drop cascades to table inhts -drop cascades to table inht3 drop cascades to table inht4 -- Test non-inheritable indices [UNIQUE, EXCLUDE] constraints CREATE TABLE test_constraints (id int, val1 varchar, val2 int, UNIQUE(val1, val2)); @@ -1352,8 +1352,8 @@ select * from patest0 join (select f1 from int4_tbl limit 1) ss on id = f1; drop table patest0 cascade; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to table patest1 -drop cascades to table patest2 +DETAIL: drop cascades to table patest2 +drop cascades to table patest1 -- -- Test merge-append plans for inheritance trees -- @@ -1499,9 +1499,9 @@ reset enable_seqscan; reset enable_parallel_append; drop table matest0 cascade; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to table matest1 +DETAIL: drop cascades to table matest3 drop cascades to table matest2 -drop cascades to table matest3 +drop cascades to table matest1 -- -- Check that use of an index with an extraneous column doesn't produce -- a plan with extraneous 
sorting diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 08cd4bea48..dc7878454d 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -310,15 +310,15 @@ SELECT type, m.totamt AS mtot, v.totamt AS vtot FROM mvtest_tm m LEFT JOIN mvtes -- make sure that dependencies are reported properly when they block the drop DROP TABLE mvtest_t; ERROR: cannot drop table mvtest_t because other objects depend on it -DETAIL: view mvtest_tv depends on table mvtest_t +DETAIL: materialized view mvtest_tm depends on table mvtest_t +materialized view mvtest_tmm depends on materialized view mvtest_tm +view mvtest_tv depends on table mvtest_t view mvtest_tvv depends on view mvtest_tv materialized view mvtest_tvvm depends on view mvtest_tvv view mvtest_tvvmv depends on materialized view mvtest_tvvm materialized view mvtest_bb depends on view mvtest_tvvmv materialized view mvtest_mvschema.mvtest_tvm depends on view mvtest_tv materialized view mvtest_tvmm depends on materialized view mvtest_mvschema.mvtest_tvm -materialized view mvtest_tm depends on table mvtest_t -materialized view mvtest_tmm depends on materialized view mvtest_tm HINT: Use DROP ... CASCADE to drop the dependent objects too. -- make sure dependencies are dropped and reported -- and make sure that transactional behavior is correct on rollback @@ -326,15 +326,15 @@ HINT: Use DROP ... CASCADE to drop the dependent objects too. 
BEGIN; DROP TABLE mvtest_t CASCADE; NOTICE: drop cascades to 9 other objects -DETAIL: drop cascades to view mvtest_tv +DETAIL: drop cascades to materialized view mvtest_tm +drop cascades to materialized view mvtest_tmm +drop cascades to view mvtest_tv drop cascades to view mvtest_tvv drop cascades to materialized view mvtest_tvvm drop cascades to view mvtest_tvvmv drop cascades to materialized view mvtest_bb drop cascades to materialized view mvtest_mvschema.mvtest_tvm drop cascades to materialized view mvtest_tvmm -drop cascades to materialized view mvtest_tm -drop cascades to materialized view mvtest_tmm ROLLBACK; -- some additional tests not using base tables CREATE VIEW mvtest_vt1 AS SELECT 1 moo; @@ -484,10 +484,10 @@ SELECT * FROM mvtest_mv_v_4; DROP TABLE mvtest_v CASCADE; NOTICE: drop cascades to 4 other objects -DETAIL: drop cascades to materialized view mvtest_mv_v -drop cascades to materialized view mvtest_mv_v_2 +DETAIL: drop cascades to materialized view mvtest_mv_v_4 drop cascades to materialized view mvtest_mv_v_3 -drop cascades to materialized view mvtest_mv_v_4 +drop cascades to materialized view mvtest_mv_v_2 +drop cascades to materialized view mvtest_mv_v -- Check that unknown literals are converted to "text" in CREATE MATVIEW, -- so that we don't end up with unknown-type columns. 
CREATE MATERIALIZED VIEW mv_unspecified_types AS diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index bc16ca4c43..d91f6305f6 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -3502,8 +3502,8 @@ SELECT refclassid::regclass, deptype SAVEPOINT q; DROP ROLE regress_rls_eve; --fails due to dependency on POLICY p ERROR: role "regress_rls_eve" cannot be dropped because some objects depend on it -DETAIL: target of policy p on table tbl1 -privileges for table tbl1 +DETAIL: privileges for table tbl1 +target of policy p on table tbl1 ROLLBACK TO q; ALTER POLICY p ON tbl1 TO regress_rls_frank USING (true); SAVEPOINT q; diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out index 942f975e95..30e10b12d2 100644 --- a/src/test/regress/expected/select_into.out +++ b/src/test/regress/expected/select_into.out @@ -46,9 +46,9 @@ CREATE TABLE selinto_schema.tmp3 (a,b,c) RESET SESSION AUTHORIZATION; DROP SCHEMA selinto_schema CASCADE; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to table selinto_schema.tmp1 +DETAIL: drop cascades to table selinto_schema.tmp3 drop cascades to table selinto_schema.tmp2 -drop cascades to table selinto_schema.tmp3 +drop cascades to table selinto_schema.tmp1 DROP USER regress_selinto_user; -- Tests for WITH NO DATA and column name consistency CREATE TABLE ctas_base (i int, j int); diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out index 7d59de98eb..70b7c6eead 100644 --- a/src/test/regress/expected/triggers.out +++ b/src/test/regress/expected/triggers.out @@ -568,9 +568,9 @@ LINE 2: FOR EACH STATEMENT WHEN (OLD.* IS DISTINCT FROM NEW.*) -- check dependency restrictions ALTER TABLE main_table DROP COLUMN b; ERROR: cannot drop column b of table main_table because other objects depend on it -DETAIL: trigger after_upd_b_row_trig on table main_table 
depends on column b of table main_table +DETAIL: trigger after_upd_b_stmt_trig on table main_table depends on column b of table main_table trigger after_upd_a_b_row_trig on table main_table depends on column b of table main_table -trigger after_upd_b_stmt_trig on table main_table depends on column b of table main_table +trigger after_upd_b_row_trig on table main_table depends on column b of table main_table HINT: Use DROP ... CASCADE to drop the dependent objects too. -- this should succeed, but we'll roll it back to keep the triggers around begin; diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out index 2e26510522..c8b9a71689 100644 --- a/src/test/regress/expected/truncate.out +++ b/src/test/regress/expected/truncate.out @@ -276,11 +276,10 @@ SELECT * FROM trunc_faa; (0 rows) ROLLBACK; +\set VERBOSITY terse DROP TABLE trunc_f CASCADE; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to table trunc_fa -drop cascades to table trunc_faa -drop cascades to table trunc_fb +\set VERBOSITY default -- Test ON TRUNCATE triggers CREATE TABLE trunc_trigger_test (f1 int, f2 text, f3 text); CREATE TABLE trunc_trigger_log (tgop text, tglevel text, tgwhen text, diff --git a/src/test/regress/expected/typed_table.out b/src/test/regress/expected/typed_table.out index 2e47ecbcf5..c76efee358 100644 --- a/src/test/regress/expected/typed_table.out +++ b/src/test/regress/expected/typed_table.out @@ -75,19 +75,12 @@ CREATE TABLE persons4 OF person_type ( name WITH OPTIONS DEFAULT '' -- error, specified more than once ); ERROR: column "name" specified more than once +\set VERBOSITY terse DROP TYPE person_type RESTRICT; ERROR: cannot drop type person_type because other objects depend on it -DETAIL: table persons depends on type person_type -function get_all_persons() depends on type person_type -table persons2 depends on type person_type -table persons3 depends on type person_type -HINT: Use DROP ... 
CASCADE to drop the dependent objects too. DROP TYPE person_type CASCADE; NOTICE: drop cascades to 4 other objects -DETAIL: drop cascades to table persons -drop cascades to function get_all_persons() -drop cascades to table persons2 -drop cascades to table persons3 +\set VERBOSITY default CREATE TABLE persons5 OF stuff; -- only CREATE TYPE AS types may be used ERROR: type stuff is not a composite type DROP TABLE stuff; diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out index e64d693e9c..8eca01a8e7 100644 --- a/src/test/regress/expected/updatable_views.out +++ b/src/test/regress/expected/updatable_views.out @@ -328,24 +328,10 @@ UPDATE ro_view20 SET b=upper(b); ERROR: cannot update view "ro_view20" DETAIL: Views that return set-returning functions are not automatically updatable. HINT: To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule. +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 16 other objects -DETAIL: drop cascades to view ro_view1 -drop cascades to view ro_view17 -drop cascades to view ro_view2 -drop cascades to view ro_view3 -drop cascades to view ro_view5 -drop cascades to view ro_view6 -drop cascades to view ro_view7 -drop cascades to view ro_view8 -drop cascades to view ro_view9 -drop cascades to view ro_view11 -drop cascades to view ro_view13 -drop cascades to view rw_view15 -drop cascades to view rw_view16 -drop cascades to view ro_view20 -drop cascades to view ro_view4 -drop cascades to view rw_view14 +\set VERBOSITY default DROP VIEW ro_view10, ro_view12, ro_view18; DROP SEQUENCE uv_seq CASCADE; NOTICE: drop cascades to view ro_view19 @@ -1054,10 +1040,10 @@ SELECT * FROM base_tbl; (2 rows) RESET SESSION AUTHORIZATION; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default -- 
nested-view permissions CREATE TABLE base_tbl(a int, b text, c float); INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0); @@ -1178,10 +1164,10 @@ ERROR: permission denied for table base_tbl UPDATE rw_view2 SET b = 'bar' WHERE a = 1; -- not allowed ERROR: permission denied for table base_tbl RESET SESSION AUTHORIZATION; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default DROP USER regress_view_user1; DROP USER regress_view_user2; -- column defaults @@ -1439,11 +1425,10 @@ SELECT events & 4 != 0 AS upd, f | f | t (1 row) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 -drop cascades to view rw_view3 +\set VERBOSITY default -- inheritance tests CREATE TABLE base_tbl_parent (a int); CREATE TABLE base_tbl_child (CHECK (a > 0)) INHERITS (base_tbl_parent); @@ -1540,10 +1525,10 @@ SELECT * FROM base_tbl_child ORDER BY a; 20 (6 rows) +\set VERBOSITY terse DROP TABLE base_tbl_parent, base_tbl_child CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default -- simple WITH CHECK OPTION CREATE TABLE base_tbl (a int, b int DEFAULT 10); INSERT INTO base_tbl VALUES (1,2), (2,3), (1,-1); @@ -1711,10 +1696,10 @@ SELECT * FROM base_tbl; 30 (3 rows) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default -- WITH CHECK OPTION with no local view qual CREATE TABLE base_tbl (a int); CREATE VIEW rw_view1 AS SELECT * FROM base_tbl WITH CHECK OPTION; @@ -1740,11 +1725,10 @@ INSERT INTO rw_view3 VALUES (-3); -- should fail ERROR: new row violates check option for view "rw_view2" DETAIL: Failing row contains (-3). 
INSERT INTO rw_view3 VALUES (3); -- ok +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 -drop cascades to view rw_view3 +\set VERBOSITY default -- WITH CHECK OPTION with scalar array ops CREATE TABLE base_tbl (a int, b int[]); CREATE VIEW rw_view1 AS SELECT * FROM base_tbl WHERE a = ANY (b) @@ -1911,10 +1895,10 @@ SELECT * FROM base_tbl; -5 | 10 (7 rows) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default DROP FUNCTION rw_view1_trig_fn(); CREATE TABLE base_tbl (a int); CREATE VIEW rw_view1 AS SELECT a,10 AS b FROM base_tbl; @@ -1923,10 +1907,10 @@ CREATE RULE rw_view1_ins_rule AS ON INSERT TO rw_view1 CREATE VIEW rw_view2 AS SELECT * FROM rw_view1 WHERE a > b WITH LOCAL CHECK OPTION; INSERT INTO rw_view2 VALUES (2,3); -- ok, but not in view (doesn't fail rw_view2's check) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default -- security barrier view CREATE TABLE base_tbl (person text, visibility text); INSERT INTO base_tbl VALUES ('Tom', 'public'), @@ -2111,10 +2095,10 @@ EXPLAIN (costs off) DELETE FROM rw_view2 WHERE NOT snoop(person); Filter: ((visibility = 'public'::text) AND snoop(person) AND (NOT snoop(person))) (3 rows) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view rw_view1 -drop cascades to view rw_view2 +\set VERBOSITY default -- security barrier view on top of table with rules CREATE TABLE base_tbl(id int PRIMARY KEY, data text, deleted boolean); INSERT INTO base_tbl VALUES (1, 'Row 1', false), (2, 'Row 2', true); diff --git a/src/test/regress/output/tablespace.source 
b/src/test/regress/output/tablespace.source index fe3614cd76..9b2e95973d 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -242,10 +242,10 @@ NOTICE: no matching relations in tablespace "regress_tblspace_renamed" found DROP TABLESPACE regress_tblspace_renamed; DROP SCHEMA testschema CASCADE; NOTICE: drop cascades to 5 other objects -DETAIL: drop cascades to table testschema.foo -drop cascades to table testschema.asselect -drop cascades to table testschema.asexecute +DETAIL: drop cascades to table testschema.tablespace_acl drop cascades to table testschema.atable -drop cascades to table testschema.tablespace_acl +drop cascades to table testschema.asexecute +drop cascades to table testschema.asselect +drop cascades to table testschema.foo DROP ROLE regress_tablespace_user1; DROP ROLE regress_tablespace_user2; diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index 4ddde95a5e..94ef4e277e 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -260,4 +260,6 @@ SELECT collation for ((SELECT b FROM collate_test1 LIMIT 1)); -- must get rid of them. -- \set VERBOSITY terse +SET client_min_messages TO 'warning'; DROP SCHEMA collate_tests CASCADE; +RESET client_min_messages; diff --git a/src/test/regress/sql/domain.sql b/src/test/regress/sql/domain.sql index 68da27de22..d19e2c9d28 100644 --- a/src/test/regress/sql/domain.sql +++ b/src/test/regress/sql/domain.sql @@ -381,7 +381,9 @@ alter domain dnotnulltest drop not null; update domnotnull set col1 = null; +\set VERBOSITY terse drop domain dnotnulltest cascade; +\set VERBOSITY default -- Test ALTER DOMAIN .. DEFAULT .. 
create table domdeftest (col1 ddef1); diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 068ab2aab7..591916871a 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -159,9 +159,11 @@ UPDATE PKTABLE SET ptest1=1 WHERE ptest1=2; SELECT * FROM FKTABLE; -- this should fail for lack of CASCADE +\set VERBOSITY terse DROP TABLE PKTABLE; DROP TABLE PKTABLE CASCADE; DROP TABLE FKTABLE; +\set VERBOSITY default -- diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql index 6ddfb6dd1d..fee7e76ec3 100644 --- a/src/test/regress/sql/truncate.sql +++ b/src/test/regress/sql/truncate.sql @@ -125,7 +125,9 @@ SELECT * FROM trunc_fa; SELECT * FROM trunc_faa; ROLLBACK; +\set VERBOSITY terse DROP TABLE trunc_f CASCADE; +\set VERBOSITY default -- Test ON TRUNCATE triggers diff --git a/src/test/regress/sql/typed_table.sql b/src/test/regress/sql/typed_table.sql index 9ef0cdfcc7..953cd1f14b 100644 --- a/src/test/regress/sql/typed_table.sql +++ b/src/test/regress/sql/typed_table.sql @@ -43,8 +43,10 @@ CREATE TABLE persons4 OF person_type ( name WITH OPTIONS DEFAULT '' -- error, specified more than once ); +\set VERBOSITY terse DROP TYPE person_type RESTRICT; DROP TYPE person_type CASCADE; +\set VERBOSITY default CREATE TABLE persons5 OF stuff; -- only CREATE TYPE AS types may be used diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index dc6d5cbe35..9103793ff4 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -98,7 +98,9 @@ DELETE FROM ro_view18; UPDATE ro_view19 SET last_value=1000; UPDATE ro_view20 SET b=upper(b); +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default DROP VIEW ro_view10, ro_view12, ro_view18; DROP SEQUENCE uv_seq CASCADE; @@ -457,7 +459,9 @@ DELETE FROM rw_view2 WHERE aa=4; -- not allowed SELECT * FROM base_tbl; RESET SESSION 
AUTHORIZATION; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- nested-view permissions @@ -533,7 +537,9 @@ UPDATE rw_view2 SET b = 'bar' WHERE a = 1; -- not allowed RESET SESSION AUTHORIZATION; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default DROP USER regress_view_user1; DROP USER regress_view_user2; @@ -678,7 +684,9 @@ SELECT events & 4 != 0 AS upd, events & 16 != 0 AS del FROM pg_catalog.pg_relation_is_updatable('rw_view3'::regclass, false) t(events); +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- inheritance tests @@ -710,7 +718,9 @@ DELETE FROM ONLY rw_view2 WHERE a IN (-8, 8); -- Should delete -8 only SELECT * FROM ONLY base_tbl_parent ORDER BY a; SELECT * FROM base_tbl_child ORDER BY a; +\set VERBOSITY terse DROP TABLE base_tbl_parent, base_tbl_child CASCADE; +\set VERBOSITY default -- simple WITH CHECK OPTION @@ -772,7 +782,9 @@ SELECT * FROM information_schema.views WHERE table_name = 'rw_view2'; INSERT INTO rw_view2 VALUES (30); -- ok, but not in view SELECT * FROM base_tbl; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- WITH CHECK OPTION with no local view qual @@ -790,7 +802,9 @@ INSERT INTO rw_view2 VALUES (2); -- ok INSERT INTO rw_view3 VALUES (-3); -- should fail INSERT INTO rw_view3 VALUES (3); -- ok +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- WITH CHECK OPTION with scalar array ops @@ -918,7 +932,9 @@ INSERT INTO rw_view2 VALUES (5); -- ok UPDATE rw_view2 SET a = -5 WHERE a = 5; -- ok, but not in view (doesn't fail rw_view2's check) SELECT * FROM base_tbl; +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default DROP FUNCTION rw_view1_trig_fn(); CREATE TABLE base_tbl (a int); @@ -928,7 +944,9 @@ CREATE RULE rw_view1_ins_rule AS ON INSERT TO rw_view1 CREATE VIEW rw_view2 AS SELECT * FROM rw_view1 WHERE a > b WITH LOCAL CHECK OPTION; INSERT INTO rw_view2 VALUES (2,3); -- ok, but not 
in view (doesn't fail rw_view2's check) +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- security barrier view @@ -1012,7 +1030,9 @@ EXPLAIN (costs off) SELECT * FROM rw_view2 WHERE snoop(person); EXPLAIN (costs off) UPDATE rw_view2 SET person=person WHERE snoop(person); EXPLAIN (costs off) DELETE FROM rw_view2 WHERE NOT snoop(person); +\set VERBOSITY terse DROP TABLE base_tbl CASCADE; +\set VERBOSITY default -- security barrier view on top of table with rules diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9fe950b29d..08cf72d670 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -167,6 +167,8 @@ BTArrayKeyInfo BTBuildState BTCycleId BTIndexStat +BTInsertionKey +BTInsertionKeyData BTLeader BTMetaPageData BTOneVacInfo @@ -2207,6 +2209,8 @@ SpecialJoinInfo SpinDelayStatus SplitInterval SplitLR +SplitMode +SplitPoint SplitVar SplitedPageLayout StackElem -- 2.17.1