From 8c0c21b752dea0db00686b306a68980fad580806 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 31 Jan 2019 17:40:00 -0800 Subject: [PATCH v13 6/7] Allow tuples to be relocated from root by amcheck. Teach contrib/amcheck's bt_index_parent_check() function to take advantage of the uniqueness property of heapkeyspace indexes in support of a new verification option: non-pivot tuples (non-highkey tuples on the leaf level) can optionally be relocated using a new search that starts from the root page. The new "relocate" verification option is exhaustive, and can therefore make a call to bt_index_parent_check() take a lot longer. Relocating tuples during verification is intended as an option for backend developers, since the corruption scenarios that it alone is uniquely capable of detecting seem fairly far-fetched. For example, "relocate" verification is generally the only way of detecting corruption of the least significant byte of a key from a pivot tuple in the root page, since only a few tuples on a cousin leaf page are liable to "be overlooked" by index scans. --- contrib/amcheck/Makefile | 2 +- contrib/amcheck/amcheck--1.1--1.2.sql | 19 +++ contrib/amcheck/amcheck.control | 2 +- contrib/amcheck/expected/check_btree.out | 5 +- contrib/amcheck/sql/check_btree.sql | 5 +- contrib/amcheck/verify_nbtree.c | 157 +++++++++++++++++++++-- doc/src/sgml/amcheck.sgml | 7 +- 7 files changed, 181 insertions(+), 16 deletions(-) create mode 100644 contrib/amcheck/amcheck--1.1--1.2.sql diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index c5764b544f..dcec3b8520 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,7 +4,7 @@ MODULE_big = amcheck OBJS = verify_nbtree.o $(WIN32RES) EXTENSION = amcheck -DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql +DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree diff --git a/contrib/amcheck/amcheck--1.1--1.2.sql b/contrib/amcheck/amcheck--1.1--1.2.sql new file mode 100644 index 0000000000..de7b657f2f --- /dev/null +++ b/contrib/amcheck/amcheck--1.1--1.2.sql @@ -0,0 +1,19 @@ +/* contrib/amcheck/amcheck--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit + +-- In order to avoid issues with dependencies when updating amcheck to 1.2, +-- create new, overloaded version of the 1.1 function signature + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean, relocate boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index 469048403d..c6e310046d 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index 1e6079ddd2..687fde8fce 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true); (1 row) -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and relocate +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -137,7 +138,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); bt_index_parent_check ----------------------- diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index 3f1e0d17ef..d33d3e6682 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; SELECT bt_index_parent_check('bttest_multi_idx', true); -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and relocate +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -89,7 +90,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); -- -- BUG #15597: must not assume consistent input toasting state when forming diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index a7d060b3ec..151c6d5fdb 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -74,6 +74,8 @@ typedef struct BtreeCheckState bool readonly; /* Also verifying heap has no unindexed tuples? */ bool heapallindexed; + /* Also relocating non-pivot tuples? */ + bool relocate; /* Per-page context */ MemoryContext targetcontext; /* Buffer access strategy */ @@ -123,10 +125,11 @@ PG_FUNCTION_INFO_V1(bt_index_check); PG_FUNCTION_INFO_V1(bt_index_parent_check); static void bt_index_check_internal(Oid indrelid, bool parentcheck, - bool heapallindexed); + bool heapallindexed, bool relocate); static inline void btree_index_checkable(Relation rel); static void bt_check_every_level(Relation rel, Relation heaprel, - bool heapkeyspace, bool readonly, bool heapallindexed); + bool heapkeyspace, bool readonly, bool heapallindexed, + bool relocate); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); @@ -139,6 +142,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static bool bt_relocate_from_root(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, @@ -176,7 +180,7 @@ bt_index_check(PG_FUNCTION_ARGS) if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); - bt_index_check_internal(indrelid, false, heapallindexed); + bt_index_check_internal(indrelid, false, heapallindexed, false); PG_RETURN_VOID(); } @@ -195,11 +199,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); bool heapallindexed = false; + bool relocate = false; - if (PG_NARGS() == 2) + if (PG_NARGS() >= 2) heapallindexed = PG_GETARG_BOOL(1); + if (PG_NARGS() == 3) + relocate = PG_GETARG_BOOL(2); - bt_index_check_internal(indrelid, true, heapallindexed); + bt_index_check_internal(indrelid, true, heapallindexed, relocate); PG_RETURN_VOID(); } @@ -208,7 +215,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS) * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ static void -bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, + bool relocate) { Oid heapid; Relation indrel; @@ -266,7 +274,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) /* Check index, possibly against table it is an index on */ heapkeyspace = _bt_heapkeyspace(indrel); bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, - heapallindexed); + heapallindexed, relocate); /* * Release locks early. That's ok here because nothing in the called @@ -337,7 +345,7 @@ btree_index_checkable(Relation rel) */ static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, - bool readonly, bool heapallindexed) + bool readonly, bool heapallindexed, bool relocate) { BtreeCheckState *state; Page metapage; @@ -361,6 +369,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heapkeyspace = heapkeyspace; state->readonly = readonly; state->heapallindexed = heapallindexed; + state->relocate = relocate; if (state->heapallindexed) { @@ -429,6 +438,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, } } + Assert(!state->relocate || state->readonly); + if (state->relocate && !state->heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("index \"%s\" does not support relocating tuples", + RelationGetRelationName(rel)), + errhint("Only indexes initialized on PostgreSQL 12 support relocation verification."))); + /* Create context for page */ state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, "amcheck context", @@ -921,6 +938,32 @@ bt_target_page_check(BtreeCheckState *state) if (offset_is_negative_infinity(topaque, offset)) continue; + /* + * Readonly callers may optionally relocate non-pivot tuples for + * heapkeyspace indexes. A new search starting from the root + * relocates every current entry in turn. + */ + if (state->relocate && P_ISLEAF(topaque) && + !bt_relocate_from_root(state, itup)) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid))); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("could not relocate tuple in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_minusinfkey(state->rel, itup); @@ -1525,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * internal pages. In more general terms, a negative infinity item is * only negative infinity with respect to the subtree that the page is * at the root of. + * + * See also: bt_relocate_from_root(), which can even detect transitive + * inconsistencies on cousin leaf pages. */ if (offset_is_negative_infinity(copaque, offset)) continue; @@ -1925,6 +1971,101 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Search for itup in index, starting from fast root page. itup must be a + * non-pivot tuple. This is only supported with heapkeyspace indexes, since + * we rely on having fully unique keys to relocate itup without visiting more + * than one page on each level, barring an interrupted page split, where we + * may have to move right. (A concurrent page split is impossible because + * caller must be readonly caller.) + * + * This routine can detect very subtle transitive consistency issues across + * more than one level of the tree. Leaf pages all have a high key (even the + * rightmost page has a conceptual positive infinity high key), but not a low + * key. Their downlink in parent is a lower bound, which along with the high + * key is almost enough to detect every possible inconsistency. A downlink + * separator key value won't always be available from parent, though, because + * the first items of internal pages are negative infinity items, truncated + * down to zero attributes during internal page splits. While it's true that + * bt_downlink_check() and the high key check can detect most imaginable key + * space problems, there are remaining problems it won't detect with non-pivot + * tuples in cousin leaf pages. Starting a search from the root for every + * existing leaf tuple detects small inconsistencies in upper levels of the + * tree that cannot be detected any other way. (Besides all this, it's + * probably a useful testing strategy to exhaustively verify that all + * non-pivot tuples can be relocated in the index using the same code paths as + * those used by index scans.) + * + * Alternative nbtree design that could be used perform "cousin verification" + * inexpensively/transitively (may make current approach clearer): + * + * A cousin leaf page has a lower bound that comes from its grandparent page + * rather than its parent page, as already discussed (note also that a "second + * cousin" leaf page gets its lower bound from its great-grandparent, and so + * on). Any pivot tuple in the root page after the first tuple (which is an + * "absolute" negative infinity tuple, since its leftmost on the level) should + * separate every leaf page into <= and > pages. Even with the current + * design, there should be an unbroken seam of identical-to-root-pivot high + * key separator values at the right edge of the <= pages, all the way down to + * (and including) the leaf level. Recall that page deletion won't delete the + * rightmost child of a leaf page unless that page is the only child, and it + * is deleting the parent page as well. + * + * If we didn't truncate the item at first/negative infinity offset to zero + * attributes during internal page splits then there would also be an unbroken + * seam of identical-to-root-pivot "low key" separator values on the left edge + * of the pages that are > the separator value; this alternative design would + * allow us to verify the same invariants directly, without ever having to + * cross more than one level of the tree (we'd still have to cross one level + * because leaf pages would still not store a low key directly, and we'd still + * need bitwise-equality cross checks of downlink separator in parent against + * the low keys in their non-leaf children). + */ +static bool +bt_relocate_from_root(BtreeCheckState *state, IndexTuple itup) +{ + BTScanInsert key; + BTStack stack; + Buffer lbuf; + bool exists; + + /* No need to use bt_mkscankey_minusinfkey() here */ + key = _bt_mkscankey(state->rel, itup); + Assert(key->heapkeyspace && key->scantid != NULL); + + /* + * Search from root. + * + * Ideally, we would arrange to only move right within _bt_search() when + * an interrupted page split is detected (i.e. when the incomplete split + * bit is found to be set), but for now we accept the possibility that + * that could conceal certain remaining inconsistencies. + */ + Assert(state->readonly && state->relocate); + exists = false; + stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL); + + if (BufferIsValid(lbuf)) + { + OffsetNumber offnum; + Page page; + + /* Get matching tuple on leaf page */ + offnum = _bt_binsrch(state->rel, key, lbuf); + /* Compare first >= matching item on leaf page, if any */ + page = BufferGetPage(lbuf); + if (offnum <= PageGetMaxOffsetNumber(page) && + _bt_compare(state->rel, key, page, offnum) == 0) + exists = true; + _bt_relbuf(state->rel, lbuf); + } + + _bt_freestack(stack); + pfree(key); + + return exists; +} + /* * Is particular offset within page (whose special state is passed by caller) * the page negative-infinity item? diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 8bb60d5c2d..c638456638 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10; - bt_index_parent_check(index regclass, heapallindexed boolean) returns void + bt_index_parent_check(index regclass, heapallindexed boolean, relocate boolean) returns void bt_index_parent_check @@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10; argument is true, the function verifies the presence of all heap tuples that should be found within the index, and that there are no missing downlinks in the index - structure. The checks that can be performed by + structure. When the optional relocate + argument is true, verification relocates + tuples on the leaf level by performing a new search from the + root page. The checks that can be performed by bt_index_parent_check are a superset of the checks that can be performed by bt_index_check. bt_index_parent_check can be thought of as -- 2.17.1