commit 0f1b12bc433f0db580d2af0c893a208f7105e4b0 Author: Alexander Korotkov Date: Thu Dec 6 01:04:33 2018 +0300 Improve checking for missing parent links Instead of collecting lossy bitmap, traverse from one downlink to subsequent using rightlinks. Intermediate pages are candidates to have lost parent downlinks. diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 55a3a4bbe0c..e9b1b779453 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -93,14 +93,19 @@ typedef struct BtreeCheckState /* Target page's LSN */ XLogRecPtr targetlsn; + /* + * Rightlink and incomplete split flag of previous block referenced by + * downlink. + */ + BlockNumber prevrightlink; + bool previncompletesplit; + /* * Mutable state, for optional heapallindexed verification: */ /* Bloom filter fingerprints B-Tree index */ bloom_filter *filter; - /* Bloom filter fingerprints downlink blocks within tree */ - bloom_filter *downlinkfilter; /* Right half of incomplete split marker */ bool rightsplit; /* Debug counter */ @@ -137,7 +142,10 @@ static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, BlockNumber childblock); -static void bt_downlink_missing_check(BtreeCheckState *state); +static void bt_downlink_connectivity_check(BtreeCheckState *state, + BlockNumber downlink, Page loaded_page, uint32 parent_level); +static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber targetblock, Page target); static void bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate); @@ -432,20 +440,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, errmsg("index \"%s\" cannot be verified using transaction snapshot", RelationGetRelationName(rel)))); } - else - { - /* - * Extra readonly downlink check. - * - * In readonly case, we know that there cannot be a concurrent - * page split or a concurrent page deletion, which gives us the - * opportunity to verify that every non-ignorable page had a - * downlink one level up. We must be tolerant of interrupted page - * splits and page deletions, though. This is taken care of in - * bt_downlink_missing_check(). - */ - state->downlinkfilter = bloom_create(total_pages, work_mem, seed); - } } Assert(!state->rootdescend || state->readonly); @@ -523,16 +517,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, IndexInfo *indexinfo = BuildIndexInfo(state->rel); TableScanDesc scan; - /* Report on extra downlink checks performed in readonly case */ - if (state->readonly) - { - ereport(DEBUG1, - (errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set", - RelationGetRelationName(rel), - 100.0 * bloom_prop_bits_set(state->downlinkfilter)))); - bloom_free(state->downlinkfilter); - } - /* * Create our own scan for table_index_build_scan(), rather than * getting it to do so for us. This is required so that we can @@ -635,6 +619,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) level.istruerootlevel ? " (true root level)" : level.level == 0 ? " (leaf level)" : ""); + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + do { /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ @@ -940,22 +927,24 @@ bt_target_page_check(BtreeCheckState *state) (uint32) state->targetlsn))); } - /* Fingerprint downlink blocks in heapallindexed + readonly case */ - if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque)) - { - BlockNumber childblock = BTreeInnerTupleGetDownLink(itup); - - bloom_add_element(state->downlinkfilter, - (unsigned char *) &childblock, - sizeof(BlockNumber)); - } - /* * Don't try to generate scankey using "negative infinity" item on * internal pages. They are always truncated to zero attributes. */ if (offset_is_negative_infinity(topaque, offset)) + { + /* + * Initialize downlink connectivity check if needed. + */ + if (!P_ISLEAF(topaque) && state->readonly) + { + bt_downlink_connectivity_check(state, + BTreeInnerTupleGetDownLink(itup), + NULL, + topaque->btpo.level); + } continue; + } /* * Readonly callers may optionally verify that non-pivot tuples can @@ -1239,12 +1228,15 @@ bt_target_page_check(BtreeCheckState *state) } /* - * * Check if page has a downlink in parent * - * - * This can only be checked in heapallindexed + readonly case. + * If we traversed the whole level to the rightmost page, there might be + * missing downlinks for the pages to the right of rightmost downlink. + * Check for them. */ - if (state->heapallindexed && state->readonly) - bt_downlink_missing_check(state); + if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + { + bt_downlink_connectivity_check(state, P_NONE, + NULL, topaque->btpo.level); + } } /* @@ -1461,6 +1453,115 @@ bt_right_page_check_scankey(BtreeCheckState *state) return bt_mkscankey_pivotsearch(state->rel, firstitup); } +/* + * Check connectivity of downlinks. Traverse rightlinks from previous downlink + * to the current one. Check that there are no intermediate pages with missing + * downlinks. + * + * If 'loaded_page' is given, it's assumed to be contents of 'downlink'. + */ +static void +bt_downlink_connectivity_check(BtreeCheckState *state, + BlockNumber downlink, + Page loaded_page, + uint32 parent_level) +{ + BlockNumber blkno = state->prevrightlink; + Page page; + BTPageOpaque opaque; + bool rightsplit = state->previncompletesplit; + bool first = true; + + /* + * If no previos rightlink is memorized, get it from current downlink for + * future usage. + */ + if (!BlockNumberIsValid(blkno)) + { + Assert(BlockNumberIsValid(downlink) && downlink != P_NONE); + + if (loaded_page) + page = loaded_page; + else + page = palloc_btree_page(state, downlink); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + state->prevrightlink = opaque->btpo_next; + state->previncompletesplit = P_INCOMPLETE_SPLIT(opaque); + return; + } + + while (true) + { + /* + * Did we traverse the whole tree level and this is check for pages to + * the right of rightmost downlink? + */ + if (blkno == P_NONE && downlink == P_NONE) + { + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + return; + } + + /* Did we traverse the whole tree level and don't find next downlink? */ + if (blkno == P_NONE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevrightlink, downlink, + RelationGetRelationName(state->rel)))); + + /* Load page contents */ + if (blkno == downlink && loaded_page) + page = loaded_page; + else + page = palloc_btree_page(state, blkno); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Check level for non-ignorable page */ + if (!P_IGNORE(opaque) && opaque->btpo.level != parent_level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block found while traversing rightlinks from downlink of index \"%s\" has invalid level", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, parent_level - 1, opaque->btpo.level))); + + /* Try to detect circular links */ + if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)))); + + if (!first && !P_IGNORE(opaque)) + { + /* blkno probably has missing parent downlink */ + bt_downlink_missing_check(state, rightsplit, blkno, page); + } + + rightsplit = P_INCOMPLETE_SPLIT(opaque); + + /* Exit if we already found next downlink */ + if (blkno == downlink) + { + state->prevrightlink = opaque->btpo_next; + state->previncompletesplit = rightsplit; + return; + } + + /* Traverse to the next page using rightlink */ + blkno = opaque->btpo_next; + + /* Free page contents if it's allocated by us */ + if (page != loaded_page) + pfree(page); + first = false; + } +} + /* * Checks one of target's downlink against its child page. * @@ -1478,6 +1579,7 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, OffsetNumber maxoffset; Page child; BTPageOpaque copaque; + BTPageOpaque topaque; /* * Caller must have ShareLock on target relation, because of @@ -1527,10 +1629,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * Check all items, rather than checking just the first and trusting that * the operator class obeys the transitive law. */ + topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); child = palloc_btree_page(state, childblock); copaque = (BTPageOpaque) PageGetSpecialPointer(child); maxoffset = PageGetMaxOffsetNumber(child); + /* + * Since we've already loaded the child block, combine this check with + * check for downlink connectivity. + */ + bt_downlink_connectivity_check(state, childblock, + child, topaque->btpo.level); + /* * Since there cannot be a concurrent VACUUM operation in readonly mode, * and since a page has no links within other pages (siblings and parent) @@ -1625,23 +1735,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * be concerned about concurrent page splits or page deletions. */ static void -bt_downlink_missing_check(BtreeCheckState *state) +bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber targetblock, Page target) { - BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(target); ItemId itemid; IndexTuple itup; Page child; BTPageOpaque copaque; uint32 level; BlockNumber childblk; + XLogRecPtr targetlsn; - Assert(state->heapallindexed && state->readonly); + Assert(state->readonly); Assert(!P_IGNORE(topaque)); /* No next level up with downlinks to fingerprint from the true root */ if (P_ISROOT(topaque)) return; + targetlsn = PageGetLSN(target); + /* * Incomplete (interrupted) page splits can account for the lack of a * downlink. Some inserting transaction should eventually complete the @@ -1663,26 +1777,20 @@ bt_downlink_missing_check(BtreeCheckState *state) * by design, so it shouldn't be taken to indicate corruption. See * _bt_pagedel() for full details. */ - if (state->rightsplit) + if (rightsplit) { ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, + targetblock, topaque->btpo.level, topaque->btpo_prev, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); return; } - /* Target's downlink is typically present in parent/fingerprinted */ - if (!bloom_lacks_element(state->downlinkfilter, - (unsigned char *) &state->targetblock, - sizeof(BlockNumber))) - return; - /* * Target is probably the "top parent" of a multi-level page deletion. * We'll need to descend the subtree to make sure that descendant pages @@ -1699,9 +1807,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg("leaf index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u page lsn=%X/%X.", - state->targetblock, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); /* Descend from the target page, which is an internal page */ elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", @@ -1729,7 +1837,7 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", - state->targetblock, childblk, + targetblock, childblk, level - 1, copaque->btpo.level))); level = copaque->btpo.level; @@ -1767,9 +1875,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg_internal("downlink to deleted leaf page found in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.", - state->targetblock, childblk, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, childblk, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); /* * Iff leaf page is half-dead, its high key top parent link should point @@ -1785,7 +1893,7 @@ bt_downlink_missing_check(BtreeCheckState *state) { itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); itup = (IndexTuple) PageGetItem(child, itemid); - if (BTreeTupleGetTopParent(itup) == state->targetblock) + if (BTreeTupleGetTopParent(itup) == targetblock) return; } @@ -1794,9 +1902,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + targetblock, topaque->btpo.level, + (uint32) (targetlsn >> 32), + (uint32) targetlsn))); } /*