From a82c9aee92a90bd7a33f861103b5dcb9ca39d56a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 8 Nov 2021 22:58:14 -0800 Subject: [PATCH v2] Fix aborted HOT update bug in heap pruning. Author: Peter Geoghegan Reported-By: Alexander Lakhin Diagnosed-By: Andres Freund Bug: #17255 Discussion: https://postgr.es/m/17255-14c0ac58d0f9b583@postgresql.org Backpatch: ??? --- src/backend/access/heap/pruneheap.c | 290 ++++++++++++++++++---------- 1 file changed, 187 insertions(+), 103 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index c7331d810..31fa4d2a3 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -31,6 +31,7 @@ typedef struct { Relation rel; + BlockNumber targetblkno; /* tuple visibility test, initialized for the relation */ GlobalVisState *vistest; @@ -56,14 +57,16 @@ typedef struct OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; - /* marked[i] is true if item i is entered in one of the above arrays */ - bool marked[MaxHeapTuplesPerPage + 1]; + /* fromvalidchain[i] is true if item i is from a known valid HOT chain */ + bool fromvalidchain[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate); +static int heap_prune_heaponly(Buffer buffer, OffsetNumber offnum, + PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); @@ -242,13 +245,14 @@ heap_page_prune(Relation relation, Buffer buffer, */ prstate.new_prune_xid = InvalidTransactionId; prstate.rel = relation; + prstate.targetblkno = BufferGetBlockNumber(buffer); prstate.vistest = vistest; prstate.old_snap_xmin = old_snap_xmin; prstate.old_snap_ts = old_snap_ts; prstate.old_snap_used = false; prstate.latestRemovedXid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; - memset(prstate.marked, 0, sizeof(prstate.marked)); + memset(prstate.fromvalidchain, 0, sizeof(prstate.fromvalidchain)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); @@ -256,10 +260,8 @@ heap_page_prune(Relation relation, Buffer buffer, offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { - ItemId itemid; - /* Ignore items already processed as part of an earlier chain */ - if (prstate.marked[offnum]) + if (prstate.fromvalidchain[offnum]) continue; /* @@ -269,15 +271,29 @@ heap_page_prune(Relation relation, Buffer buffer, if (off_loc) *off_loc = offnum; - /* Nothing to do if slot is empty or already dead */ - itemid = PageGetItemId(page, offnum); - if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) - continue; - /* Process this item or chain of items */ ndeleted += heap_prune_chain(buffer, offnum, &prstate); } + /* + * Scan the page again, processing any heap-only tuples that we now know + * are not part of any valid HOT chain + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + /* Ignore items already processed as part of a known valid chain */ + if (prstate.fromvalidchain[offnum]) + continue; + + if (off_loc) + *off_loc = offnum; + + /* Process this disconnected heap-only tuple */ + ndeleted += heap_prune_heaponly(buffer, offnum, &prstate); + } + /* Clear the offset information once we have processed the given page. */ if (off_loc) *off_loc = InvalidOffsetNumber; @@ -383,19 +399,8 @@ heap_page_prune(Relation relation, Buffer buffer, pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); /* - * XXX Should we update the FSM information of this page ? - * - * There are two schools of thought here. We may not want to update FSM - * information so that the page is not used for unrelated UPDATEs/INSERTs - * and any free space in this page will remain available for further - * UPDATEs in *this* page, thus improving chances for doing HOT updates. - * - * But for a large table and where a page does not receive further UPDATEs - * for a long time, we might waste this space by not updating the FSM - * information. The relation may get extended and fragmented further. - * - * One possibility is to leave "fillfactor" worth of space in this page - * and update FSM with the remaining space. + * Don't update the FSM information on this page. We leave it up to our + * caller to decide what to do about that. */ return ndeleted; @@ -521,73 +526,70 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; - HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; OffsetNumber chainitems[MaxHeapTuplesPerPage]; - int nchain = 0, - i; - HeapTupleData tup; - - tup.t_tableOid = RelationGetRelid(prstate->rel); + int nchain; + bool rootisredirect PG_USED_FOR_ASSERTS_ONLY = false; rootlp = PageGetItemId(dp, rootoffnum); - /* - * If it's a heap-only tuple, then it is not the start of a HOT chain. - */ if (ItemIdIsNormal(rootlp)) { + HeapTupleHeader htup; + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); - tup.t_data = htup; - tup.t_len = ItemIdGetLength(rootlp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum); - + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + * We should process it by finding it through its HOT chain (during a + * later call that starts at the HOT chain's root item). + * + * Sometimes that won't happen because the heap-only tuple will be + * disconnected from its HOT chain following a HOT update whose xact + * aborts. These heap-only tuples are processed during a later call + * to heap_prune_heaponly() instead. + */ if (HeapTupleHeaderIsHeapOnly(htup)) - { - /* - * If the tuple is DEAD and doesn't chain to anything else, mark - * it unused immediately. (If it does chain, we can only remove - * it as part of pruning its chain.) - * - * We need this primarily to handle aborted HOT updates, that is, - * XMIN_INVALID heap-only tuples. Those might not be linked to by - * any chain, since the parent tuple might be re-updated before - * any pruning occurs. So we have to be able to reap them - * separately from chain-pruning. (Note that - * HeapTupleHeaderIsHotUpdated will never return true for an - * XMIN_INVALID tuple, so this code will work even when there were - * sequential updates within the aborted transaction.) - * - * Note that we might first arrive at a dead heap-only tuple - * either here or while following a chain below. Whichever path - * gets there first will mark the tuple unused. - */ - if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) - == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) - { - heap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceLatestRemovedXid(htup, - &prstate->latestRemovedXid); - ndeleted++; - } + return 0; + } + else if (!ItemIdIsRedirected(rootlp)) + { + /* + * Nothing to do if slot cannot possibly be valid root item of HOT + * chain or a simple heap tuple + */ + Assert(ItemIdIsDead(rootlp) || !ItemIdIsUsed(rootlp)); + prstate->fromvalidchain[rootoffnum] = true; - /* Nothing more to do */ - return ndeleted; - } + return 0; + } + else + { + /* + * Remember that root item is an LP_REDIRECT item, not a plain + * (not-heap-only) tuple item + */ + rootisredirect = true; } - /* Start from the root tuple */ + /* + * Start from the root item. Mark it as valid up front, since root items + * are always processed up front, in first pass over page. + */ + Assert(!prstate->fromvalidchain[rootoffnum]); + prstate->fromvalidchain[rootoffnum] = true; offnum = rootoffnum; + nchain = 0; /* while not end of the chain */ for (;;) { ItemId lp; - bool tupdead, - recent_dead; + HeapTupleHeader htup; + HeapTupleData tup; + bool tupdead; /* Sanity check (pure paranoia) */ if (offnum < FirstOffsetNumber) @@ -601,15 +603,11 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; /* If item is already processed, stop --- it must not be same chain */ - if (prstate->marked[offnum]) + if (nchain != 0 && prstate->fromvalidchain[offnum]) break; lp = PageGetItemId(dp, offnum); - /* Unused item obviously isn't part of the chain */ - if (!ItemIdIsUsed(lp)) - break; - /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere @@ -619,42 +617,63 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { if (nchain > 0) break; /* not at start of chain */ + Assert(rootisredirect); chainitems[nchain++] = offnum; offnum = ItemIdGetRedirect(rootlp); continue; } - /* - * Likewise, a dead line pointer can't be part of the chain. (We - * already eliminated the case of dead root tuple outside this - * function.) - */ - if (ItemIdIsDead(lp)) + /* LP_UNUSED or LP_DEAD items obviously not part of the chain */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + { + /* + * XXX What if we just came from root item, which is a plain heap + * tuple? Do we need assert that notices when we reach an LP_DEAD + * or LP_UNUSED item having started from such a root item? + */ + Assert(!rootisredirect || nchain > 1); break; + } Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); - tup.t_data = htup; tup.t_len = ItemIdGetLength(lp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); + tup.t_tableOid = RelationGetRelid(prstate->rel); + tup.t_data = htup; + ItemPointerSet(&(tup.t_self), prstate->targetblkno, offnum); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + { + Assert(!rootisredirect || nchain > 1); break; + } /* - * OK, this tuple is indeed a member of the chain. + * Must be a heap-only tuple unless it's the root item, and the root + * item happens to be a tuple with storage. The XMIN/XMAX cross-check + * would have failed otherwise, since priorXmax must have come from a + * HOT updated tuple. + */ + Assert((nchain == 0 && !rootisredirect) || + HeapTupleHeaderIsHeapOnly(htup)); + + /* + * OK, this tuple is indeed a member of the chain. Commit to this + * interpretation by marking the item as found while traversing a + * chain. */ chainitems[nchain++] = offnum; + prstate->fromvalidchain[offnum] = true; /* * Check tuple's visibility status. */ - tupdead = recent_dead = false; + tupdead = false; switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) { @@ -663,7 +682,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; case HEAPTUPLE_RECENTLY_DEAD: - recent_dead = true; /* * This tuple may soon become DEAD. Update the hint field so @@ -679,6 +697,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ + Assert(!tupdead); heap_prune_record_prunable(prstate, HeapTupleHeaderGetUpdateXid(htup)); break; @@ -692,6 +711,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * But we don't. See related decisions about when to mark the * page prunable in heapam.c. */ + Assert(!tupdead); break; default: @@ -700,11 +720,18 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) } /* - * Remember the last DEAD tuple seen. We will advance past - * RECENTLY_DEAD tuples just in case there's a DEAD one after them; - * but we can't advance past anything else. We have to make sure that - * we don't miss any DEAD tuples, since DEAD tuples that still have - * tuple storage after pruning will confuse VACUUM. + * Remember the last DEAD tuple seen from this HOT chain. + * + * We expect to sometimes find a RECENTLY_DEAD tuple after a DEAD + * tuple. When this happens, the RECENTLY_DEAD tuple will be treated + * as if it was DEAD all along. Manage that now. + * + * We're not just interested in DEAD and RECENTLY_DEAD tuples. We + * need to traverse each and every HOT chain exhaustively, in order to + * determine which heap-only tuples are part of a valid HOT chain. + * Heap-only tuples that cannot be located like this must not be part + * of a valid HOT chain. They are therefore processed during our + * second pass over the page. */ if (tupdead) { @@ -712,8 +739,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) HeapTupleHeaderAdvanceLatestRemovedXid(htup, &prstate->latestRemovedXid); } - else if (!recent_dead) - break; /* * If the tuple is not HOT-updated, then we are at the end of this @@ -729,7 +754,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == - BufferGetBlockNumber(buffer)); + prstate->targetblkno); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup); } @@ -741,6 +766,8 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) */ if (OffsetNumberIsValid(latestdead)) { + int i; + /* * Mark as unused each intermediate item that we are able to remove * from the chain. @@ -757,10 +784,13 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) /* * If the root entry had been a normal tuple, we are deleting it, so * count it in the result. But changing a redirect (even to DEAD - * state) doesn't count. + * state) doesn't get counted in ndeleted. */ if (ItemIdIsNormal(rootlp)) + { + Assert(!rootisredirect); ndeleted++; + } /* * If the DEAD tuple is at the end of the chain, the entire chain is @@ -787,6 +817,68 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) return ndeleted; } +/* + * Handle disconnected heap-only tuples during second pass over page. We + * always expect to mark these tuples as DEAD here. + * + * We need this to handle aborted HOT updates. We only do this for heap-only + * tuples that are not reachable by traversing any valid HOT chain. This + * happens when the parent tuple was re-updated (after earlier aborted + * update). The re-updating session is under no obligation to clean up the + * aborted tuple itself, and couldn't do so reliably if it wanted to (it would + * have to first acquire a super-exclusive lock). + * + * Pruning must never miss any DEAD tuples. DEAD tuples that still have tuple + * storage after pruning will confuse VACUUM. +*/ +static int +heap_prune_heaponly(Buffer buffer, OffsetNumber offnum, PruneState *prstate) +{ + Page dp = (Page) BufferGetPage(buffer); + ItemId lp; + HeapTupleHeader htup; + HeapTupleData tup; + HTSV_Result res; + + lp = PageGetItemId(dp, offnum); + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Caller must make sure that the tuple at 'offnum' is in fact a heap-only + * tuple that is disconnected from its HOT chain (i.e. isn't reachable + * through a HOT traversal that starts at any plausible-looking root item + * on the page). + */ + Assert(!prstate->fromvalidchain[offnum]); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + + /* + * We expect that disconnected heap-only tuples must be from aborted + * transactions. Any RECENTLY_DEAD tuples we see here are really DEAD, + * but the heap_prune_satisfies_vacuum test is too coarse to detect it. + */ + tup.t_len = ItemIdGetLength(lp); + tup.t_tableOid = RelationGetRelid(prstate->rel); + tup.t_data = htup; + ItemPointerSet(&(tup.t_self), prstate->targetblkno, offnum); + res = heap_prune_satisfies_vacuum(prstate, &tup, buffer); + if (res == HEAPTUPLE_DEAD || res == HEAPTUPLE_RECENTLY_DEAD) + { + heap_prune_record_unused(prstate, offnum); + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + } + else + Assert(false); + + /* + * Should always be DEAD. A DEAD heap-only tuple is always counted in + * top-level ndeleted counter for pruning operation. + */ + return 1; +} + /* Record lowest soon-prunable XID */ static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid) @@ -810,10 +902,6 @@ heap_prune_record_redirect(PruneState *prstate, prstate->redirected[prstate->nredirected * 2] = offnum; prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; prstate->nredirected++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; - Assert(!prstate->marked[rdoffnum]); - prstate->marked[rdoffnum] = true; } /* Record line pointer to be marked dead */ @@ -823,8 +911,6 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) Assert(prstate->ndead < MaxHeapTuplesPerPage); prstate->nowdead[prstate->ndead] = offnum; prstate->ndead++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } /* Record line pointer to be marked unused */ @@ -834,8 +920,6 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) Assert(prstate->nunused < MaxHeapTuplesPerPage); prstate->nowunused[prstate->nunused] = offnum; prstate->nunused++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } -- 2.30.2