From 32722e5837f6bbffb00f06a0ec5b92a6243a6e8a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 3 Mar 2022 18:13:15 -0800 Subject: [PATCH v5] Make heap pruning more robust. Follow-up to bugfix commit 18b87b20. Author: Peter Geoghegan Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAH2-WznNKY6ydUczuTXutVmb_dj3MnAcoaVYc8xyignWfNQ%3DFQ%40mail.gmail.com Backpatch: master only (no backpatch) --- src/backend/access/heap/pruneheap.c | 398 +++++++++++++++++----------- 1 file changed, 238 insertions(+), 160 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index b6500763a..81b8fd62f 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -58,23 +58,24 @@ typedef struct OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; - /* - * marked[i] is true if item i is entered in one of the above arrays. - * - * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is - * 1. Otherwise every access would need to subtract 1. - */ - bool marked[MaxHeapTuplesPerPage + 1]; - /* * Tuple visibility is only computed once for each tuple, for correctness * and efficiency reasons; see comment in heap_page_prune() for * details. This is of type int8[,] intead of HTSV_Result[], so we can use * -1 to indicate no visibility has been computed, e.g. for LP_DEAD items. * - * Same indexing as ->marked. + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. */ int8 htsv[MaxHeapTuplesPerPage + 1]; + + /* + * visited[i] is true if item i was already visited by second pass over + * page (when we decide which tuples constitute each HOT chain). + * + * Same indexing as ->htsv. 
+ */ + bool visited[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ @@ -84,6 +85,8 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate); +static int heap_prune_disconnected(Buffer buffer, OffsetNumber offnum, + PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); @@ -293,32 +296,23 @@ heap_page_prune(Relation relation, Buffer buffer, prstate.old_snap_xmin = old_snap_xmin; prstate.old_snap_ts = old_snap_ts; prstate.old_snap_used = false; - prstate.latestRemovedXid = InvalidTransactionId; - prstate.nredirected = prstate.ndead = prstate.nunused = 0; - memset(prstate.marked, 0, sizeof(prstate.marked)); maxoff = PageGetMaxOffsetNumber(page); tup.t_tableOid = RelationGetRelid(prstate.rel); /* - * Determine HTSV for all tuples. + * Determine HTSV for all tuples in first pass over page, and save it in + * prstate for later passes. Scan the page backwards (in reverse item + * offset number order). * - * This is required for correctness to deal with cases where running HTSV - * twice could result in different results (e.g. RECENTLY_DEAD can turn to - * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() - * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the - * inserting transaction aborts, ...). That in turn could cause - * heap_prune_chain() to behave incorrectly if a tuple is reached twice, - * once directly via a heap_prune_chain() and once following a HOT chain. - * - * It's also good for performance. Most commonly tuples within a page are - * stored at decreasing offsets (while the items are stored at increasing - * offsets). When processing all tuples on a page this leads to reading - * memory at decreasing offsets within a page, with a variable stride. 
- * That's hard for CPU prefetchers to deal with. Processing the items in - * reverse order (and thus the tuples in increasing order) increases - * prefetching efficiency significantly / decreases the number of cache - * misses. + * This approach is good for performance. Most commonly tuples within a + * page are stored at decreasing offsets (while the items are stored at + * increasing offsets). When processing all tuples on a page this leads + * to reading memory at decreasing offsets within a page, with a variable + * stride. That's hard for CPU prefetchers to deal with. Processing the + * items in reverse order (and thus the tuples in increasing order) + * increases prefetching efficiency significantly / decreases the number + * of cache misses. */ for (offnum = maxoff; offnum >= FirstOffsetNumber; @@ -350,30 +344,58 @@ heap_page_prune(Relation relation, Buffer buffer, buffer); } - /* Scan the page */ + /* + * Now scan the page a second time to process each HOT chain. This uses + * HTSV state saved by initial pass. + * + * It's possible that a few heap-only tuples will not get visited during + * this scan over the page. This happens when the tuples cannot be + * located by following a valid HOT chain. 
+ */ + prstate.latestRemovedXid = InvalidTransactionId; + prstate.nredirected = prstate.ndead = prstate.nunused = 0; + memset(prstate.visited, 0, sizeof(prstate.visited)); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { - ItemId itemid; - - /* Ignore items already processed as part of an earlier chain */ - if (prstate.marked[offnum]) + /* Ignore items already visited as part of an earlier chain */ + if (prstate.visited[offnum]) continue; - /* see preceding loop */ + /* see first scan/loop */ if (off_loc) *off_loc = offnum; - /* Nothing to do if slot is empty or already dead */ - itemid = PageGetItemId(page, offnum); - if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) - continue; - /* Process this item or chain of items */ ndeleted += heap_prune_chain(buffer, offnum, &prstate); } + /* + * Now scan the page a third and final time. This is where we process + * disconnected heap-only tuples missed earlier on. + * + * These disconnected heap-only tuples (which always originate in aborted + * transactions) are always considered DEAD (and always become LP_UNUSED). + * Delaying processing of these tuples until here enables processing of + * HOT chains as whole units (that are known to be self-consistent). + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + /* Ignore items already visited (only want disconnected tuples) */ + if (likely(prstate.visited[offnum])) + continue; + + /* see first scan/loop */ + if (off_loc) + *off_loc = offnum; + + /* Process this disconnected heap-only tuple */ + ndeleted += heap_prune_disconnected(buffer, offnum, &prstate); + } + /* Clear the offset information once we have processed the given page. */ if (off_loc) *off_loc = InvalidOffsetNumber; @@ -557,20 +579,22 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) /* - * Prune specified line pointer or a HOT chain originating at line pointer. 
+ * Prune HOT chain (or simple tuple) originating at specified line pointer. * - * If the item is an index-referenced tuple (i.e. not a heap-only tuple), - * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT - * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. - * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really - * DEAD, our visibility test is just too coarse to detect it. + * Used during second pass over the heap page (the root item pass). Won't + * process any heap-only tuples that cannot be found by traversing a HOT chain + * whose root item is at offset rootoffnum. Remaining "disconnected" + * heap-only tuples are dealt with in caller's third and final pass over the + * page instead. This is how we make sure that no DEAD tuples (or whole HOT + * chains) are missed during pruning. * * In general, pruning must never leave behind a DEAD tuple that still has * tuple storage. VACUUM isn't prepared to deal with that case. That's why * VACUUM prunes the same heap page a second time (without dropping its lock * in the interim) when it sees a newly DEAD tuple that we initially saw as - * in-progress. Retrying pruning like this can only happen when an inserting - * transaction concurrently aborts. + * in-progress. Retrying pruning like this can only happen due to certain + * edge-cases, like the case where an inserting transaction concurrently + * aborts. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. 
If all tuples in the chain are DEAD, the root line @@ -588,7 +612,6 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { - int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; @@ -596,63 +619,54 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) OffsetNumber latestdead = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; + bool pastlatestdead = false; OffsetNumber chainitems[MaxHeapTuplesPerPage]; - int nchain = 0, - i; + int nchain; + + Assert(!prstate->visited[rootoffnum]); rootlp = PageGetItemId(dp, rootoffnum); - /* - * If it's a heap-only tuple, then it is not the start of a HOT chain. - */ if (ItemIdIsNormal(rootlp)) { Assert(prstate->htsv[rootoffnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + * We'll process it later, either by traversing its HOT chain + * (starting from the root item), or in heap_prune_disconnected() call + * made during our third pass over page. + */ if (HeapTupleHeaderIsHeapOnly(htup)) - { - /* - * If the tuple is DEAD and doesn't chain to anything else, mark - * it unused immediately. (If it does chain, we can only remove - * it as part of pruning its chain.) - * - * We need this primarily to handle aborted HOT updates, that is, - * XMIN_INVALID heap-only tuples. Those might not be linked to by - * any chain, since the parent tuple might be re-updated before - * any pruning occurs. So we have to be able to reap them - * separately from chain-pruning. (Note that - * HeapTupleHeaderIsHotUpdated will never return true for an - * XMIN_INVALID tuple, so this code will work even when there were - * sequential updates within the aborted transaction.) 
- * - * Note that we might first arrive at a dead heap-only tuple - * either here or while following a chain below. Whichever path - * gets there first will mark the tuple unused. - */ - if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && - !HeapTupleHeaderIsHotUpdated(htup)) - { - heap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceLatestRemovedXid(htup, - &prstate->latestRemovedXid); - ndeleted++; - } + return 0; + } + else if (!ItemIdIsRedirected(rootlp)) + { + /* + * Nothing to do if slot cannot possibly be valid root item of HOT + * chain or a simple heap tuple + */ + Assert(ItemIdIsDead(rootlp) || !ItemIdIsUsed(rootlp)); + Assert(prstate->htsv[rootoffnum] == -1); + prstate->visited[rootoffnum] = true; - /* Nothing more to do */ - return ndeleted; - } + return 0; } - /* Start from the root tuple */ + /* + * Start from the root item. Mark it as valid up front, since root items + * are always processed here (not as disconnected tuples in third pass + * over page). + */ + prstate->visited[rootoffnum] = true; offnum = rootoffnum; + nchain = 0; /* while not end of the chain */ for (;;) { ItemId lp; - bool tupdead, - recent_dead; /* Sanity check (pure paranoia) */ if (offnum < FirstOffsetNumber) @@ -666,15 +680,11 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; /* If item is already processed, stop --- it must not be same chain */ - if (prstate->marked[offnum]) + if (nchain != 0 && prstate->visited[offnum]) break; lp = PageGetItemId(dp, offnum); - /* Unused item obviously isn't part of the chain */ - if (!ItemIdIsUsed(lp)) - break; - /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere @@ -689,13 +699,17 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) continue; } - /* - * Likewise, a dead line pointer can't be part of the chain. 
(We - * already eliminated the case of dead root tuple outside this - * function.) - */ - if (ItemIdIsDead(lp)) + /* LP_UNUSED or LP_DEAD items obviously not part of the chain */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + { + /* + * Can consider LP_UNUSED/LP_DEAD items visited when we arrive + * here via a t_ctid link; the item itself never has HTSV state + */ + Assert(prstate->htsv[offnum] == -1); + prstate->visited[offnum] = true; break; + } Assert(ItemIdIsNormal(lp)); Assert(prstate->htsv[offnum] != -1); @@ -709,33 +723,50 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; /* - * OK, this tuple is indeed a member of the chain. + * OK, this tuple is indeed a member of the chain. Stick with this + * interpretation for the entire prune operation by marking the item + * as visited now. */ + Assert((nchain == 0 && ItemIdIsNormal(rootlp)) || + HeapTupleHeaderIsHeapOnly(htup)); chainitems[nchain++] = offnum; + prstate->visited[offnum] = true; /* * Check tuple's visibility status. */ - tupdead = recent_dead = false; - switch ((HTSV_Result) prstate->htsv[offnum]) { case HEAPTUPLE_DEAD: - tupdead = true; + + /* + * Remember the offnum of the last DEAD tuple in this HOT + * chain. To keep things simple, don't treat heap-only tuples + * from a HOT chain as DEAD unless they're only preceded by + * other DEAD tuples (in addition to actually being DEAD). + * Remaining tuples that appear DEAD (but don't get treated as + * such by us) are from concurrently aborting updaters. + * + * VACUUM will ask us to prune the heap page again when it + * sees that there is a DEAD tuple left behind, but that would + * be necessary regardless of our approach here. 
+ */ + if (!pastlatestdead) + { + latestdead = offnum; + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + } + else + { + /* Deliberately don't call heap_prune_record_prunable() */ + } + break; case HEAPTUPLE_RECENTLY_DEAD: - recent_dead = true; - - /* - * This tuple may soon become DEAD. Update the hint field so - * that the page is reconsidered for pruning in future. - */ - heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: + pastlatestdead = true; /* no further DEAD tuples in CHAIN */ /* * This tuple may soon become DEAD. Update the hint field so @@ -747,6 +778,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: + pastlatestdead = true; /* no further DEAD tuples in CHAIN */ /* * If we wanted to optimize for aborts, we might consider @@ -761,25 +793,12 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; } - /* - * Remember the last DEAD tuple seen. We will advance past - * RECENTLY_DEAD tuples just in case there's a DEAD one after them; - * but we can't advance past anything else. We have to make sure that - * we don't miss any DEAD tuples, since DEAD tuples that still have - * tuple storage after pruning will confuse VACUUM. - */ - if (tupdead) - { - latestdead = offnum; - HeapTupleHeaderAdvanceLatestRemovedXid(htup, - &prstate->latestRemovedXid); - } - else if (!recent_dead) - break; - /* * If the tuple is not HOT-updated, then we are at the end of this - * HOT-update chain. + * HOT-update chain. There might actually be more tuples that were + * considered part of the same HOT chain in the past, before the + * updater's xact aborted. We'll process any such tuples later on + * instead, inside heap_prune_disconnected(). 
*/ if (!HeapTupleHeaderIsHotUpdated(htup)) break; @@ -797,17 +816,29 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) } /* - * If we found a DEAD tuple in the chain, adjust the HOT chain so that all - * the DEAD tuples at the start of the chain are removed and the root line - * pointer is appropriately redirected. + * Should never find an LP_REDIRECT root item that didn't already point to + * a valid item. While a heap-only tuple's t_ctid link can legitimately + * point to almost anything, the rules for LP_REDIRECT links are far + * stricter: LP_REDIRECTs must _always_ point to a valid heap-only tuple. */ + Assert(ItemIdIsNormal(rootlp) || + (ItemIdIsRedirected(rootlp) && nchain >= 2)); + if (OffsetNumberIsValid(latestdead)) { + int i, + ndeleted = 0; + /* - * Mark as unused each intermediate item that we are able to remove - * from the chain. + * Okay, at least one tuple from the chain (or the single plain heap + * tuple) is considered DEAD. Record what to do with items in the + * chain now. * - * When the previous item is the last dead tuple seen, we are at the + * First deal with the non-root items from HOT chain. Mark earlier + * items we consider DEAD as LP_UNUSED (since they're heap-only + * tuples). + * + * When the previous item is the last DEAD tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) @@ -817,36 +848,91 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) } /* - * If the root entry had been a normal tuple, we are deleting it, so - * count it in the result. But changing a redirect (even to DEAD - * state) doesn't count. + * If the root item is a normal tuple, we are logically deleting it, + * so count it in the result. But changing an LP_REDIRECT (even to + * make it LP_DEAD) doesn't get counted in ndeleted -- that would + * amount to double-counting DEAD tuples (with tuple storage) in + * ndeleted. 
*/ if (ItemIdIsNormal(rootlp)) ndeleted++; /* + * Finally, consider what to do with the root item itself. + * * If the DEAD tuple is at the end of the chain, the entire chain is - * dead and the root line pointer can be marked dead. Otherwise just - * redirect the root to the correct chain member. + * considered DEAD. The root item must therefore become LP_DEAD. + * Otherwise just redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); - } - else if (nchain < 2 && ItemIdIsRedirected(rootlp)) - { - /* - * We found a redirect item that doesn't point to a valid follow-on - * item. This can happen if the loop in heap_page_prune caused us to - * visit the dead successor of a redirect item before visiting the - * redirect item. We can clean up by setting the redirect item to - * DEAD state. - */ - heap_prune_record_dead(prstate, rootoffnum); + + return ndeleted; } - return ndeleted; + return 0; +} + +/* + * Handle disconnected heap-only tuples during third and final pass over page. + * We always expect to process these tuples as DEAD tuples here. Since + * they're heap-only tuples it follows that they'll always be set LP_UNUSED. + * + * This is how we handle aborted heap-only tuples that were not visited in our + * second pass (via HOT chain traversal with the usual cross-checks). These + * tuples occur when a parent tuple is updated, the updater aborts, and some + * unrelated updater re-updates the original parent tuple again. The parent's + * t_ctid link won't continue to point to the aborted tuple. (Even when it + * does, we won't consider the parent to have been HOT updated, just because + * its XMAX aborted -- so we still end up here for the aborted tuple). + * + * Like heap_prune_chain, we don't actually change the page here. + * + * Returns the number of tuples (to be) deleted from the page, though this + * should always be 1 in practice. 
+*/ +static int +heap_prune_disconnected(Buffer buffer, OffsetNumber offnum, + PruneState *prstate) +{ + Page dp = (Page) BufferGetPage(buffer); + ItemId lp; + HeapTupleHeader htup; + + lp = PageGetItemId(dp, offnum); + Assert(ItemIdIsNormal(lp)); + Assert(prstate->htsv[offnum] != -1); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Caller must make sure that the tuple at 'offnum' is in fact a heap-only + * tuple that is disconnected from its HOT chain + */ + Assert(!prstate->visited[offnum]); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + + /* + * We expect that disconnected heap-only tuples must be from aborted + * transactions. They must already be DEAD, or something is amiss. + */ + if (likely((HTSV_Result) prstate->htsv[offnum] == HEAPTUPLE_DEAD)) + { + heap_prune_record_unused(prstate, offnum); + + /* Unnecessary, but be conservative here */ + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + return 1; + } + + /* + * Should always be DEAD. A DEAD heap-only tuple is always counted in + * top-level ndeleted counter for pruning operation. 
+ */ + Assert(false); + return 0; } /* Record lowest soon-prunable XID */ @@ -872,10 +958,6 @@ heap_prune_record_redirect(PruneState *prstate, prstate->redirected[prstate->nredirected * 2] = offnum; prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; prstate->nredirected++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; - Assert(!prstate->marked[rdoffnum]); - prstate->marked[rdoffnum] = true; } /* Record line pointer to be marked dead */ @@ -885,8 +967,6 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) Assert(prstate->ndead < MaxHeapTuplesPerPage); prstate->nowdead[prstate->ndead] = offnum; prstate->ndead++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } /* Record line pointer to be marked unused */ @@ -896,8 +976,6 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) Assert(prstate->nunused < MaxHeapTuplesPerPage); prstate->nowunused[prstate->nunused] = offnum; prstate->nunused++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } -- 2.30.2