From 134bd550bd7cb8c182fe3a28789705be5bf8785a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Fri, 11 Mar 2022 19:16:02 -0800
Subject: [PATCH v10 2/3] Generalize how VACUUM skips all-frozen pages.

Non-aggressive VACUUMs were at a gratuitous disadvantage (relative to
aggressive VACUUMs) around advancing relfrozenxid before now.  The
underlying issue was that lazy_scan_heap conditioned its skipping
behavior on whether or not the current VACUUM was aggressive.  VACUUM
could fail to increment its frozenskipped_pages counter as a result,
and so could miss out on advancing relfrozenxid for no good reason.
The approach taken during aggressive VACUUMs avoided the problem, but
that only worked in the aggressive case.

Fix the issue by generalizing how we skip all-frozen pages: remember
whether a range of skippable pages consists only of all-frozen pages
as we're initially establishing the range of skippable pages.  If we
decide to skip the range of pages, and if the range as a whole is not
an all-frozen range, remember that fact for later (this makes it
unsafe to advance relfrozenxid).  We no longer need to recheck any
pages using the visibility map.  We no longer directly track
frozenskipped_pages at all.  And we no longer need ad-hoc
VM_ALL_VISIBLE()/VM_ALL_FROZEN() calls for pages from a range of
blocks that we already decided were safe to skip.

The issue is subtle.  Before now, the non-aggressive case always had
to recheck the visibility map at the point of actually skipping each
page.  This created a window for some other session to concurrently
unset the same heap page's bit in the visibility map.  If the bit was
unset at exactly the wrong time, then the non-aggressive case would
conservatively conclude that the page was _never_ all-frozen on
recheck.  And so frozenskipped_pages would not be incremented for the
page.  lazy_scan_heap had already "committed" to skipping the page at
that point, though, which was enough to make it unsafe to advance
relfrozenxid/relminmxid later on.

It's possible that this issue hardly ever came up in practice.  It's
hard to be sure either way.  We only had to be unlucky once to lose
out on advancing relfrozenxid -- a single affected heap page was
enough to throw VACUUM off.  That seems like something to avoid on
general principle.  This is similar to an issue addressed by commit
44fa8488, which taught vacuumlazy.c to not give up on non-aggressive
relfrozenxid advancement just because a cleanup lock wasn't
immediately available on some heap page.

Also refactor the mechanism that disables skipping using the
visibility map during VACUUM(DISABLE_PAGE_SKIPPING).  Our old
approach made VACUUM behave as if there were no pages with VM bits
set.  Our new approach has VACUUM set up a range of pages in the
usual way, without actually going through with skipping the range in
the end.  This has the advantage of making
VACUUM(DISABLE_PAGE_SKIPPING) apply standard cross checks that report
on visibility map corruption via WARNINGs.
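
For review purposes, a condensed sketch of the control flow that this
patch gives lazy_scan_heap (illustrative only; per-page processing,
progress reporting, and the last-page special case are elided -- see
the full diff below for the real code):

    /* Establish the first skippable range up front */
    next_unskippable_block = lazy_scan_skip_range(vacrel, &vmbuffer, 0,
                                                  &all_visible_next_unskippable,
                                                  &all_frozen_skippable_range);
    skipping_range = (vacrel->skipwithvm &&
                      next_unskippable_block >= SKIP_PAGES_THRESHOLD);

    for (blkno = 0; blkno < rel_pages; blkno++)
    {
        if (blkno == next_unskippable_block)
        {
            /* Can't skip this block; set up the next skippable range */
            next_unskippable_block =
                lazy_scan_skip_range(vacrel, &vmbuffer, blkno + 1,
                                     &all_visible_next_unskippable,
                                     &all_frozen_skippable_range);
            skipping_range = (vacrel->skipwithvm &&
                              next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD);
        }
        else if (skipping_range)
        {
            /* Skipping; an all-visible-only range blocks relfrozenxid advancement */
            if (!all_frozen_skippable_range)
                vacrel->skippedallvis = true;
            continue;
        }

        /* ... scan and process the page as usual ... */
    }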
Author: Peter Geoghegan
Discussion: https://postgr.es/m/CAH2-Wzn6bGJGfOy3zSTJicKLw99PHJeSOQBOViKjSCinaxUKDQ@mail.gmail.com
---
 src/backend/access/heap/vacuumlazy.c | 298 ++++++++++++++-------------
 1 file changed, 158 insertions(+), 140 deletions(-)

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 9f5178e0a..3bc75d401 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -176,6 +176,8 @@ typedef struct LVRelState
 	/* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */
 	TransactionId NewRelfrozenXid;
 	MultiXactId NewRelminMxid;
+	/* Have we skipped any all-visible (not all-frozen) pages? */
+	bool		skippedallvis;
 
 	/* Error reporting state */
 	char	   *relnamespace;
@@ -196,7 +198,6 @@ typedef struct LVRelState
 	VacDeadItems *dead_items;	/* TIDs whose index tuples we'll delete */
 	BlockNumber rel_pages;		/* total number of pages */
 	BlockNumber scanned_pages;	/* # pages examined (not skipped via VM) */
-	BlockNumber frozenskipped_pages;	/* # frozen pages skipped via VM */
 	BlockNumber removed_pages;	/* # pages removed by relation truncation */
 	BlockNumber lpdead_item_pages;	/* # pages with LP_DEAD items */
 	BlockNumber missed_dead_pages;	/* # pages with missed dead tuples */
@@ -247,6 +248,10 @@ typedef struct LVSavedErrInfo
 
 /* non-export function prototypes */
 static void lazy_scan_heap(LVRelState *vacrel, int nworkers);
+static BlockNumber lazy_scan_skip_range(LVRelState *vacrel, Buffer *vmbuffer,
+										BlockNumber next_unskippable_block,
+										bool *all_visible_next_unskippable,
+										bool *all_frozen_skippable_range);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 								   BlockNumber blkno, Page page,
 								   bool sharelock, Buffer vmbuffer);
@@ -471,7 +476,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 
 	/* Initialize page counters explicitly (be tidy) */
 	vacrel->scanned_pages = 0;
-	vacrel->frozenskipped_pages = 0;
 	vacrel->removed_pages = 0;
 	vacrel->lpdead_item_pages = 0;
 	vacrel->missed_dead_pages = 0;
@@ -518,6 +522,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	/* Initialize state used to track oldest extant XID/XMID */
 	vacrel->NewRelfrozenXid = OldestXmin;
 	vacrel->NewRelminMxid = OldestMxact;
+	/* Cannot advance relfrozenxid when we skipped all-visible pages */
+	vacrel->skippedallvis = false;
 
 	/*
 	 * Call lazy_scan_heap to perform all required heap pruning, index
@@ -575,7 +581,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * to an XID that is either older or newer than FreezeLimit (same applies
 	 * to relminmxid and MultiXactCutoff).
 	 */
-	if (vacrel->scanned_pages + vacrel->frozenskipped_pages < orig_rel_pages)
+	if (vacrel->skippedallvis)
 	{
 		/* Cannot advance relfrozenxid/relminmxid */
 		Assert(!aggressive);
@@ -587,8 +593,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	}
 	else
 	{
-		Assert(vacrel->scanned_pages + vacrel->frozenskipped_pages ==
-			   orig_rel_pages);
 		Assert(!aggressive ||
 			   TransactionIdPrecedesOrEquals(FreezeLimit,
 											 vacrel->NewRelfrozenXid));
@@ -842,7 +846,9 @@ lazy_scan_heap(LVRelState *vacrel, int nworkers)
 				next_failsafe_block,
 				next_fsm_block_to_vacuum;
 	Buffer		vmbuffer = InvalidBuffer;
-	bool		skipping_blocks;
+	bool		skipping_range,
+				all_visible_next_unskippable,
+				all_frozen_skippable_range;
 	const int	initprog_index[] = {
 		PROGRESS_VACUUM_PHASE,
 		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
@@ -874,167 +880,85 @@ lazy_scan_heap(LVRelState *vacrel, int nworkers)
 	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
 
 	/*
-	 * Set things up for skipping blocks using visibility map.
-	 *
-	 * Except when vacrel->aggressive is set, we want to skip pages that are
-	 * all-visible according to the visibility map, but only when we can skip
-	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
-	 * sequentially, the OS should be doing readahead for us, so there's no
-	 * gain in skipping a page now and then; that's likely to disable
-	 * readahead and so be counterproductive.  Also, skipping even a single
-	 * page means that we can't update relfrozenxid, so we only want to do it
-	 * if we can skip a goodly number of pages.
-	 *
-	 * When vacrel->aggressive is set, we can't skip pages just because they
-	 * are all-visible, but we can still skip pages that are all-frozen, since
-	 * such pages do not need freezing and do not affect the value that we can
-	 * safely set for relfrozenxid or relminmxid.
+	 * Set up an initial range of blocks to skip via the visibility map.
 	 *
 	 * Before entering the main loop, establish the invariant that
 	 * next_unskippable_block is the next block number >= blkno that we can't
-	 * skip based on the visibility map, either all-visible for a regular scan
-	 * or all-frozen for an aggressive scan.  We set it to rel_pages when
-	 * there's no such block.  We also set up the skipping_blocks flag
-	 * correctly at this stage.
-	 *
-	 * Note: The value returned by visibilitymap_get_status could be slightly
-	 * out-of-date, since we make this test before reading the corresponding
-	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
-	 * that the page is all-visible or all-frozen when in fact the flag's just
-	 * been cleared, we might fail to vacuum the page.  It's easy to see that
-	 * skipping a page when aggressive is not set is not a very big deal; we
-	 * might leave some dead tuples lying around, but the next vacuum will
-	 * find them.  But even when aggressive *is* set, it's still OK if we miss
-	 * a page whose all-frozen marking has just been cleared.  Any new XIDs
-	 * just added to that page are necessarily >= vacrel->OldestXmin, and so
-	 * they cannot invalidate NewRelfrozenXid tracking.  A similar argument
-	 * applies for NewRelminMxid tracking and OldestMxact.
+	 * skip based on the visibility map.
 	 */
-	next_unskippable_block = 0;
-	if (vacrel->skipwithvm)
-	{
-		while (next_unskippable_block < rel_pages)
-		{
-			uint8		vmstatus;
+	next_unskippable_block = lazy_scan_skip_range(vacrel, &vmbuffer, 0,
+												  &all_visible_next_unskippable,
+												  &all_frozen_skippable_range);
 
-			vmstatus = visibilitymap_get_status(vacrel->rel,
-												next_unskippable_block,
-												&vmbuffer);
-			if (vacrel->aggressive)
-			{
-				if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
-					break;
-			}
-			else
-			{
-				if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
-					break;
-			}
-			vacuum_delay_point();
-			next_unskippable_block++;
-		}
-	}
-
-	if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
-		skipping_blocks = true;
-	else
-		skipping_blocks = false;
+	/*
+	 * Decide whether or not we'll actually skip the first skippable range.
+	 *
+	 * We want to skip pages that are all-visible according to the visibility
+	 * map (or all-frozen in the aggressive case), but only when we can skip
+	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
+	 * sequentially, the OS should be doing readahead for us, so there's no
+	 * gain in skipping a page now and then; that's likely to disable
+	 * readahead and so be counterproductive.
+	 */
+	skipping_range = (vacrel->skipwithvm &&
+					  next_unskippable_block >= SKIP_PAGES_THRESHOLD);
 
 	for (blkno = 0; blkno < rel_pages; blkno++)
 	{
 		Buffer		buf;
 		Page		page;
-		bool		all_visible_according_to_vm = false;
+		bool		all_visible_according_to_vm;
 		LVPagePruneState prunestate;
 
-		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
-
-		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
-								 blkno, InvalidOffsetNumber);
-
 		if (blkno == next_unskippable_block)
 		{
-			/* Time to advance next_unskippable_block */
-			next_unskippable_block++;
-			if (vacrel->skipwithvm)
-			{
-				while (next_unskippable_block < rel_pages)
-				{
-					uint8		vmskipflags;
-
-					vmskipflags = visibilitymap_get_status(vacrel->rel,
-														   next_unskippable_block,
-														   &vmbuffer);
-					if (vacrel->aggressive)
-					{
-						if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
-							break;
-					}
-					else
-					{
-						if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
-							break;
-					}
-					vacuum_delay_point();
-					next_unskippable_block++;
-				}
-			}
+			/*
+			 * We can't skip this block.  It might still be all-visible,
+			 * though.  This can happen when an aggressive VACUUM cannot skip
+			 * an all-visible block.
+			 */
+			all_visible_according_to_vm = all_visible_next_unskippable;
 
 			/*
-			 * We know we can't skip the current block.  But set up
-			 * skipping_blocks to do the right thing at the following blocks.
+			 * Determine a range of blocks to skip after we scan and process
+			 * this block.  We pass blkno + 1 as next_unskippable_block.  The
+			 * final next_unskippable_block won't change when there are no
+			 * blocks to skip (skippable blocks are those after blkno, but
+			 * before final next_unskippable_block).
 			 */
-			if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
-				skipping_blocks = true;
-			else
-				skipping_blocks = false;
+			next_unskippable_block =
+				lazy_scan_skip_range(vacrel, &vmbuffer, blkno + 1,
+									 &all_visible_next_unskippable,
+									 &all_frozen_skippable_range);
 
-			/*
-			 * Normally, the fact that we can't skip this block must mean that
-			 * it's not all-visible.  But in an aggressive vacuum we know only
-			 * that it's not all-frozen, so it might still be all-visible.
-			 */
-			if (vacrel->aggressive &&
-				VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
-				all_visible_according_to_vm = true;
+			/* Decide whether or not we'll actually skip the new range */
+			skipping_range =
+				(vacrel->skipwithvm &&
+				 next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD);
 		}
 		else
 		{
-			/*
-			 * The current page can be skipped if we've seen a long enough run
-			 * of skippable blocks to justify skipping it -- provided it's not
-			 * the last page in the relation (according to rel_pages).
-			 *
-			 * We always scan the table's last page to determine whether it
-			 * has tuples or not, even if it would otherwise be skipped.  This
-			 * avoids having lazy_truncate_heap() take access-exclusive lock
-			 * on the table to attempt a truncation that just fails
-			 * immediately because there are tuples on the last page.
-			 */
-			if (skipping_blocks && blkno < rel_pages - 1)
+			/* Every block in the range must be safe to skip */
+			all_visible_according_to_vm = true;
+
+			Assert(blkno < next_unskippable_block);
+			Assert(blkno < rel_pages - 1);	/* see lazy_scan_skip_range */
+			Assert(!vacrel->aggressive || all_frozen_skippable_range);
+
+			if (skipping_range)
 			{
 				/*
-				 * Tricky, tricky.  If this is in aggressive vacuum, the page
-				 * must have been all-frozen at the time we checked whether it
-				 * was skippable, but it might not be any more.  We must be
-				 * careful to count it as a skipped all-frozen page in that
-				 * case, or else we'll think we can't update relfrozenxid and
-				 * relminmxid.  If it's not an aggressive vacuum, we don't
-				 * know whether it was initially all-frozen, so we have to
-				 * recheck.
+				 * If this range of blocks is not all-frozen, then we cannot
+				 * advance relfrozenxid later.  This is another reason for
+				 * SKIP_PAGES_THRESHOLD; it helps us to avoid losing out on
+				 * advancing relfrozenxid where it makes the least sense.
 				 */
-				if (vacrel->aggressive ||
-					VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
-					vacrel->frozenskipped_pages++;
+				if (!all_frozen_skippable_range)
+					vacrel->skippedallvis = true;
 				continue;
 			}
 
-			/*
-			 * SKIP_PAGES_THRESHOLD (threshold for skipping) was not
-			 * crossed, or this is the last page.  Scan the page, even
-			 * though it's all-visible (and possibly even all-frozen).
-			 */
-			all_visible_according_to_vm = true;
+			/* We decided not to skip this range, so scan its page */
 		}
 
 		vacuum_delay_point();
@@ -1046,6 +970,11 @@ lazy_scan_heap(LVRelState *vacrel, int nworkers)
 		 */
 		vacrel->scanned_pages++;
 
+		/* Report as block scanned, update error traceback information */
+		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
+								 blkno, InvalidOffsetNumber);
+
 		/*
 		 * Regularly check if wraparound failsafe should trigger.
 		 *
@@ -1425,6 +1354,95 @@ lazy_scan_heap(LVRelState *vacrel, int nworkers)
 	Assert(!IsInParallelMode());
 }
 
+/*
+ * Set up a range of skippable blocks using the visibility map.
+ *
+ * lazy_scan_heap() calls here every time it needs to set up a new range
+ * of blocks to skip via the visibility map.  Caller passes the block
+ * immediately after its last next_unskippable_block to set up a new range.
+ * We return a new next_unskippable_block for this range.  This is often a
+ * degenerate 0-page range (we return caller's next_unskippable_block when
+ * that happens).
+ *
+ * Sets *all_visible_next_unskippable to show whether the returned block can
+ * be assumed all-visible.  Also sets *all_frozen_skippable_range to
+ * indicate whether the range is known to consist only of all-frozen pages.
+ *
+ * When vacrel->aggressive is set, caller can't skip pages just because they
+ * are all-visible, but can still skip pages that are all-frozen, since such
+ * pages do not need freezing and do not affect the value that we can safely
+ * set for relfrozenxid or relminmxid.  *all_frozen_skippable_range is never
+ * set 'false' for aggressive callers for this reason.
+ *
+ * Note: If caller thinks that one of the pages from the range is all-visible
+ * or all-frozen when in fact the flag's just been cleared, caller might fail
+ * to vacuum the page.  It's easy to see that skipping a page in a VACUUM that
+ * ultimately cannot advance relfrozenxid or relminmxid is not a very big
+ * deal; we might leave some dead tuples lying around, but the next vacuum
+ * will find them.  But even in VACUUMs that *are* capable of advancing
+ * relfrozenxid, it's still OK if we miss a page whose all-frozen marking gets
+ * concurrently cleared.  Any new XIDs from such a page must be >= OldestXmin,
+ * and so cannot invalidate NewRelfrozenXid tracking.  A similar argument
+ * applies for NewRelminMxid tracking and OldestMxact.
+ */
+static BlockNumber
+lazy_scan_skip_range(LVRelState *vacrel, Buffer *vmbuffer,
+					 BlockNumber next_unskippable_block,
+					 bool *all_visible_next_unskippable,
+					 bool *all_frozen_skippable_range)
+{
+	BlockNumber rel_pages = vacrel->rel_pages;
+
+	*all_visible_next_unskippable = true;
+	*all_frozen_skippable_range = true;
+
+	while (next_unskippable_block < rel_pages)
+	{
+		uint8		vmstatus;
+
+		vmstatus = visibilitymap_get_status(vacrel->rel,
+											next_unskippable_block,
+											vmbuffer);
+		if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
+		{
+			Assert((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0);
+			*all_visible_next_unskippable = false;
+			break;
+		}
+
+		/*
+		 * We always scan the table's last page later to determine whether it
+		 * has tuples or not, even if it would otherwise be skipped.  This
+		 * avoids having lazy_truncate_heap() take access-exclusive lock on
+		 * the table to attempt a truncation that just fails immediately
+		 * because there are tuples on the last page.
+		 */
+		if (next_unskippable_block == rel_pages - 1)
+		{
+			/* Last block case need only set all_visible_next_unskippable */
+			Assert(*all_visible_next_unskippable);
+			break;
+		}
+
+		if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
+		{
+			if (vacrel->aggressive)
+				break;
+
+			/*
+			 * This block may be skipped too.  It's not all-frozen, though,
+			 * so the entire skippable range will be deemed not-all-frozen.
+			 */
+			*all_frozen_skippable_range = false;
+		}
+
+		vacuum_delay_point();
+		next_unskippable_block++;
+	}
+
+	return next_unskippable_block;
+}
+
 /*
  *	lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling.
  *
-- 
2.30.2