From 0ebde19d306488c59d8b3b6e0913c5bb51c5c5e6 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 13 Dec 2021 15:00:49 -0800 Subject: [PATCH v6 5/5] Freeze tuples early to advance relfrozenxid. Freeze whenever pruning modified the page, or whenever we see that we're going to mark the page all-visible without also marking it all-frozen. There has been plenty of discussion of early/opportunistic freezing in the past. It is generally considered important as a way of minimizing repeated dirtying of heap pages (or the total volume of FPIs in the WAL stream) over time. While that goal is certainly very important, this patch has another priority: making VACUUM advance relfrozenxid sooner and more frequently. The overall effect is that tables like pgbench's history table can be vacuumed very frequently, and have most individual vacuum operations generate 0 FPIs in WAL -- they will never need an aggressive VACUUM. GUCs like vacuum_freeze_min_age never made much sense after the freeze map work in PostgreSQL 9.6. The default is 50 million transactions, which currently tends to result in our being unable to freeze tuples before the page is marked all-visible (but not all-frozen). This creates a huge performance cliff later on, during the first aggressive VACUUM. Freezing early effectively avoids accumulating "debt" from very old unfrozen tuples. 
--- src/include/access/heapam.h | 1 + src/backend/access/heap/pruneheap.c | 8 ++- src/backend/access/heap/vacuumlazy.c | 87 +++++++++++++++++++++++++--- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index d35402f9f..ba094507c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -188,6 +188,7 @@ extern int heap_page_prune(Relation relation, Buffer buffer, struct GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts_ts, + bool *modified, int *nnewlpdead, OffsetNumber *off_loc); extern void heap_page_prune_execute(Buffer buffer, diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 3201fcc52..7d2b72e89 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -202,11 +202,12 @@ heap_page_prune_opt(Relation relation, Buffer buffer) */ if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { + bool modified; int ndeleted, nnewlpdead; ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin, - limited_ts, &nnewlpdead, NULL); + limited_ts, &modified, &nnewlpdead, NULL); /* * Report the number of tuples reclaimed to pgstats. This is @@ -264,6 +265,7 @@ heap_page_prune(Relation relation, Buffer buffer, GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts, + bool *modified, int *nnewlpdead, OffsetNumber *off_loc) { @@ -445,6 +447,8 @@ heap_page_prune(Relation relation, Buffer buffer, PageSetLSN(BufferGetPage(buffer), recptr); } + + *modified = true; } else { @@ -457,12 +461,14 @@ heap_page_prune(Relation relation, Buffer buffer, * point in repeating the prune/defrag process until something else * happens to the page. 
*/ + *modified = false; if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || PageIsFull(page)) { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; PageClearFull(page); MarkBufferDirtyHint(buffer, true); + *modified = true; } } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index dd557fddb..a7704977a 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -168,6 +168,7 @@ typedef struct LVRelState /* VACUUM operation's cutoff for pruning */ TransactionId OldestXmin; + MultiXactId OldestMxact; /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */ TransactionId FreezeLimit; MultiXactId MultiXactCutoff; @@ -355,11 +356,16 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* * Get cutoffs that determine which tuples we need to freeze during the - * VACUUM operation. + * VACUUM operation. This includes information that is used during + * opportunistic freezing, where the most aggressive possible cutoffs + * (OldestXmin and OldestMxact) are used for some heap pages, based on + * considerations about cost. * * Also determines if this is to be an aggressive VACUUM. This will * eventually be required for any table where (for whatever reason) no * non-aggressive VACUUM ran to completion, and advanced relfrozenxid. + * This used to be much more common, but we now work hard to advance + * relfrozenxid in non-aggressive VACUUMs. 
*/ aggressive = vacuum_set_xid_limits(rel, params->freeze_min_age, @@ -472,6 +478,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set cutoffs for entire VACUUM */ vacrel->OldestXmin = OldestXmin; + vacrel->OldestMxact = OldestMxact; vacrel->FreezeLimit = FreezeLimit; vacrel->MultiXactCutoff = MultiXactCutoff; @@ -1590,6 +1597,10 @@ lazy_scan_prune(LVRelState *vacrel, xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage]; TransactionId NewRelfrozenxid; MultiXactId NewRelminmxid; + bool modified; + TransactionId FreezeLimit = vacrel->FreezeLimit; + MultiXactId MultiXactCutoff = vacrel->MultiXactCutoff; + bool earlyfreezing = false; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1616,8 +1627,19 @@ retry: * that were deleted from indexes. */ tuples_deleted = heap_page_prune(rel, buf, vistest, - InvalidTransactionId, 0, &nnewlpdead, - &vacrel->offnum); + InvalidTransactionId, 0, &modified, + &nnewlpdead, &vacrel->offnum); + + /* + * If page was modified during pruning, then perform early freezing + * opportunistically + */ + if (!earlyfreezing && modified) + { + earlyfreezing = true; + FreezeLimit = vacrel->OldestXmin; + MultiXactCutoff = vacrel->OldestMxact; + } /* * Now scan the page to collect LP_DEAD items and check for tuples @@ -1672,7 +1694,7 @@ retry: if (ItemIdIsDead(itemid)) { deadoffsets[lpdead_items++] = offnum; - prunestate->all_visible = false; + /* Don't set all_visible to false just yet */ prunestate->has_lpdead_items = true; continue; } @@ -1806,8 +1828,8 @@ retry: if (heap_prepare_freeze_tuple(tuple.t_data, vacrel->relfrozenxid, vacrel->relminmxid, - vacrel->FreezeLimit, - vacrel->MultiXactCutoff, + FreezeLimit, + MultiXactCutoff, &frozen[nfrozen], &tuple_totally_frozen, &NewRelfrozenxid, @@ -1827,6 +1849,57 @@ retry: vacrel->offnum = InvalidOffsetNumber; + /* + * Reconsider applying early freezing before committing to processing the + * page as currently planned. There are 2 reasons to change our mind: + * + * 1. 
The standard FreezeLimit cutoff generally indicates that we should + * freeze XIDs that are more than freeze_min_age XIDs in the past + * (relative to OldestXmin). But that should only be treated as a rough + * guideline; it makes sense to freeze all eligible tuples on pages where + * we're going to freeze at least one in any case. + * + * 2. If the page is now eligible to be marked all_visible, but is not + * also eligible to be marked all_frozen, then we freeze early to make + * sure that the page becomes all_frozen. We should avoid building up + * "freeze debt" that can only be paid off by an aggressive VACUUM, later + * on. This makes it much less likely that an aggressive VACUUM will ever + * be required. + * + * Note: We deliberately track all_visible in a way that excludes LP_DEAD + * items here. Any page that is "all_visible for tuples with storage" + * will be eligible to have its visibility map bit set during the ongoing + * VACUUM, one way or another. LP_DEAD items only make it unsafe to set + * the page all_visible during the first heap pass, but the second heap + * pass should be able to perform equivalent processing. (The second heap + * pass cannot freeze tuples, though.) + */ + if (!earlyfreezing && + ((nfrozen > 0 && nfrozen < num_tuples) || + (prunestate->all_visible && !prunestate->all_frozen))) + { + /* + * XXX Need to worry about leaking MultiXacts in FreezeMultiXactId() + * now (via heap_prepare_freeze_tuple calls)? That was already + * possible, but presumably this makes it much more likely. + * + * On the other hand, that's only possible when we need to replace an + * existing MultiXact with a new one. Even then, we won't have + * preallocated a new MultiXact (which we now risk leaking) if there + * was only one remaining XID, and the XID is for an updater (we'll + * only prepare to replace xmax with the XID directly). So maybe it's + * still a narrow enough problem to be ignored. 
+ */ + earlyfreezing = true; + FreezeLimit = vacrel->OldestXmin; + MultiXactCutoff = vacrel->OldestMxact; + goto retry; + } + + /* Time to define all_visible in a way that accounts for LP_DEAD items */ + if (lpdead_items > 0) + prunestate->all_visible = false; + /* * We have now divided every item on the page into either an LP_DEAD item * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple @@ -1872,7 +1945,7 @@ retry: { XLogRecPtr recptr; - recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit, + recptr = log_heap_freeze(vacrel->rel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); } -- 2.30.2