From 8701fdd939766362254600401f2d66fd0af007e6 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 12 Jun 2022 15:46:08 -0700
Subject: [PATCH v8 2/6] Add page-level freezing to VACUUM.

Teach VACUUM to decide on whether or not to trigger freezing at the
level of whole heap pages, not individual tuple fields.  OldestXmin is
now treated as the cutoff for freezing eligibility in all cases, while
FreezeLimit is used to trigger freezing at the level of each page (we
now freeze all eligible XIDs on a page when freezing is triggered for
the page).

FreezeMultiXactId() now uses both FreezeLimit and OldestXmin to decide
how to process MultiXacts (not just FreezeLimit).  We always prefer to
avoid allocating new MultiXacts during VACUUM on general principle.
Page-level freezing can be triggered and use a maximally aggressive XID
cutoff to freeze XIDs (OldestXmin), while using a less aggressive XID
cutoff (FreezeLimit) to determine whether or not members from a Multi
need to be frozen expensively.  VACUUM will process Multis very eagerly
when it's cheap to do so, and very lazily when it's expensive to do so.

We can choose when and how to freeze Multixacts provided we never leave
behind a Multi that's < MultiXactCutoff, or a Multi with one or more XID
members < FreezeLimit.  Provided VACUUM's NewRelfrozenXid/NewRelminMxid
tracking accounts for all this, we are free to choose what to do about
each Multi based on the costs and the benefits.  VACUUM should be just
as capable of avoiding an expensive second pass over each Multi (which
must check the commit status of each member XID) as it was before, even
when page-level freezing is triggered on many pages with recently
allocated MultiXactIds.

Later work will teach VACUUM to apply an alternative eager freezing
strategy that triggers page-level freezing earlier, based on additional
criteria.  This commit improves the cost profile of freezing by building
on the freeze plan deduplication optimization added by commit 9e540599.
The high-level, user-facing design of VACUUM hasn't really changed just
yet.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Jeff Davis <pgsql@j-davis.com>
Reviewed-By: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
---
 src/include/access/heapam.h          |  39 +++-
 src/backend/access/heap/heapam.c     | 298 ++++++++++++++++-----------
 src/backend/access/heap/vacuumlazy.c | 106 +++++++---
 3 files changed, 282 insertions(+), 161 deletions(-)

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index abc3a1f34..ca4fab970 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -113,6 +113,40 @@ typedef struct HeapTupleFreeze
 	OffsetNumber offset;
 } HeapTupleFreeze;
 
+/*
+ * State used by VACUUM to track the details of freezing all eligible tuples
+ * on a given heap page.
+ *
+ * VACUUM prepares freeze plans for each page via heap_prepare_freeze_tuple
+ * calls (every tuple with storage gets its own call).  This page-level freeze
+ * state is updated across each call, which ultimately determines whether or
+ * not freezing the page is required. (VACUUM freezes the page via a call to
+ * heap_freeze_execute_prepared, which freezes using prepared freeze plans.)
+ *
+ * Aside from the basic question of whether or not freezing will go ahead, the
+ * state also tracks the oldest extant XID/MXID in the table as a whole, for
+ * the purposes of advancing relfrozenxid/relminmxid values in pg_class later
+ * on.  Each heap_prepare_freeze_tuple call pushes NewRelfrozenXid and/or
+ * NewRelminMxid back as required to avoid unsafe final pg_class values.  Any
+ * and all unfrozen XIDs or MXIDs that remain after VACUUM finishes _must_
+ * have values >= the final relfrozenxid/relminmxid values in pg_class.  This
+ * includes XIDs that remain as MultiXact members from any tuple's xmax.
+ */
+typedef struct HeapPageFreeze
+{
+	/* Is heap_prepare_freeze_tuple caller required to freeze page? */
+	bool		freeze_required;
+
+	/* Values used when heap_freeze_execute_prepared is called for page */
+	TransactionId NewRelfrozenXid;
+	MultiXactId NewRelminMxid;
+
+	/* "No freeze" variants used when page freezing doesn't take place */
+	TransactionId NoFreezeNewRelfrozenXid;
+	MultiXactId NoFreezeNewRelminMxid;
+
+} HeapPageFreeze;
+
 /* ----------------
  *		function prototypes for heap access method
  *
@@ -181,10 +215,9 @@ extern void heap_inplace_update(Relation relation, HeapTuple tuple);
 extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 									  const struct VacuumCutoffs *cutoffs,
 									  HeapTupleFreeze *frz, bool *totally_frozen,
-									  TransactionId *NewRelFrozenXid,
-									  MultiXactId *NewRelminMxid);
+									  HeapPageFreeze *pagefrz);
 extern void heap_freeze_execute_prepared(Relation rel, Buffer buffer,
-										 TransactionId FreezeLimit,
+										 TransactionId snapshotConflictHorizon,
 										 HeapTupleFreeze *tuples, int ntuples);
 extern bool heap_freeze_tuple(HeapTupleHeader tuple,
 							  TransactionId relfrozenxid, TransactionId relminmxid,
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 74b3a459e..45cdc1ae8 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6102,9 +6102,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  *		MultiXactId.
  *
  * "flags" is an output value; it's used to tell caller what to do on return.
- *
- * "mxid_oldest_xid_out" is an output value; it's used to track the oldest
- * extant Xid within any Multixact that will remain after freezing executes.
+ * "pagefrz" is an input/output value, used to manage page level freezing.
  *
  * Possible values that we can set in "flags":
  * FRM_NOOP
@@ -6119,16 +6117,34 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  *		The return value is a new MultiXactId to set as new Xmax.
  *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
  *
- * "mxid_oldest_xid_out" is only set when "flags" contains either FRM_NOOP or
- * FRM_RETURN_IS_MULTI, since we only leave behind a MultiXactId for these.
+ * Caller delegates control of page freezing to us.  In practice we always
+ * force freezing of caller's page unless FRM_NOOP processing is indicated.
+ * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
+ * can never be left behind.  We freely choose when and how to process each
+ * Multi, without ever violating the cutoff invariants for freezing.
  *
- * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set in "flags".
+ * It's useful to remove Multis on a proactive timeline (relative to freezing
+ * XIDs) to keep MultiXact member SLRU buffer misses to a minimum.  It can also
+ * be cheaper in the short run, for us, since we too can avoid SLRU buffer
+ * misses through eager processing.
+ *
+ * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
+ * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
+ * This can usually be put off, which is usually enough to avoid it altogether.
+ *
+ * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid variants
+ * using heap_tuple_should_freeze when we haven't forced page-level freezing.
+ *
+ * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
+ * have already forced page-level freezing, since that might incur the same
+ * SLRU buffer misses that we specifically intended to avoid by freezing.
  */
 static TransactionId
-FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
+FreezeMultiXactId(MultiXactId multi, HeapTupleHeader tuple,
 				  const struct VacuumCutoffs *cutoffs, uint16 *flags,
-				  TransactionId *mxid_oldest_xid_out)
+				  HeapPageFreeze *pagefrz)
 {
+	uint16		t_infomask = tuple->t_infomask;
 	TransactionId newxmax = InvalidTransactionId;
 	MultiXactMember *members;
 	int			nmembers;
@@ -6138,7 +6154,9 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 	bool		has_lockers;
 	TransactionId update_xid;
 	bool		update_committed;
-	TransactionId temp_xid_out;
+	TransactionId NewRelfrozenXid = pagefrz->NewRelfrozenXid;
+	TransactionId axid PG_USED_FOR_ASSERTS_ONLY;
+	MultiXactId amxid PG_USED_FOR_ASSERTS_ONLY;
 
 	*flags = 0;
 
@@ -6150,14 +6168,16 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 	{
 		/* Ensure infomask bits are appropriately set/reset */
 		*flags |= FRM_INVALIDATE_XMAX;
-		return InvalidTransactionId;
+		pagefrz->freeze_required = true;
+		Assert(!TransactionIdIsValid(newxmax));
+		return newxmax;
 	}
 	else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_CORRUPTED),
 				 errmsg_internal("found multixact %u from before relminmxid %u",
 								 multi, cutoffs->relminmxid)));
-	else if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
+	else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
 	{
 		/*
 		 * This old multi cannot possibly have members still running, but
@@ -6170,7 +6190,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg_internal("multixact %u from before cutoff %u found to be still running",
-									 multi, cutoffs->MultiXactCutoff)));
+									 multi, cutoffs->OldestMxact)));
 
 		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
 		{
@@ -6206,14 +6226,14 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 			}
 			else
 			{
+				if (TransactionIdPrecedes(newxmax, NewRelfrozenXid))
+					NewRelfrozenXid = newxmax;
 				*flags |= FRM_RETURN_IS_XID;
 			}
 		}
 
-		/*
-		 * Don't push back mxid_oldest_xid_out using FRM_RETURN_IS_XID Xid, or
-		 * when no Xids will remain
-		 */
+		pagefrz->NewRelfrozenXid = NewRelfrozenXid;
+		pagefrz->freeze_required = true;
 		return newxmax;
 	}
 
@@ -6229,11 +6249,13 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 	{
 		/* Nothing worth keeping */
 		*flags |= FRM_INVALIDATE_XMAX;
-		return InvalidTransactionId;
+		pagefrz->freeze_required = true;
+		Assert(!TransactionIdIsValid(newxmax));
+		return newxmax;
 	}
 
 	need_replace = false;
-	temp_xid_out = *mxid_oldest_xid_out;	/* init for FRM_NOOP */
+	NewRelfrozenXid = pagefrz->NewRelfrozenXid; /* init for FRM_NOOP */
 	for (int i = 0; i < nmembers; i++)
 	{
 		TransactionId xid = members[i].xid;
@@ -6242,26 +6264,31 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 
 		if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 		{
+			/* Can't violate the FreezeLimit invariant */
 			need_replace = true;
 			break;
 		}
-		if (TransactionIdPrecedes(members[i].xid, temp_xid_out))
-			temp_xid_out = members[i].xid;
+		if (TransactionIdPrecedes(xid, NewRelfrozenXid))
+			NewRelfrozenXid = xid;
 	}
 
-	/*
-	 * In the simplest case, there is no member older than FreezeLimit; we can
-	 * keep the existing MultiXactId as-is, avoiding a more expensive second
-	 * pass over the multi
-	 */
+	/* Can't violate the MultiXactCutoff invariant, either */
+	if (!need_replace)
+		need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
+
 	if (!need_replace)
 	{
 		/*
-		 * When mxid_oldest_xid_out gets pushed back here it's likely that the
-		 * update Xid was the oldest member, but we don't rely on that
+		 * FRM_NOOP case is the only one where we don't force page-level
+		 * freezing (see header comments).
+		 *
+		 * Might have to ratchet back NewRelminMxid, NewRelfrozenXid, or both
+		 * together.
 		 */
 		*flags |= FRM_NOOP;
-		*mxid_oldest_xid_out = temp_xid_out;
+		pagefrz->NewRelfrozenXid = NewRelfrozenXid;
+		if (MultiXactIdPrecedes(multi, pagefrz->NewRelminMxid))
+			pagefrz->NewRelminMxid = multi;
 		pfree(members);
 		return multi;
 	}
@@ -6270,13 +6297,20 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 	 * Do a more thorough second pass over the multi to figure out which
 	 * member XIDs actually need to be kept.  Checking the precise status of
 	 * individual members might even show that we don't need to keep anything.
+	 *
+	 * We only reach this far when replacing xmax is absolutely mandatory.
+	 * heap_tuple_should_freeze will indicate that the tuple should be frozen.
 	 */
+	axid = cutoffs->OldestXmin;
+	amxid = cutoffs->OldestMxact;
+	Assert(heap_tuple_should_freeze(tuple, cutoffs, &axid, &amxid));
+
 	nnewmembers = 0;
 	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
 	has_lockers = false;
 	update_xid = InvalidTransactionId;
 	update_committed = false;
-	temp_xid_out = *mxid_oldest_xid_out;	/* init for FRM_RETURN_IS_MULTI */
+	NewRelfrozenXid = pagefrz->NewRelfrozenXid; /* init for second pass */
 
 	/*
 	 * Determine whether to keep each member txid, or to ignore it instead
@@ -6365,11 +6399,11 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 		/*
 		 * We determined that this is an Xid corresponding to an update that
 		 * must be retained -- add it to new members list for later.  Also
-		 * consider pushing back mxid_oldest_xid_out.
+		 * consider pushing back NewRelfrozenXid tracker.
 		 */
 		newmembers[nnewmembers++] = members[i];
-		if (TransactionIdPrecedes(xid, temp_xid_out))
-			temp_xid_out = xid;
+		if (TransactionIdPrecedes(xid, NewRelfrozenXid))
+			NewRelfrozenXid = xid;
 	}
 
 	pfree(members);
@@ -6380,10 +6414,14 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 	 */
 	if (nnewmembers == 0)
 	{
-		/* nothing worth keeping!? Tell caller to remove the whole thing */
+		/*
+		 * Keeping nothing (neither an Xid nor a MultiXactId) in xmax.  Won't
+		 * have to ratchet back NewRelfrozenXid or NewRelminMxid.
+		 */
 		*flags |= FRM_INVALIDATE_XMAX;
 		newxmax = InvalidTransactionId;
-		/* Don't push back mxid_oldest_xid_out -- no Xids will remain */
+
+		Assert(pagefrz->NewRelfrozenXid == NewRelfrozenXid);
 	}
 	else if (TransactionIdIsValid(update_xid) && !has_lockers)
 	{
@@ -6399,22 +6437,28 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 		if (update_committed)
 			*flags |= FRM_MARK_COMMITTED;
 		newxmax = update_xid;
-		/* Don't push back mxid_oldest_xid_out using FRM_RETURN_IS_XID Xid */
+
+		/* Might have already pushed back NewRelfrozenXid with update_xid */
+		Assert(TransactionIdPrecedesOrEquals(NewRelfrozenXid, update_xid));
 	}
 	else
 	{
 		/*
 		 * Create a new multixact with the surviving members of the previous
 		 * one, to set as new Xmax in the tuple.  The oldest surviving member
-		 * might push back mxid_oldest_xid_out.
+		 * might have already pushed back NewRelfrozenXid.
 		 */
 		newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
 		*flags |= FRM_RETURN_IS_MULTI;
-		*mxid_oldest_xid_out = temp_xid_out;
+
+		/* Never need to push back NewRelminMxid when newxmax is new multi */
+		Assert(MultiXactIdPrecedesOrEquals(cutoffs->OldestMxact, newxmax));
 	}
 
 	pfree(newmembers);
 
+	pagefrz->NewRelfrozenXid = NewRelfrozenXid;
+	pagefrz->freeze_required = true;
 	return newxmax;
 }
 
@@ -6422,29 +6466,33 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
  * heap_prepare_freeze_tuple
  *
  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the specified cutoff XID and cutoff MultiXactId.  If so,
+ * are older than the FreezeLimit and/or MultiXactCutoff cutoffs.  If so,
  * setup enough state (in the *frz output argument) to later execute and
- * WAL-log what we would need to do, and return true.  Return false if nothing
- * is to be changed.  In addition, set *totally_frozen to true if the tuple
- * will be totally frozen after these operations are performed and false if
- * more freezing will eventually be required.
+ * WAL-log what caller needs to do for the tuple, and return true.  Return
+ * false if nothing can be changed about the tuple right now.
  *
- * VACUUM caller must assemble HeapTupleFreeze entries for every tuple that we
- * returned true for when called.  A later heap_freeze_execute_prepared call
- * will execute freezing for caller's page as a whole.
+ * Also sets *totally_frozen to true if the tuple will be totally frozen once
+ * caller executes returned freeze plan (or if the tuple was already totally
+ * frozen by an earlier VACUUM).  This indicates that there are no remaining
+ * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
+ *
+ * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
+ * tuple that we returned true for, and call heap_freeze_execute_prepared to
+ * execute freezing.  Caller must initialize pagefrz fields for page as a
+ * whole before first call here for each heap page.
+ *
+ * We sometimes force freezing of xmax MultiXactId values long before it is
+ * strictly necessary to do so just to ensure the FreezeLimit postcondition.
+ * It's worth processing MultiXactIds proactively when it is cheap to do so,
+ * and it's convenient to make that happen by piggy-backing it on the "force
+ * freezing" mechanism.  Conversely, we sometimes delay freezing MultiXactIds
+ * because it is expensive right now (though only when it's still possible to
+ * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
  *
  * It is assumed that the caller has checked the tuple with
  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
  * (else we should be removing the tuple, not freezing it).
  *
- * The *NewRelFrozenXid and *NewRelminMxid arguments are the current target
- * relfrozenxid and relminmxid for VACUUM caller's heap rel.  Any and all
- * unfrozen XIDs or MXIDs that remain in caller's rel after VACUUM finishes
- * _must_ have values >= the final relfrozenxid/relminmxid values in pg_class.
- * This includes XIDs that remain as MultiXact members from any tuple's xmax.
- * Each call here pushes back *NewRelFrozenXid and/or *NewRelminMxid as needed
- * to avoid unsafe final values in rel's authoritative pg_class tuple.
- *
  * NB: This function has side effects: it might allocate a new MultiXactId.
  * It will be set as tuple's new xmax when our *frz output is processed within
  * heap_execute_freeze_tuple later on.  If the tuple is in a shared buffer
@@ -6454,8 +6502,7 @@ bool
 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 						  const struct VacuumCutoffs *cutoffs,
 						  HeapTupleFreeze *frz, bool *totally_frozen,
-						  TransactionId *NewRelFrozenXid,
-						  MultiXactId *NewRelminMxid)
+						  HeapPageFreeze *pagefrz)
 {
 	bool		frzplan_set = false;
 	bool		xmin_already_frozen = false,
@@ -6471,7 +6518,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 
 	/*
 	 * Process xmin, while keeping track of whether it's already frozen, or
-	 * will become frozen when our freeze plan is executed by caller (could be
+	 * will become frozen iff our freeze plan is executed by caller (could be
 	 * neither).
 	 */
 	xid = HeapTupleHeaderGetXmin(tuple);
@@ -6489,59 +6536,66 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 					 errmsg_internal("found xmin %u from before relfrozenxid %u",
 									 xid, cutoffs->relfrozenxid)));
 
-		freeze_xmin = TransactionIdPrecedes(xid, cutoffs->FreezeLimit);
+		freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
 		if (freeze_xmin)
 		{
 			if (!TransactionIdDidCommit(xid))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
 						 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
-										 xid, cutoffs->FreezeLimit)));
+										 xid, cutoffs->OldestXmin)));
 
 			frz->t_infomask |= HEAP_XMIN_FROZEN;
 			frzplan_set = true;
 		}
 		else
 		{
-			/* xmin to remain unfrozen.  Could push back NewRelfrozenXid. */
-			if (TransactionIdPrecedes(xid, *NewRelFrozenXid))
-				*NewRelFrozenXid = xid;
+			/* No need for NewRelfrozenXid handling for non-eligible xmin */
+			Assert(TransactionIdPrecedesOrEquals(pagefrz->NewRelfrozenXid,
+												 cutoffs->OldestXmin));
 		}
 	}
 
-	/*
-	 * Process xmax.  To thoroughly examine the current Xmax value we need to
-	 * resolve a MultiXactId to its member Xids, in case some of them are
-	 * below the given FreezeLimit.  In that case, those values might need
-	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
-	 * it out --- if there's a live updater Xid, it needs to be kept.
-	 */
+	/* Now process xmax */
 	xid = HeapTupleHeaderGetRawXmax(tuple);
-
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
 		/* Raw xmax is a MultiXactId */
 		TransactionId newxmax;
 		uint16		flags;
-		TransactionId mxid_oldest_xid_out = *NewRelFrozenXid;
-
-		newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
-									&flags, &mxid_oldest_xid_out);
 
+		/*
+		 * We will either remove xmax completely (in the "freeze_xmax" path),
+		 * process xmax by modifying xmax in some other way, or perform no-op
+		 * xmax processing (which must still manage NewRelfrozenXid and
+		 * NewRelminMxid safety, often by accessing multi member XIDs).
+		 *
+		 * The only rule is that the FreezeLimit/MultiXactCutoff invariant
+		 * must never be violated.  FreezeMultiXactId decides on the rest.
+		 */
+		newxmax = FreezeMultiXactId(xid, tuple, cutoffs, &flags, pagefrz);
 		freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
 
-		if (flags & FRM_RETURN_IS_XID)
+		if (flags & FRM_NOOP)
+		{
+			/*
+			 * xmax is a MultiXactId, and nothing about it changes for now.
+			 * This is the only case where 'freeze_required' won't have been
+			 * set for us by FreezeMultiXactId.
+			 */
+			Assert(!freeze_xmax);
+			Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
+			Assert(!MultiXactIdPrecedes(newxmax, pagefrz->NewRelminMxid));
+		}
+		else if (flags & FRM_RETURN_IS_XID)
 		{
 			/*
 			 * xmax will become an updater Xid (original MultiXact's updater
 			 * member Xid will be carried forward as a simple Xid in Xmax).
-			 * Might have to ratchet back NewRelfrozenXid here, though never
-			 * NewRelminMxid.
 			 */
 			Assert(!freeze_xmax);
-			Assert(TransactionIdIsValid(newxmax));
-			if (TransactionIdPrecedes(newxmax, *NewRelFrozenXid))
-				*NewRelFrozenXid = newxmax;
+			Assert(pagefrz->freeze_required);
+			Assert(!TransactionIdPrecedes(newxmax, pagefrz->NewRelfrozenXid));
 
 			/*
 			 * NB -- some of these transformations are only valid because we
@@ -6564,15 +6618,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 			/*
 			 * xmax is an old MultiXactId that we have to replace with a new
 			 * MultiXactId, to carry forward two or more original member XIDs.
-			 * Might have to ratchet back NewRelfrozenXid here, though never
-			 * NewRelminMxid.
 			 */
 			Assert(!freeze_xmax);
+			Assert(pagefrz->freeze_required);
 			Assert(MultiXactIdIsValid(newxmax));
-			Assert(!MultiXactIdPrecedes(newxmax, *NewRelminMxid));
-			Assert(TransactionIdPrecedesOrEquals(mxid_oldest_xid_out,
-												 *NewRelFrozenXid));
-			*NewRelFrozenXid = mxid_oldest_xid_out;
+			Assert(!MultiXactIdPrecedes(newxmax, pagefrz->NewRelminMxid));
 
 			/*
 			 * We can't use GetMultiXactIdHintBits directly on the new multi
@@ -6585,33 +6635,18 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 			GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
 			frz->t_infomask |= newbits;
 			frz->t_infomask2 |= newbits2;
-
 			frz->xmax = newxmax;
 
 			frzplan_set = true;
 		}
-		else if (flags & FRM_NOOP)
-		{
-			/*
-			 * xmax is a MultiXactId, and nothing about it changes for now.
-			 * Might have to ratchet back NewRelminMxid, NewRelfrozenXid, or
-			 * both together.
-			 */
-			Assert(!freeze_xmax);
-			Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
-			Assert(TransactionIdPrecedesOrEquals(mxid_oldest_xid_out,
-												 *NewRelFrozenXid));
-			if (MultiXactIdPrecedes(xid, *NewRelminMxid))
-				*NewRelminMxid = xid;
-			*NewRelFrozenXid = mxid_oldest_xid_out;
-		}
 		else
 		{
 			/*
-			 * Keeping nothing (neither an Xid nor a MultiXactId) in xmax.
-			 * Won't have to ratchet back NewRelminMxid or NewRelfrozenXid.
+			 * Keeping nothing (neither an Xid nor a MultiXactId) in xmax.  We
+			 * will "freeze xmax", in the strictest sense.
 			 */
 			Assert(freeze_xmax);
+			Assert(pagefrz->freeze_required);
 			Assert(!TransactionIdIsValid(newxmax));
 		}
 	}
@@ -6624,7 +6659,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 					 errmsg_internal("found xmax %u from before relfrozenxid %u",
 									 xid, cutoffs->relfrozenxid)));
 
-		if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
+		if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
 		{
 			/*
 			 * If we freeze xmax, make absolutely sure that it's not an XID
@@ -6644,8 +6679,9 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 		else
 		{
 			freeze_xmax = false;
-			if (TransactionIdPrecedes(xid, *NewRelFrozenXid))
-				*NewRelFrozenXid = xid;
+			/* No need for NewRelfrozenXid handling for non-eligible xmax */
+			Assert(TransactionIdPrecedesOrEquals(pagefrz->NewRelfrozenXid,
+												 cutoffs->OldestXmin));
 		}
 	}
 	else if (!TransactionIdIsValid(xid))
@@ -6716,16 +6752,36 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
 			frz->t_infomask |= HEAP_XMIN_COMMITTED;
 			frzplan_set = true;
+			pagefrz->freeze_required = true;
 		}
 	}
 
 	/*
 	 * Determine if this tuple is already totally frozen, or will become
-	 * totally frozen
+	 * totally frozen (provided caller executes freeze plan for the page)
 	 */
 	*totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
 					   (freeze_xmax || xmax_already_frozen));
 
+	/*
+	 * Force vacuumlazy.c to freeze page when avoiding it would violate the
+	 * rule that XIDs < FreezeLimit (and MXIDs < MultiXactCutoff) must never
+	 * remain.
+	 *
+	 * We have to do this even when we have no freeze plan for caller's tuple,
+	 * since "no freeze" tracking is still required (unless we already know
+	 * that freezing the page will go ahead, in which case we can skip it and
+	 * just rely on "freeze" NewRelfrozenXid tracking).
+	 */
+	if (!pagefrz->freeze_required && !(xmin_already_frozen &&
+									   xmax_already_frozen))
+	{
+		pagefrz->freeze_required =
+			heap_tuple_should_freeze(tuple, cutoffs,
+									 &pagefrz->NoFreezeNewRelfrozenXid,
+									 &pagefrz->NoFreezeNewRelminMxid);
+	}
+
 	return frzplan_set;
 }
 
@@ -6769,13 +6825,12 @@ heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
  */
 void
 heap_freeze_execute_prepared(Relation rel, Buffer buffer,
-							 TransactionId FreezeLimit,
+							 TransactionId snapshotConflictHorizon,
 							 HeapTupleFreeze *tuples, int ntuples)
 {
 	Page		page = BufferGetPage(buffer);
 
 	Assert(ntuples > 0);
-	Assert(TransactionIdIsNormal(FreezeLimit));
 
 	START_CRIT_SECTION();
 
@@ -6798,19 +6853,10 @@ heap_freeze_execute_prepared(Relation rel, Buffer buffer,
 		int			nplans;
 		xl_heap_freeze_page xlrec;
 		XLogRecPtr	recptr;
-		TransactionId snapshotConflictHorizon;
 
 		/* Prepare deduplicated representation for use in WAL record */
 		nplans = heap_xlog_freeze_plan(tuples, ntuples, plans, offsets);
 
-		/*
-		 * FreezeLimit is (approximately) the first XID not frozen by VACUUM.
-		 * Back up caller's FreezeLimit to avoid false conflicts when
-		 * FreezeLimit is precisely equal to VACUUM's OldestXmin cutoff.
-		 */
-		snapshotConflictHorizon = FreezeLimit;
-		TransactionIdRetreat(snapshotConflictHorizon);
-
 		xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
 		xlrec.nplans = nplans;
 
@@ -6851,8 +6897,7 @@ heap_freeze_tuple(HeapTupleHeader tuple,
 	bool		do_freeze;
 	bool		totally_frozen;
 	struct VacuumCutoffs cutoffs;
-	TransactionId NewRelfrozenXid = FreezeLimit;
-	MultiXactId NewRelminMxid = MultiXactCutoff;
+	HeapPageFreeze pagefrz;
 
 	cutoffs.relfrozenxid = relfrozenxid;
 	cutoffs.relminmxid = relminmxid;
@@ -6861,9 +6906,14 @@ heap_freeze_tuple(HeapTupleHeader tuple,
 	cutoffs.FreezeLimit = FreezeLimit;
 	cutoffs.MultiXactCutoff = MultiXactCutoff;
 
+	pagefrz.freeze_required = true;
+	pagefrz.NewRelfrozenXid = FreezeLimit;
+	pagefrz.NewRelminMxid = MultiXactCutoff;
+	pagefrz.NoFreezeNewRelfrozenXid = FreezeLimit;
+	pagefrz.NoFreezeNewRelminMxid = MultiXactCutoff;
+
 	do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
-										  &frz, &totally_frozen,
-										  &NewRelfrozenXid, &NewRelminMxid);
+										  &frz, &totally_frozen, &pagefrz);
 
 	/*
 	 * Note that because this is not a WAL-logged operation, we don't need to
@@ -7294,8 +7344,8 @@ heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
  * could be processed by pruning away the whole tuple instead of freezing.
  *
  * The *NewRelfrozenXid and *NewRelminMxid input/output arguments work just
- * like the heap_prepare_freeze_tuple arguments that they're based on.  We
- * never freeze here, which makes tracking the oldest extant XID/MXID simple.
+ * like the similar fields from the HeapPageFreeze struct.  We never freeze
+ * here, which makes tracking the oldest extant XID/MXID simple.
  */
 bool
 heap_tuple_should_freeze(HeapTupleHeader tuple,
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index b3668e57b..9753b6b08 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1537,8 +1537,8 @@ lazy_scan_prune(LVRelState *vacrel,
 				live_tuples,
 				recently_dead_tuples;
 	int			nnewlpdead;
-	TransactionId NewRelfrozenXid;
-	MultiXactId NewRelminMxid;
+	HeapPageFreeze pagefrz;
+	bool		freeze_all_eligible PG_USED_FOR_ASSERTS_ONLY;
 	OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
 	HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
 
@@ -1554,8 +1554,11 @@ lazy_scan_prune(LVRelState *vacrel,
 retry:
 
 	/* Initialize (or reset) page-level state */
-	NewRelfrozenXid = vacrel->NewRelfrozenXid;
-	NewRelminMxid = vacrel->NewRelminMxid;
+	pagefrz.freeze_required = false;
+	pagefrz.NewRelfrozenXid = vacrel->NewRelfrozenXid;
+	pagefrz.NewRelminMxid = vacrel->NewRelminMxid;
+	pagefrz.NoFreezeNewRelfrozenXid = vacrel->NewRelfrozenXid;
+	pagefrz.NoFreezeNewRelminMxid = vacrel->NewRelminMxid;
 	tuples_deleted = 0;
 	tuples_frozen = 0;
 	lpdead_items = 0;
@@ -1608,27 +1611,23 @@ retry:
 			continue;
 		}
 
-		/*
-		 * LP_DEAD items are processed outside of the loop.
-		 *
-		 * Note that we deliberately don't set hastup=true in the case of an
-		 * LP_DEAD item here, which is not how count_nondeletable_pages() does
-		 * it -- it only considers pages empty/truncatable when they have no
-		 * items at all (except LP_UNUSED items).
-		 *
-		 * Our assumption is that any LP_DEAD items we encounter here will
-		 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
-		 * call count_nondeletable_pages().  In any case our opinion of
-		 * whether or not a page 'hastup' (which is how our caller sets its
-		 * vacrel->nonempty_pages value) is inherently race-prone.  It must be
-		 * treated as advisory/unreliable, so we might as well be slightly
-		 * optimistic.
-		 */
 		if (ItemIdIsDead(itemid))
 		{
+			/*
+			 * Delay unsetting all_visible until after we have decided on
+			 * whether this page should be frozen.  We need to test "is this
+			 * page all_visible, assuming any LP_DEAD items are set LP_UNUSED
+			 * in final heap pass?" to reach a decision.  all_visible will be
+			 * unset before we return, as required by lazy_scan_heap caller.
+			 *
+			 * Deliberately don't set hastup for LP_DEAD items.  We make the
+			 * soft assumption that any LP_DEAD items encountered here will
+			 * become LP_UNUSED later on, before count_nondeletable_pages is
+			 * reached.  Whether the page 'hastup' is inherently race-prone.
+			 * It must be treated as unreliable by caller anyway, so we might
+			 * as well be slightly optimistic about it.
+			 */
 			deadoffsets[lpdead_items++] = offnum;
-			prunestate->all_visible = false;
-			prunestate->has_lpdead_items = true;
 			continue;
 		}
 
@@ -1757,7 +1756,7 @@ retry:
 		/* Tuple with storage -- consider need to freeze */
 		if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs,
 									  &frozen[tuples_frozen], &totally_frozen,
-									  &NewRelfrozenXid, &NewRelminMxid))
+									  &pagefrz))
 		{
 			/* Save prepared freeze plan for later */
 			frozen[tuples_frozen++].offset = offnum;
@@ -1778,23 +1777,62 @@ retry:
 	 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
 	 * that remains and needs to be considered for freezing now (LP_UNUSED and
 	 * LP_REDIRECT items also remain, but are of no further interest to us).
+	 *
+	 * Freeze the page when heap_prepare_freeze_tuple requires it (XID/MXID <
+	 * FreezeLimit/MultiXactCutoff present), or when no freeze plans exist.
 	 */
-	vacrel->NewRelfrozenXid = NewRelfrozenXid;
-	vacrel->NewRelminMxid = NewRelminMxid;
+	if (pagefrz.freeze_required || tuples_frozen == 0)
+	{
+		/*
+		 * We're freezing the page.  Our final NewRelfrozenXid doesn't need to
+		 * be affected by the XIDs that are just about to be frozen anyway.
+		 *
+		 * Note: although we're freezing all eligible tuples on this page, we
+		 * might not need to freeze anything (might be zero eligible tuples).
+		 */
+		vacrel->NewRelfrozenXid = pagefrz.NewRelfrozenXid;
+		vacrel->NewRelminMxid = pagefrz.NewRelminMxid;
+		freeze_all_eligible = true;
+	}
+	else
+	{
+		/* Not freezing this page, so use alternative cutoffs */
+		vacrel->NewRelfrozenXid = pagefrz.NoFreezeNewRelfrozenXid;
+		vacrel->NewRelminMxid = pagefrz.NoFreezeNewRelminMxid;
+
+		/* Might still set page all-visible, but never all-frozen */
+		tuples_frozen = 0;
+		freeze_all_eligible = prunestate->all_frozen = false;
+	}
 
 	/*
 	 * Consider the need to freeze any items with tuple storage from the page
-	 * first (arbitrary)
 	 */
 	if (tuples_frozen > 0)
 	{
-		Assert(prunestate->hastup);
+		TransactionId snapshotConflictHorizon;
+
+		Assert(prunestate->hastup && freeze_all_eligible);
 
 		vacrel->frozen_pages++;
 
+		/*
+		 * We can use the latest xmin cutoff (which is generally used for 'VM
+		 * set' conflicts) as our cutoff for freeze conflicts when the whole
+		 * page is eligible to become all-frozen in the VM once frozen by us.
+		 * Otherwise use a conservative cutoff (just back up from OldestXmin).
+		 */
+		if (prunestate->all_visible && prunestate->all_frozen)
+			snapshotConflictHorizon = prunestate->visibility_cutoff_xid;
+		else
+		{
+			snapshotConflictHorizon = vacrel->cutoffs.OldestXmin;
+			TransactionIdRetreat(snapshotConflictHorizon);
+		}
+
 		/* Execute all freeze plans for page as a single atomic action */
 		heap_freeze_execute_prepared(vacrel->rel, buf,
-									 vacrel->cutoffs.FreezeLimit,
+									 snapshotConflictHorizon,
 									 frozen, tuples_frozen);
 	}
 
@@ -1813,7 +1851,7 @@ retry:
 	 */
 #ifdef USE_ASSERT_CHECKING
 	/* Note that all_frozen value does not matter when !all_visible */
-	if (prunestate->all_visible)
+	if (prunestate->all_visible && lpdead_items == 0)
 	{
 		TransactionId cutoff;
 		bool		all_frozen;
@@ -1821,8 +1859,7 @@ retry:
 		if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
 			Assert(false);
 
-		Assert(lpdead_items == 0);
-		Assert(prunestate->all_frozen == all_frozen);
+		Assert(prunestate->all_frozen == all_frozen || !freeze_all_eligible);
 
 		/*
 		 * It's possible that we froze tuples and made the page's XID cutoff
@@ -1843,9 +1880,6 @@ retry:
 		VacDeadItems *dead_items = vacrel->dead_items;
 		ItemPointerData tmp;
 
-		Assert(!prunestate->all_visible);
-		Assert(prunestate->has_lpdead_items);
-
 		vacrel->lpdead_item_pages++;
 
 		ItemPointerSetBlockNumber(&tmp, blkno);
@@ -1859,6 +1893,10 @@ retry:
 		Assert(dead_items->num_items <= dead_items->max_items);
 		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
 									 dead_items->num_items);
+
+		/* lazy_scan_heap caller expects LP_DEAD item to unset all_visible */
+		prunestate->all_visible = false;
+		prunestate->has_lpdead_items = true;
 	}
 
 	/* Finally, add page-local counts to whole-VACUUM counts */
-- 
2.38.1