From fbb49d2cb449cf808a1f776d281052b7b0e69578 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 9 Nov 2020 12:59:30 -0800
Subject: [PATCH v8 4/5] Teach nbtree to use bottom-up index deletion.

Teach nbtree to eagerly delete duplicate tuples representing old versions in the event of a localized flood of version churn. This situation is detected using heuristics, including the recently added "index is logically unchanged by an UPDATE" executor hint.

This commit alone changes very little about how nbtree behaves. We still lack tableam-side support. An upcoming commit that adds support for bottom-up index deletion to heapam will follow, completing the picture.

The immediate goal of bottom-up index deletion in nbtree is to avoid "unnecessary" page splits caused entirely by duplicates needed only for MVCC/versioning purposes. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Note that the relationship between this localized condition and the proportion of garbage tuples in the entire index is very loose, and can be very volatile.

Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. It responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index.

Bottom-up index deletion is very effective despite not changing anything about the fundamental invariants for Postgres index access methods in general (and despite not changing any invariants for nbtree in particular). That is, it is still inherently necessary to keep around multiple versions together on the same leaf page, at least in some cases (no change there). But nothing forbids us from being proactive in keeping the number of tuples for any given logical row under control, if and when that seems to make sense at the page/local level. In practice it is seldom strictly necessary to have more than a couple of physical index tuple versions present for any given logical row.

You can think of bottom-up index deletion as bringing the effectiveness of garbage collection in nbtree far closer to the true theoretical limits imposed on it by the core system. In practice nbtree could fall significantly short of this ideal before now, often in ways that could not easily be predicted or reasoned about, and often in the absence of obvious stressors (like very long-running transactions that hold open an MVCC snapshot). This may not have happened in production all that often, but when it happened it had a significant impact on query latency, often at the most inconvenient time possible.

Bottom-up deletion uses the same WAL record that we use when deleting LP_DEAD items (the xl_btree_delete record). This commit extends _bt_delitems_delete() to support granular TID deletion in posting list tuples, and to support a caller-supplied latestRemovedXid. Only its bottom-up index deletion caller makes use of these new facilities.

Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from the optimization (i.e. no reindexing required) following a pg_upgrade.
Author: Peter Geoghegan Reviewed-By: Victor Yegorov Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com --- src/include/access/nbtree.h | 7 +- src/include/access/nbtxlog.h | 9 +- src/backend/access/nbtree/README | 74 +++- src/backend/access/nbtree/nbtdedup.c | 505 +++++++++++++++++++++++++- src/backend/access/nbtree/nbtinsert.c | 94 ++++- src/backend/access/nbtree/nbtpage.c | 136 ++++++- src/backend/access/nbtree/nbtree.c | 2 +- src/backend/access/nbtree/nbtutils.c | 5 + src/backend/access/nbtree/nbtxlog.c | 51 ++- 9 files changed, 829 insertions(+), 54 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d1b3e0ba6a..f8faae525a 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1031,6 +1031,8 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique); +extern bool _bt_bottomup_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz, bool checkingunique); extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff); extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); @@ -1045,7 +1047,8 @@ extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, * prototypes for functions in nbtinsert.c */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, - IndexUniqueCheck checkUnique, Relation heapRel); + IndexUniqueCheck checkUnique, Relation heapRel, + bool indexunchanged); extern void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack); extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); @@ -1084,7 +1087,9 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable); extern void _bt_delitems_delete(Relation rel, Buffer buf, + bool bottomup, TransactionId bottomupXid, OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable, Relation heapRel); extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 5c014bdc66..4006872d7a 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -187,12 +187,15 @@ typedef struct xl_btree_dedup typedef struct xl_btree_delete { TransactionId latestRemovedXid; - uint32 ndeleted; + uint16 ndeleted; + uint16 nupdated; /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES METADATA ARRAY FOLLOWS */ } xl_btree_delete; -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, ndeleted) + sizeof(uint32)) +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16)) /* * This is what we need to know about page reuse within btree. This record @@ -213,7 +216,7 @@ typedef struct xl_btree_reuse_page /* * This is what we need to know about which TIDs to remove from an individual * posting list tuple during vacuuming. An array of these may appear at the - * end of xl_btree_vacuum records. + * end of xl_btree_vacuum and xl_btree_delete records. 
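As a quick orientation to the reworked WAL record: the fixed-size xl_btree_delete header is now followed by up to three variable-length arrays. The following standalone sketch is not PostgreSQL code -- the typedefs are invented stand-ins, and in the real record the offset arrays travel as registered buffer data rather than being appended directly after the header -- it only illustrates the header sizing and the order of the trailing arrays.

    /*
     * Standalone sketch, not PostgreSQL code: toy typedefs stand in for the
     * real ones; byte offsets shown are purely illustrative.
     */
    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t ToyTransactionId;
    typedef uint16_t ToyOffsetNumber;

    typedef struct toy_xl_btree_delete
    {
        ToyTransactionId latestRemovedXid;
        uint16_t    ndeleted;       /* number of deleted target offsets */
        uint16_t    nupdated;       /* number of updated posting list tuples */
        /* deleted target offset numbers follow */
        /* updated target offset numbers follow */
        /* updated tuples metadata (xl_btree_update) array follows */
    } toy_xl_btree_delete;

    #define SizeOfToyBtreeDelete \
        (offsetof(toy_xl_btree_delete, nupdated) + sizeof(uint16_t))

    int
    main(void)
    {
        toy_xl_btree_delete rec = {0, 3, 2};
        size_t      deleted_start = SizeOfToyBtreeDelete;
        size_t      updated_start = deleted_start + rec.ndeleted * sizeof(ToyOffsetNumber);

        printf("fixed header is %zu bytes\n", (size_t) SizeOfToyBtreeDelete);
        printf("deleted offsets at byte %zu, updated offsets at byte %zu\n",
               deleted_start, updated_start);
        return 0;
    }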
*/ typedef struct xl_btree_update { diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 27f555177e..7855392212 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -439,12 +439,14 @@ from the index immediately; since index scans only stop "between" pages, no scan can lose its place from such a deletion. We separate the steps because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires -exclusive lock. In the current code we try to remove LP_DEAD tuples when -we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). Deduplication can also prevent -a page split, but removing LP_DEAD tuples is the preferred approach. -(Note that posting list tuples can only have their LP_DEAD bit set when -every table TID within the posting list is known dead.) +exclusive lock. We try to remove LP_DEAD tuples when we are otherwise +faced with having to split a page to do an insertion (and hence have +exclusive lock on it already). Deduplication and bottom-up index deletion +can also prevent a page split, but removing LP_DEAD tuples is always the +preferred approach. (Note that posting list tuples can only have their +LP_DEAD bit set when every table TID within the posting list is known +dead. This isn't much of a problem because bottom-up deletion supports +granular deletion of TIDs from posting lists.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -767,9 +769,10 @@ into a single physical tuple with a posting list (a simple array of heap TIDs with the standard item pointer format). Deduplication is always applied lazily, at the point where it would otherwise be necessary to perform a page split. It occurs only when LP_DEAD items have been -removed, as our last line of defense against splitting a leaf page. We -can set the LP_DEAD bit with posting list tuples, though only when all -TIDs are known dead. +removed, as our last line of defense against splitting a leaf page +(bottom-up index deletion may be attempted first, as our second last line +of defense). We can set the LP_DEAD bit with posting list tuples, though +only when all TIDs are known dead. Our lazy approach to deduplication allows the page space accounting used during page splits to have absolutely minimal special case logic for @@ -826,6 +829,16 @@ delay a split that is probably inevitable anyway. This allows us to avoid the overhead of attempting to deduplicate with unique indexes that always have few or no duplicates. +Note: Avoiding "unnecessary" page splits driven by version churn is also +the goal of bottom-up index deletion, which was added to PostgreSQL 14. +Bottom-up index deletion is now the preferred way to deal with this +problem (with all kinds of indexes, though especially with unique +indexes). Still, deduplication can sometimes augment bottom-up index +deletion. When deletion cannot free tuples (due to an old snapshot +holding up cleanup), falling back on deduplication provides additional +capacity. Delaying the page split by deduplicating can allow a future +bottom-up deletion pass of the same page to succeed. + Posting list splits ------------------- @@ -880,6 +893,49 @@ that need a page split anyway. Besides, supporting variable "split points" while splitting posting lists won't actually improve overall space utilization. 
+Bottom-up index deletion +------------------------ + +We sometimes delete whatever duplicates happen to be present on the page +before moving on to deduplication. This only happens when we receive a +hint that optimizations like heapam's HOT have not worked out for the +index -- the incoming tuple must be a logically unchanged duplicate which +is needed for MVCC purposes. (Actually it also happens with unique +indexes in some extra cases that don't get this hint.) + +There are certain ways in which this mechanism is similar to on-the-fly +deletion of index tuples (that will already have failed to prevent a page +split by the time bottom-up deletion is attempted). For example, the same +WAL records are used. There are also significant differences. Index +tuples that get deleted by this mechanism won't have already been marked +LP_DEAD in passing by queries. Rather, we figure out whether or not +they're deletable in principle at the last point before splitting a page +by accessing tableam blocks to get visibility information. We use +heuristics to access as few tableam blocks as possible while still +expecting to find a reasonably large number of tuples that are safe to +delete each time (actually, we outsource much of this to the tableam, which +understands how all this works pretty well). We expect to perform regular +bottom-up deletion operations against pages that are at constant risk of +unnecessary page splits caused only by version churn. When the mechanism +works well we'll constantly be on the verge of having lots of version +churn-driven page splits, but never actually have any. + +Bottom-up index deletion can be thought of as a backstop mechanism against +unnecessary version-driven page splits. When we have a reasonable +suspicion that a would-be page split may not actually be necessary, we +fight back. There is very little to lose and much to gain by spending a +few cycles to become reasonably sure that it is in fact necessary -- page +splits are very expensive and practically irreversible. This approach +works well with a large variety of workloads because we give up before +spending very many cycles on trying, and because our heuristics are good +enough to spot unnecessary page splits fairly reliably in practice. +Unnecessary page splits occur due to pathological amounts of version +churn. In practice this pathological condition can be detected before too +long using simple heuristics. We don't have to understand the universe of +possible workloads; we only have to understand the nature of the +underlying pathology. We're helped out by additional heuristics within +tableams such as heapam.
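To make the ordering described above concrete, here is a toy outline (invented types and stub functions, not PostgreSQL code) of the order in which a full leaf page tries to avoid splitting; the real logic lives in _bt_delete_or_dedup_one_page() later in this patch, with additional preconditions (such as the allequalimage requirement for deduplication) omitted here.

    /*
     * Toy outline only: the split-avoidance strategies in the order they are
     * tried.  The stubs always "fail" so the example falls through to a split.
     */
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct ToyPage { int dummy; } ToyPage;

    static bool toy_delete_lp_dead_items(ToyPage *page) { (void) page; return false; }
    static bool toy_bottomup_delete(ToyPage *page) { (void) page; return false; }
    static bool toy_deduplicate(ToyPage *page) { (void) page; return false; }
    static void toy_split_page(ToyPage *page) { (void) page; printf("split\n"); }

    static void
    toy_handle_full_leaf_page(ToyPage *page, bool indexunchanged, bool uniquedup)
    {
        /* Preferred strategy: drop items already marked LP_DEAD */
        if (toy_delete_lp_dead_items(page))
            return;

        /* Bottom-up deletion is only worth trying when version churn is suspected */
        if ((indexunchanged || uniquedup) && toy_bottomup_delete(page))
            return;

        /* Deduplication can buy time when old snapshots block deletion */
        if (toy_deduplicate(page))
            return;

        /* Nothing helped, so the split really is necessary */
        toy_split_page(page);
    }

    int
    main(void)
    {
        ToyPage     page = {0};

        toy_handle_full_leaf_page(&page, true, false);
        return 0;
    }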
+ Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 9e535124c4..db86541f42 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -16,13 +16,17 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "miscadmin.h" #include "utils/rel.h" +static void _bt_bottomup_finish_pending(Page page, TM_IndexDeleteOp *delstate, + BTDedupState state); static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem); static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz); +static int _bt_indexdelete_cmp(const void *a, const void *b); #ifdef USE_ASSERT_CHECKING static bool _bt_posting_valid(IndexTuple posting); #endif @@ -267,6 +271,325 @@ _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, pfree(state); } +/* + * Perform bottom-up index deletion pass. + * + * See if duplicate index tuples are eligible to be deleted by accessing + * visibility information from the tableam. Give up if we have to access more + * than a few tableam blocks. Caller tries to avoid "unnecessary" page splits + * (splits driven only by version churn) by calling here when it looks like + * that's about to happen. It's normal for there to be a lot of calls here + * for pages that are constantly at risk of an unnecessary split. + * + * Each failure to delete a duplicate/promising tuple here is a kind of + * learning experience. It results in caller falling back on splitting the + * page (or on a deduplication pass), discouraging future calls back here for + * the same key space range covered by a failed page (or at least discouraging + * processing the original duplicates in case where caller falls back on a + * successful deduplication pass). We converge on the most effective strategy + * for each page in the index over time. + * + * Returns true on success, in which case caller can assume page split will be + * avoided for a reasonable amount of time. Returns false when caller should + * deduplicate the page (if possible at all). + * + * Note: occasionally a true return value does not actually indicate that any + * items could be deleted. It might just indicate that caller should not go + * on to perform a deduplication pass. Caller is not expected to care about + * the difference. + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. 
+ */ +bool +_bt_bottomup_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz, + bool checkingunique) +{ + OffsetNumber offnum, + minoff, + maxoff, + postingidxoffnum; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTDedupState state; + TM_IndexDeleteOp delstate; + bool neverdedup = false; + TransactionId latestRemovedXid; + int ndeletable, + nupdatable; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Initialize deduplication state */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = BLCKSZ; /* "posting list size" not a concern */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + /* + * Initialize tableam state that describes bottom-up index deletion + * operation. + * + * We will ask tableam to free 1/16 of BLCKSZ. We don't usually expect to + * have to free much space each call here in order to avoid page splits. + * We don't want to be too aggressive since in general the tableam will + * have to access more table blocks when we ask for more free space. In + * general we try to be conservative about what we ask for (though not too + * conservative), while leaving it up to the tableam to ramp up the number + * of tableam blocks accessed when conditions in the table structure + * happen to favor it. + * + * We expect to end up back here again and again for any leaf page that is + * more or less constantly at risk of unnecessary page splits -- in fact + * that's what happens when bottom-up deletion really helps. We must + * avoid thrashing when this becomes very frequent at the level of an + * individual page. Our free space target helps with that. It balances + * the costs and benefits over time and across related bottom-up deletion + * passes. 
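A small worked example of that free space target, assuming the default 8192 byte block size (illustrative only; toy_max() mirrors the behavior of PostgreSQL's Max() macro, and the "later success threshold" refers to the check at the end of _bt_bottomup_pass()):

    #include <stdio.h>

    #define TOY_BLCKSZ 8192

    static size_t toy_max(size_t a, size_t b) { return (a > b) ? a : b; }

    int
    main(void)
    {
        size_t      newitemsz = 32;     /* hypothetical incoming tuple + line pointer */

        /* What we ask the tableam to free: 1/16 of the page, or the new item if larger */
        size_t      target = toy_max(TOY_BLCKSZ / 16, newitemsz);

        /* What the pass later treats as enough free space to skip deduplication */
        size_t      enough = toy_max(target / 2, newitemsz);

        printf("target = %zu bytes, later success threshold = %zu bytes\n",
               target, enough);
        return 0;
    }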
+ */ + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + delstate.targetfreespace = Max(BLCKSZ / 16, newitemsz); + + /* Now remember details of the page in the state we'll pass to tableam */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* Tuple is equal; just added its TIDs to pending interval */ + } + else + { + /* Finalize interval -- move its TIDs to bottom-up state */ + _bt_bottomup_finish_pending(page, &delstate, state); + + /* itup starts new pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + /* Finalize final interval -- move its TIDs to bottom-up state */ + _bt_bottomup_finish_pending(page, &delstate, state); + + /* + * Remember to tell caller to never deduplicate later on, regardless of + * how much space we free when there are no duplicates on existing page. + * + * Note: We sometimes proceed with calling table_index_delete_check() with + * no promising tuples. This is possible in cases with a unique index + * that caller just erased LP_DEAD items on, as well as cases where we + * deduplicated just a moment ago. We finish what we started. The + * tableam has its own heuristics that it can fall back on, and so it + * still has some chance of success. + */ + if (state->nintervals == 0) + neverdedup = true; + + /* Done with dedup state */ + pfree(state->htids); + pfree(state); + + /* Now use tableam interface to determine which tuples to delete */ + latestRemovedXid = table_index_delete_check(heapRel, &delstate); + + if (delstate.ndeltids == 0) + { + /* The tableam has nothing for us */ + pfree(delstate.deltids); + pfree(delstate.status); + + if (neverdedup) + return true; + + return false; + } + + /* + * By here we know that we have at least one deletable index tuple (or + * posting list's TID) in final deltids array. All that remains is to + * construct a leaf-page-wise description of what _bt_delitems_delete() + * needs to do to physically delete index tuples from the page. + * + * Sort deltids array (which is typically much smaller now) in the order + * expected by loop: the original leaf-page-wise order (the order the + * array was in before the tableam sorted it for its own reasons). 
+ */ + qsort(delstate.deltids, delstate.ndeltids, sizeof(TM_IndexDelete), + _bt_indexdelete_cmp); + postingidxoffnum = InvalidOffsetNumber; + ndeletable = 0; + nupdatable = 0; + for (int i = 0; i < delstate.ndeltids; i++) + { + TM_IndexStatus *dstatus = delstate.status + delstate.deltids[i].id; + OffsetNumber idxoffnum = dstatus->idxoffnum; + ItemId itemid = PageGetItemId(page, idxoffnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + int tidi, + nitem; + BTVacuumPosting vacposting; + + if (idxoffnum == postingidxoffnum) + { + /* + * This deltid entry is a TID from a posting list tuple that has + * already been completely processed (since we process all of a + * posting list's TIDs together, at once) + */ + Assert(BTreeTupleIsPosting(itup)); + continue; + } + + if (!BTreeTupleIsPosting(itup)) + { + /* Plain non-pivot tuple */ + Assert(ItemPointerEquals(&itup->t_tid, &delstate.deltids[i].tid)); + if (dstatus->deleteitup) + deletable[ndeletable++] = idxoffnum; + continue; + } + + /* + * Posting list tuple. Process all of its TIDs together, at once. + * + * tidi is a posting-list-TID local iterator for the deltids array. We're going + * to peek at later entries in the array here. Remember to skip + * over the itup-related entries that we peek at here later on. We + * should not do anything more with them when we get back to the top of + * the outermost deltids loop (we should just skip them). + * + * Innermost loop exploits the fact that both itup's TIDs and the + * entries from the array (whose TIDs came from itup) are in ascending + * TID order. We avoid unnecessary TID comparisons by starting each + * execution of the innermost loop at the point where the previous + * execution (for the previous TID from itup) left off. + */ + postingidxoffnum = idxoffnum; /* Remember: process itup once only */ + tidi = i; /* Initialize for itup's first TID */ + vacposting = NULL; /* Describes what to do with itup */ + nitem = BTreeTupleGetNPosting(itup); + for (int j = 0; j < nitem; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + int cmp = -1; + + for (; tidi < delstate.ndeltids; tidi++) + { + TM_IndexDelete *tcdeltid = &delstate.deltids[tidi]; + TM_IndexStatus *tdstatus = (delstate.status + tcdeltid->id); + + /* Stop when we get to first entry beyond itup's entries */ + Assert(tdstatus->idxoffnum >= idxoffnum); + if (tdstatus->idxoffnum != idxoffnum) + break; + + /* Skip any non-deletable entries for itup */ + if (!tdstatus->deleteitup) + continue; + + /* Have we found matching deletable entry for htid? */ + cmp = ItemPointerCompare(htid, &tcdeltid->tid); + + /* Keep going until equal or greater tid from array located */ + if (cmp <= 0) + break; + } + + /* Final check on htid: must match a deletable array entry */ + if (cmp != 0) + continue; + + if (vacposting == NULL) + { + /* + * First deletable TID for itup found. Start maintaining + * metadata describing which TIDs to delete from itup.
+ */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + vacposting->itup = itup; + vacposting->updatedoffset = idxoffnum; + vacposting->ndeletedtids = 0; + } + + /* htid will be deleted from itup */ + vacposting->deletetids[vacposting->ndeletedtids++] = j; + } + + if (vacposting == NULL) + { + /* No TIDs to delete from itup -- do nothing */ + } + else if (vacposting->ndeletedtids == nitem) + { + /* Straight delete of itup (to delete all TIDs) */ + deletable[ndeletable++] = idxoffnum; + /* Turns out we won't need granular information */ + pfree(vacposting); + } + else + { + /* Delete some but not all TIDs from itup */ + Assert(vacposting->ndeletedtids > 0 && + vacposting->ndeletedtids < nitem); + updatable[nupdatable++] = vacposting; + } + } + + /* Done with bottom-up deletion state */ + pfree(delstate.deltids); + pfree(delstate.status); + + /* + * Go through with deleting TIDs that we found are safe to delete. + * + * No MarkBufferDirtyHint() call is needed here, since we don't ever mark + * line pointers LP_DEAD. Any and all modifications to the page are made + * in the critical section in _bt_delitems_delete(). + */ + _bt_delitems_delete(rel, buf, true, latestRemovedXid, + deletable, ndeletable, updatable, nupdatable, + heapRel); + + /* be tidy */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); + + /* Carry out earlier decision to have caller avoid deduplication now */ + if (neverdedup) + return true; + + /* Don't dedup when we won't end up back here any time soon anyway */ + return PageGetExactFreeSpace(page) >= + Max(delstate.targetfreespace / 2, newitemsz); +} + /* * Create a new pending posting list tuple based on caller's base tuple. * @@ -452,6 +775,164 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) return spacesaving; } +/* + * Finalize interval during bottom-up index deletion. + * + * Determines which TIDs are to be marked promising based on heuristics. + */ +static void +_bt_bottomup_finish_pending(Page page, TM_IndexDeleteOp *delstate, + BTDedupState state) +{ + bool dupinterval = (state->nitems > 1); + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + /* + * All TIDs from all tuples are at least recorded in state. Tuples are + * marked promising when they're duplicates (i.e. when they appear in an + * interval with more than one item, as when we expect to create a new + * posting list tuple in the deduplication case). + * + * It's easy to see what this means in the plain non-pivot tuple case: + * TIDs from duplicate plain tuples are promising. Posting list tuples + * are more subtle. We ought to do something with posting list tuples, + * though plain tuples tend to be more promising targets. (Plain tuples + * are the most likely to be dead/deletable because they suggest version + * churn, and they allow us to free more space when we actually succeed.)
+ */ + for (int i = 0; i < state->nitems; i++) + { + OffsetNumber offnum = state->baseoff + i; + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *cdeltid; + TM_IndexStatus *dstatus; + + cdeltid = &delstate->deltids[delstate->ndeltids]; + dstatus = &delstate->status[delstate->ndeltids]; + + if (!BTreeTupleIsPosting(itup)) + { + /* Easy case: A plain non-pivot tuple's TID */ + cdeltid->tid = itup->t_tid; + cdeltid->id = delstate->ndeltids; + dstatus->idxoffnum = offnum; + dstatus->ispromising = dupinterval; + dstatus->deleteitup = false; /* for now */ + dstatus->tupsize = + ItemIdGetLength(itemid) + sizeof(ItemIdData); + delstate->ndeltids++; + } + else + { + /* + * Harder case: A posting list tuple's TIDs (multiple TIDs). + * + * Only a single TID from a posting list tuple may be promising, + * and only when it appears in a duplicate tuple (just like the plain + * tuple case). In general there is a good chance that the + * posting list tuple relates to multiple logical rows, rather + * than multiple versions of just one logical row. (It can only + * be the latter case when a previous bottom-up deletion pass + * failed, necessitating a deduplication pass, which isn't all + * that common.) + * + * There is a pretty good chance that at least one of the logical + * rows from the posting list was updated, and so had a successor + * version (about as good a chance as in the regular tuple + * case, at least). We should at least try to follow the regular + * tuple case while making the conservative assumption that there + * can only be one affected logical row per posting list tuple. We + * do that by picking one TID when it appears to be from the + * predominant tableam block in the posting list (if any one + * tableam block predominates). The approach we take is to choose + * either the first or the last TID in the posting list (if any at + * all). We go with whichever one is on the same tableam block as + * the middle TID (and only the first TID when both the first + * and last TIDs relate to the same tableam block -- we could + * easily be too aggressive here). + * + * If it turns out that there are multiple old versions of a + * single logical table row, we still have a pretty good chance of + * being able to delete them this way. We don't want to give too + * strong a signal to the tableam. But we should always try to + * give some useful hints. Even cases with considerable + * uncertainty can consistently avoid an unnecessary page split, + * in part because the tableam will have tricks of its own for + * figuring out where to look in marginal cases.
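The block comparison just described can be sketched independently of nbtree. ToyTid is an invented stand-in for ItemPointerData; the real code compares the blocks of BTreeTupleGetHeapTID(), the posting list entry at nitem / 2, and BTreeTupleGetMaxHeapTID(), and only does so for duplicate intervals.

    /*
     * Standalone sketch of the "at most one promising TID per posting list"
     * heuristic.  TIDs are assumed to be in ascending order, as in a real
     * posting list.
     */
    #include <stdio.h>
    #include <stdint.h>

    typedef struct ToyTid
    {
        uint32_t    block;          /* tableam block number */
        uint16_t    offset;         /* item offset within the block */
    } ToyTid;

    /*
     * Returns the index of the single promising TID, or -1 when no one
     * tableam block predominates.
     */
    static int
    toy_promising_posting_tid(const ToyTid *tids, int nitem)
    {
        uint32_t    minblock = tids[0].block;
        uint32_t    midblock = tids[nitem / 2].block;
        uint32_t    maxblock = tids[nitem - 1].block;

        if (minblock == midblock)
            return 0;               /* first TID shares block with the middle TID */
        if (midblock == maxblock)
            return nitem - 1;       /* last TID shares block with the middle TID */
        return -1;
    }

    int
    main(void)
    {
        ToyTid      tids[] = {{10, 1}, {10, 7}, {10, 9}, {57, 2}, {80, 5}};

        printf("promising TID index: %d\n",
               toy_promising_posting_tid(tids, 5));     /* prints 0 */
        return 0;
    }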
+ */ + int nitem = BTreeTupleGetNPosting(itup); + bool firstpromise = false; + bool lastpromise = false; + + Assert(_bt_posting_valid(itup)); + + if (dupinterval) + { + /* Figure out if there really should be promising TIDs */ + BlockNumber minblocklist, + midblocklist, + maxblocklist; + ItemPointer mintid, + midtid, + maxtid; + + mintid = BTreeTupleGetHeapTID(itup); + midtid = BTreeTupleGetPostingN(itup, nitem / 2); + maxtid = BTreeTupleGetMaxHeapTID(itup); + minblocklist = ItemPointerGetBlockNumber(mintid); + midblocklist = ItemPointerGetBlockNumber(midtid); + maxblocklist = ItemPointerGetBlockNumber(maxtid); + + firstpromise = (minblocklist == midblocklist); + lastpromise = (!firstpromise && midblocklist == maxblocklist); + } + + /* No more than one TID from itup can be promising */ + Assert(!(firstpromise && lastpromise)); + + for (int p = 0; p < nitem; p++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, p); + + cdeltid->tid = *htid; + cdeltid->id = delstate->ndeltids; + dstatus->idxoffnum = offnum; + dstatus->ispromising = false; + + if ((firstpromise && p == 0) || + (lastpromise && p == nitem - 1)) + dstatus->ispromising = true; + + dstatus->deleteitup = false; /* for now */ + dstatus->tupsize = sizeof(ItemPointerData) + 1; + delstate->ndeltids++; + + cdeltid++; + dstatus++; + } + } + } + + if (dupinterval) + { + /* + * Maintain interval state for consistency with true deduplication + * case + */ + state->intervals[state->nintervals].nitems = state->nitems; + state->nintervals++; + } + + /* Reset state for next interval */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; +} + /* * Determine if page non-pivot tuples (data items) are all duplicates of the * same value -- if they are, deduplication's "single value" strategy should @@ -622,8 +1103,8 @@ _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) * Generate a replacement tuple by "updating" a posting list tuple so that it * no longer has TIDs that need to be deleted. * - * Used by VACUUM. Caller's vacposting argument points to the existing - * posting list tuple to be updated. + * Used by both VACUUM and bottom-up index deletion. Caller's vacposting + * argument points to the existing posting list tuple to be updated. * * On return, caller's vacposting argument will point to final "updated" * tuple, which will be palloc()'d in caller's memory context. @@ -765,6 +1246,26 @@ _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) return nposting; } +/* + * Comparator used by _bt_bottomup_pass() to restore deltids array back to its + * original sort order + */ +static int +_bt_indexdelete_cmp(const void *a, const void *b) +{ + TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a; + TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b; + + if (indexdelete1->id > indexdelete2->id) + return 1; + if (indexdelete1->id < indexdelete2->id) + return -1; + + Assert(false); + + return 0; +} + /* * Verify posting list invariants for "posting", which must be a posting list * tuple. Used within assertions. 
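For reference, a toy demonstration (not PostgreSQL code) of why sorting on the id field restores leaf-page order: ids are assigned in ascending page order when the deltids array is first built, so they double as a sort key no matter how the tableam rearranged the array in the meantime. The real comparator simply asserts that two ids are never equal.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct ToyIndexDelete
    {
        int         id;             /* assigned 0..n-1 in leaf-page order */
        unsigned    block;          /* tableam block the tableam sorted by */
    } ToyIndexDelete;

    static int
    toy_id_cmp(const void *a, const void *b)
    {
        const ToyIndexDelete *d1 = (const ToyIndexDelete *) a;
        const ToyIndexDelete *d2 = (const ToyIndexDelete *) b;

        return (d1->id > d2->id) - (d1->id < d2->id);
    }

    int
    main(void)
    {
        /* As if the tableam had re-sorted the entries by table block */
        ToyIndexDelete deltids[] = {{2, 7}, {0, 12}, {3, 12}, {1, 30}};
        int         n = 4;

        qsort(deltids, n, sizeof(ToyIndexDelete), toy_id_cmp);

        for (int i = 0; i < n; i++)
            printf("id=%d block=%u\n", deltids[i].id, deltids[i].block);
        return 0;
    }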
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 1ab98588c8..ed73a30456 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -37,6 +37,7 @@ static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, + bool indexunchanged, BTStack stack, Relation heapRel); static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack); @@ -61,7 +62,7 @@ static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool lpdeadonly, bool checkingunique, - bool uniquedup); + bool uniquedup, bool indexunchanged); /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. @@ -83,7 +84,8 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, */ bool _bt_doinsert(Relation rel, IndexTuple itup, - IndexUniqueCheck checkUnique, Relation heapRel) + IndexUniqueCheck checkUnique, Relation heapRel, + bool indexunchanged) { bool is_unique = false; BTInsertStateData insertstate; @@ -238,7 +240,7 @@ search: * checkingunique. */ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, - stack, heapRel); + indexunchanged, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, itup, insertstate.itemsz, newitemoff, insertstate.postingoff, false); @@ -777,6 +779,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * room for the new tuple, this function moves right, trying to find a * legal page that does.) * + * If 'indexunchanged' is true, this is for an UPDATE that didn't + * logically change the indexed value, but must nevertheless have a new + * entry to point to a successor version. This hint from the executor + * will influence our behavior when the page might have to be split and + * we must consider if it's avoidable. + * * On exit, insertstate buffer contains the chosen insertion page, and * the offset within that page is returned. If _bt_findinsertloc needed * to move right, the lock and pin on the original page are released, and @@ -793,6 +801,7 @@ static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, + bool indexunchanged, BTStack stack, Relation heapRel) { @@ -817,7 +826,7 @@ _bt_findinsertloc(Relation rel, if (itup_key->heapkeyspace) { /* Keep track of whether checkingunique duplicate seen */ - bool uniquedup = false; + bool uniquedup = indexunchanged; /* * If we're inserting into a unique index, we may have to walk right @@ -881,7 +890,8 @@ _bt_findinsertloc(Relation rel, */ if (PageGetFreeSpace(page) < insertstate->itemsz) _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, - checkingunique, uniquedup); + checkingunique, uniquedup, + indexunchanged); } else { @@ -923,7 +933,8 @@ _bt_findinsertloc(Relation rel, { /* Erase LP_DEAD items (won't deduplicate) */ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, - checkingunique, false); + checkingunique, false, + indexunchanged); if (PageGetFreeSpace(page) >= insertstate->itemsz) break; /* OK, now we have enough space */ @@ -977,7 +988,7 @@ _bt_findinsertloc(Relation rel, * This can only erase LP_DEAD items (it won't deduplicate). 
*/ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, - checkingunique, false); + checkingunique, false, indexunchanged); /* * Do new binary search. New insert location cannot overlap with any @@ -2609,15 +2620,35 @@ _bt_pgaddtup(Page page, * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting * a variety of operations. * - * There are two operations performed here: deleting items already marked - * LP_DEAD, and deduplication. If both operations fail to free enough space - * for the incoming item then caller will go on to split the page. We always - * attempt our preferred strategy (which is to delete items whose LP_DEAD bit - * are set) first. If that doesn't work out we move on to deduplication. + * There are three operations performed here: deleting items already marked + * LP_DEAD, deduplication, and bottom-up index deletion. If all three + * operations fail to free enough space for the incoming item then caller will + * go on to split the page. We always attempt our preferred strategy (which + * is to delete items whose LP_DEAD bit are set) first. If that doesn't work + * out we consider alternatives. Most calls here will not exhaustively + * attempt all three operations. Deduplication and bottom-up index deletion + * are relatively expensive operations, so we try to pick one or the other up + * front (whichever one seems better for this specific page). * - * Caller's checkingunique and uniquedup arguments help us decide if we should - * perform deduplication, which is primarily useful with low cardinality data, - * but can sometimes absorb version churn. + * Caller's checkingunique, uniquedup, and indexunchanged arguments help us + * decide which alternative strategy we should attempt (or attempt first). + * Deduplication is primarily useful with low cardinality data. Bottom-up + * index deletion is a backstop against version churn caused by repeated + * UPDATE statements where affected indexes don't receive logical changes + * (because an optimization like heapam's HOT cannot be applied in the + * tableam). But useful interplay between both techniques over time is + * sometimes possible. + * + * Deduplication can sometimes step in when bottom-up index deletion fails due + * to it simply being unsafe to delete old version tuples that accumulate on a + * leaf page (usually because of one old snapshot that might need the old + * versions, and thereby disrupts the cleanup of garbage tuples generally). + * Deduplication may buy time for bottom-up index deletion, which can + * ultimately succeed because the question of splitting a page affected by + * version churn is delayed long enough for the snapshot that held back + * deletion to go away naturally. Note that bottom-up deletion can perform + * granular deletion of posting list TIDs, just like VACUUM (but unlike our + * preferred strategy). * * Callers that only want us to look for/delete LP_DEAD items can ask for that * directly by passing true 'lpdeadonly' argument. 
@@ -2640,7 +2671,7 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool lpdeadonly, bool checkingunique, - bool uniquedup) + bool uniquedup, bool indexunchanged) { OffsetNumber deletable[MaxIndexTuplesPerPage]; int ndeletable = 0; @@ -2671,7 +2702,8 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, if (ndeletable > 0) { - _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + _bt_delitems_delete(rel, buffer, false, InvalidTransactionId, + deletable, ndeletable, NULL, 0, heapRel); insertstate->bounds_valid = false; /* Return when a page split has already been avoided */ @@ -2700,9 +2732,12 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, * We can get called in the checkingunique case when there is no reason to * believe that there are any duplicates on the page; we should at least * still check for LP_DEAD items. Now that we have, and now that it has - * not helped, give up and let caller split the page. Deduplication - * cannot be justified given there is no reason to think that there are - * duplicates. + * not helped, give up and let caller split the page. + * + * We give up because the other types of operations that might avoid a + * page split are also unlikely to work out, but are much more expensive + * to try. That cannot be justified given there is no reason to think + * that there are duplicates that we can target. */ if (checkingunique && !uniquedup) return; @@ -2710,6 +2745,25 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, /* Assume bounds about to be invalidated (this is almost certain now) */ insertstate->bounds_valid = false; + /* + * Perform bottom-up index deletion pass when executor hint indicated that + * incoming item is logically unchanged, or for a unique index that is + * known to have physical duplicates for some other reason. (There is a + * large overlap between these two cases for a unique index. It's worth + * having both triggering conditions in order to apply the optimization in + * the event of successive related INSERT and DELETE statements.) + * + * We'll go on to do a deduplication pass when a bottom-up pass either + * fails to delete an acceptable amount of free space (a non-trivial + * fraction of the page that typically exceeds the new item's size), or + * when we're dealing with low cardinality data that has relatively few + * tuples (with large posting lists). + */ + if ((indexunchanged || uniquedup) && + _bt_bottomup_pass(rel, buffer, heapRel, insertstate->itemsz, + checkingunique)) + return; + /* * Perform deduplication pass, though only when it is enabled for the * index and known to be safe (it must be an allequalimage index). diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index e192873f19..9a72de23a5 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1110,15 +1110,14 @@ _bt_page_recyclable(Page page) * sorted in ascending order. * * Routine deals with deleting TIDs when some (but not all) of the heap TIDs - * in an existing posting list item are to be removed by VACUUM. This works - * by updating/overwriting an existing item with caller's new version of the - * item (a version that lacks the TIDs that are to be deleted). + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). 
* * We record VACUUMs and b-tree deletes differently in WAL. Deletes must * generate their own latestRemovedXid by accessing the heap directly, whereas * VACUUMs rely on the initial heap scan taking care of it indirectly. Also, - * only VACUUM can perform granular deletes of individual TIDs in posting list - * tuples. + * we remove the VACUUM cycle ID from pages, which b-tree deletes don't do. */ void _bt_delitems_vacuum(Relation rel, Buffer buf, @@ -1188,7 +1187,11 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it * happens to already be set. It's important that we not interfere with - * garbage collection mechanisms that use _bt_delitems_delete(). Besides, + * garbage collection mechanisms that use _bt_delitems_delete(). + * + * Eagerly removing items with their LP_DEAD bit set seems unwise, because + * in practice bottom-up techniques do a good job of taking care of the + * problem at a rate that makes sense at a keyspace-local level. Plus * it'd just be messy. We'd have to explicitly log a latestRemovedXid * cutoff, just like _bt_delitems_delete(). Accessing tableam blocks * again from here is rather unappealing. @@ -1272,36 +1275,119 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * Delete item(s) from a btree leaf page during single-page cleanup. * * This routine assumes that the caller has pinned and write locked the - * buffer. Also, the given deletable array *must* be sorted in ascending - * order. + * buffer. Also, the given deletable and updatable arrays *must* be sorted in + * ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). * * This is nearly the same as _bt_delitems_vacuum as far as what it does to * the page, but it needs to generate its own latestRemovedXid by accessing * the heap. This is used by the REDO routine to generate recovery conflicts. - * Also, it doesn't handle posting list tuples unless the entire tuple can be - * deleted as a whole (since there is only one LP_DEAD bit per line pointer). + * Though note that bottom-up index deletion caller will provide its own + * latestRemovedXid, since it's convenient for it to determine that at the + * same point that it determines that the items are dead (it won't set LP_DEAD + * items on leaf page at all). Also, we don't clear page's VACUUM cycle ID. 
*/ void _bt_delitems_delete(Relation rel, Buffer buf, + bool bottomup, TransactionId bottomupXid, OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable, Relation heapRel) { Page page = BufferGetPage(buf); BTPageOpaque opaque; TransactionId latestRemovedXid = InvalidTransactionId; + Size itemsz; + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; /* Shouldn't be called unless there's something to do */ - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + /* Shouldn't update posting lists unless it's for bottom-up caller */ + Assert(nupdatable == 0 || bottomup); if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) - latestRemovedXid = - _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable); + { + if (!bottomup) + latestRemovedXid = + _bt_xid_horizon(rel, heapRel, page, deletable, + ndeletable); + else + latestRemovedXid = bottomupXid; + } + + for (int i = 0; i < nupdatable; i++) + { + /* Replace work area IndexTuple with updated version */ + _bt_update_posting(updatable[i]); + + /* Maintain array of updatable page offsets for WAL record */ + updatedoffsets[i] = updatable[i]->updatedoffset; + } + + /* XLOG stuff -- allocate and fill buffer before critical section */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + + itemsz = SizeOfBtreeUpdate + + vacposting->ndeletedtids * sizeof(uint16); + updatedbuflen += itemsz; + } + + updatedbuf = palloc(updatedbuflen); + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + xl_btree_update update; + + update.ndeletedtids = vacposting->ndeletedtids; + memcpy(updatedbuf + offset, &update.ndeletedtids, + SizeOfBtreeUpdate); + offset += SizeOfBtreeUpdate; + + itemsz = update.ndeletedtids * sizeof(uint16); + memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); + offset += itemsz; + } + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID, @@ -1329,6 +1415,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, xlrec_delete.latestRemovedXid = latestRemovedXid; xlrec_delete.ndeleted = ndeletable; + xlrec_delete.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1339,8 +1426,16 @@ _bt_delitems_delete(Relation rel, Buffer buf, * When XLogInsert stores the whole buffer, the array need not be * stored too. 
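The updated-tuples metadata built above is a stream of variable-length entries: a count followed by that many uint16 positions. Below is a standalone sketch (ToyUpdate is an invented stand-in for xl_btree_update, and the sizes are illustrative) of that packing and of how a reader steps through it, which is the same walk that btree_xlog_delete() performs further down in this patch.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    typedef struct ToyUpdate
    {
        uint16_t    ndeletedtids;   /* number of uint16 positions that follow */
    } ToyUpdate;

    #define SizeOfToyUpdate sizeof(uint16_t)

    int
    main(void)
    {
        uint16_t    first[] = {1, 4};   /* delete TID positions 1 and 4 */
        uint16_t    second[] = {0};     /* delete TID position 0 */
        uint16_t    buf[32];            /* uint16 array keeps accesses aligned */
        char       *p = (char *) buf;
        size_t      offset = 0;
        ToyUpdate   u1 = {2},
                    u2 = {1};
        ToyUpdate  *update;

        /* Serialize, as the WAL-logging side does */
        memcpy(p + offset, &u1.ndeletedtids, SizeOfToyUpdate);
        offset += SizeOfToyUpdate;
        memcpy(p + offset, first, sizeof(first));
        offset += sizeof(first);
        memcpy(p + offset, &u2.ndeletedtids, SizeOfToyUpdate);
        offset += SizeOfToyUpdate;
        memcpy(p + offset, second, sizeof(second));
        offset += sizeof(second);

        /* Walk it back, as the redo routine does */
        update = (ToyUpdate *) p;
        for (int i = 0; i < 2; i++)
        {
            uint16_t   *tids = (uint16_t *) ((char *) update + SizeOfToyUpdate);

            printf("entry %d deletes %d position(s), first is %d\n",
                   i, (int) update->ndeletedtids, (int) tids[0]);

            /* advance past this entry's count and its position array */
            update = (ToyUpdate *) ((char *) update + SizeOfToyUpdate +
                                    update->ndeletedtids * sizeof(uint16_t));
        }
        return 0;
    }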
*/ - XLogRegisterBufData(0, (char *) deletable, - ndeletable * sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); @@ -1348,6 +1443,13 @@ _bt_delitems_delete(Relation rel, Buffer buf, } END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples generated by calling _bt_update_posting() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); } /* diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 8eeb7bb64e..bc82cd4e7d 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -209,7 +209,7 @@ btinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), values, isnull); itup->t_tid = *ht_ctid; - result = _bt_doinsert(rel, itup, checkUnique, heapRel); + result = _bt_doinsert(rel, itup, checkUnique, heapRel, indexunchanged); pfree(itup); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2f5f14e527..b6a60ce1cb 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2414,6 +2414,11 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This weaker guarantee is good enough for nbtsplitloc.c caller, since false * negatives generally only have the effect of making leaf page splits use a * more balanced split point. + * + * The differences between this function and _bt_keep_natts may actually be + * helpful to the bottom-up index deletion caller. A bottom-up pass tries to + * find old versions left behind by UPDATEs, but only when those UPDATEs + * didn't logically modify columns that are covered by the index. 
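A minimal sketch of the duplicate test that the bottom-up pass builds on: _bt_keep_natts_fast() returns one more than the number of leading attributes that compare as binary-equal, and the bottom-up caller treats a result greater than nkeyatts as "duplicate". The toy below uses plain ints and deliberately ignores NULL handling and datum_image_eq() details.

    #include <stdbool.h>
    #include <stdio.h>

    static int
    toy_keep_natts_fast(const int *lastleft, const int *firstright, int nkeyatts)
    {
        int         keepnatts = 1;

        for (int attnum = 1; attnum <= nkeyatts; attnum++)
        {
            if (lastleft[attnum - 1] != firstright[attnum - 1])
                break;
            keepnatts++;
        }
        return keepnatts;
    }

    int
    main(void)
    {
        int         a[] = {42, 7};
        int         b[] = {42, 7};
        int         nkeyatts = 2;

        /* Same rule as the bottom-up pass: "> nkeyatts" means duplicate */
        bool        duplicate = toy_keep_natts_fast(a, b, nkeyatts) > nkeyatts;

        printf("duplicate: %s\n", duplicate ? "yes" : "no");
        return 0;
    }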
*/ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 5135b800af..d2142f3e89 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -675,7 +675,56 @@ btree_xlog_delete(XLogReaderState *record) page = (Page) BufferGetPage(buffer); - PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + for (int i = 0; i < xlrec->nupdated; i++) + { + BTVacuumPosting vacposting; + IndexTuple origtuple; + ItemId itemid; + Size itemsz; + + itemid = PageGetItemId(page, updatedoffsets[i]); + origtuple = (IndexTuple) PageGetItem(page, itemid); + + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + updates->ndeletedtids * sizeof(uint16)); + vacposting->updatedoffset = updatedoffsets[i]; + vacposting->itup = origtuple; + vacposting->ndeletedtids = updates->ndeletedtids; + memcpy(vacposting->deletetids, + (char *) updates + SizeOfBtreeUpdate, + updates->ndeletedtids * sizeof(uint16)); + + _bt_update_posting(vacposting); + + /* Overwrite updated version of tuple */ + itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) vacposting->itup, itemsz)) + elog(PANIC, "failed to update partially dead item"); + + pfree(vacposting->itup); + pfree(vacposting); + + /* advance to next xl_btree_update from array */ + updates = (xl_btree_update *) + ((char *) updates + SizeOfBtreeUpdate + + updates->ndeletedtids * sizeof(uint16)); + } + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* Mark the page as not containing any LP_DEAD items */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); -- 2.25.1