From 4fd6fa5c21b79f56f5d3f8f8881778a3d8fb82c5 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 25 Sep 2019 10:08:53 -0700 Subject: [PATCH v20 1/2] Add deduplication to nbtree --- contrib/amcheck/verify_nbtree.c | 164 ++++- src/backend/access/common/reloptions.c | 11 +- src/backend/access/index/genam.c | 4 + src/backend/access/nbtree/README | 74 +- src/backend/access/nbtree/nbtinsert.c | 860 +++++++++++++++++++++++- src/backend/access/nbtree/nbtpage.c | 211 +++++- src/backend/access/nbtree/nbtree.c | 175 ++++- src/backend/access/nbtree/nbtsearch.c | 244 ++++++- src/backend/access/nbtree/nbtsort.c | 144 +++- src/backend/access/nbtree/nbtsplitloc.c | 49 +- src/backend/access/nbtree/nbtutils.c | 326 ++++++++- src/backend/access/nbtree/nbtxlog.c | 222 +++++- src/backend/access/rmgrdesc/nbtdesc.c | 28 +- src/include/access/nbtree.h | 319 ++++++++- src/include/access/nbtxlog.h | 68 +- src/include/access/rmgrlist.h | 2 +- src/tools/valgrind.supp | 21 + 17 files changed, 2732 insertions(+), 190 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 05e7d678ed..bdb0ede577 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_logical_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -419,12 +420,13 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. 
That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxPostingIndexTuplesPerPage / 3 "logical" tuples. heapallindexed + * verification fingerprints posting list heap TIDs as plain non-pivot + * tuples, complete with index keys. This allows its heap scan to + * behave as if posting lists do not exist. */ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxPostingIndexTuplesPerPage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +926,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -994,29 +997,73 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. 
*/ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { char *itid, *htid; + ItemPointer tid = BTreeTupleGetHeapTID(itup); itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. + */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1074,12 +1121,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - 
bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_logical_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1154,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,6 +1195,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1150,6 +1221,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1164,11 +1236,13 @@ bt_target_page_check(BtreeCheckState *state) *htid, *nitid, *nhtid; + ItemPointer tid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1251,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1189,10 +1265,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1953,10 +2029,10 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). 
- * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. */ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2045,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2110,30 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "logical" tuple for nth posting list entry. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple logical index tuples are folded together into one + * physical posting list index tuple when convenient. + * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "logical" + * tuples. Each logical tuple must be fingerprinted separately -- there must + * be one logical tuple for each corresponding Bloom filter probe during the + * heap scan. + * + * Note: Caller needs to call bt_normalize_tuple() with returned tuple. 
+ */
+static inline IndexTuple
+bt_posting_logical_tuple(IndexTuple itup, int n)
+{
+	Assert(BTreeTupleIsPosting(itup));
+
+	/* Returns non-posting-list tuple */
+	return BTreeFormPostingTuple(itup, BTreeTupleGetPostingN(itup, n), 1);
+}
+
 /*
  * Search for itup in index, starting from fast root page.  itup must be a
  * non-pivot tuple.  This is only supported with heapkeyspace indexes, since
@@ -2087,6 +2190,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		insertstate.itup = itup;
 		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
 		insertstate.itup_key = key;
+		insertstate.postingoff = 0;
 		insertstate.bounds_valid = false;
 		insertstate.buf = lbuf;
 
@@ -2094,7 +2198,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		offnum = _bt_binsrch_insert(state->rel, &insertstate);
 		/* Compare first >= matching item on leaf page, if any */
 		page = BufferGetPage(lbuf);
+		/* Should match on first heap TID when tuple has a posting list */
 		if (offnum <= PageGetMaxOffsetNumber(page) &&
+			insertstate.postingoff <= 0 &&
 			_bt_compare(state->rel, key, page, offnum) == 0)
 			exists = true;
 		_bt_relbuf(state->rel, lbuf);
@@ -2560,14 +2666,18 @@ static inline ItemPointer
 BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
 							bool nonpivot)
 {
-	ItemPointer result = BTreeTupleGetHeapTID(itup);
+	ItemPointer result;
 	BlockNumber targetblock = state->targetblock;
 
-	if (result == NULL && nonpivot)
+	/* Shouldn't be called with !heapkeyspace index */
+	Assert(state->heapkeyspace);
+	if (BTreeTupleIsPivot(itup) == nonpivot)
 		ereport(ERROR,
 				(errcode(ERRCODE_INDEX_CORRUPTED),
 				 errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
 						targetblock, RelationGetRelationName(state->rel))));
 
+	result = BTreeTupleGetHeapTID(itup);
+
 	return result;
 }
 
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index b5072c00fe..e6448e4a86 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c @@ -158,6 +158,15 @@ static relopt_bool boolRelOpts[] = }, true }, + { + { + "deduplication", + "Enables deduplication on btree index leaf pages", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + true + }, /* list terminator */ {{NULL}} }; @@ -1513,8 +1522,6 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) offsetof(StdRdOptions, user_catalog_table)}, {"parallel_workers", RELOPT_TYPE_INT, offsetof(StdRdOptions, parallel_workers)}, - {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}, {"vacuum_index_cleanup", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, vacuum_index_cleanup)}, {"vacuum_truncate", RELOPT_TYPE_BOOL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 2599b5d342..6e1dc596e1 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 6db203e75c..54cb9db49d 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). 
Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -710,6 +713,75 @@ the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid or at least delay page splits. Deduplication alters +the physical representation of tuples without changing the logical contents +of the index, and without adding overhead to read queries. Non-pivot +tuples are folded together into a single physical tuple with a posting list +(a simple array of heap TIDs with the standard item pointer format). +Deduplication is always applied lazily, at the point where it would +otherwise be necessary to perform a page split. It occurs only when +LP_DEAD items have been removed, as our last line of defense against +splitting a leaf page. We can set the LP_DEAD bit with posting list +tuples, though only when all table tuples are known dead. (Bitmap scans +cannot perform LP_DEAD bit setting, and are the common case with indexes +that contain lots of duplicates, so this downside is considered +acceptable.) + +Large groups of logical duplicates tend to appear together on the same leaf +page due to the special duplicate logic used when choosing a split point. +This facilitates lazy/dynamic deduplication. Deduplication can reliably +deduplicate a large localized group of duplicates before it can span +multiple leaf pages. 
Posting list tuples are subject to the same 1/3 of a +page restriction as any other tuple. + +Lazy deduplication allows the page space accounting used during page splits +to have absolutely minimal special case logic for posting lists. A posting +list can be thought of as extra payload that suffix truncation will +reliably truncate away as needed during page splits, just like non-key +columns from an INCLUDE index tuple. An incoming tuple (which might cause +a page split) can always be thought of as a non-posting-list tuple that +must be inserted alongside existing items, without needing to consider +deduplication. Most of the time, that's what actually happens: incoming +tuples are either not duplicates, or are duplicates with a heap TID that +doesn't overlap with any existing posting list tuple. When the incoming +tuple really does overlap with an existing posting list, a posting list +split is performed. Posting list splits work in a way that more or less +preserves the illusion that all incoming tuples do not need to be merged +with any existing posting list tuple. + +Posting list splits work by "overriding" the details of the incoming tuple. +The heap TID of the incoming tuple is altered to make it match the +rightmost heap TID from the existing/originally overlapping posting list. +The offset number that the new/incoming tuple is to be inserted at is +incremented so that it will be inserted to the right of the existing +posting list. The insertion (or page split) operation that completes the +insert does one extra step: an in-place update of the posting list. The +update changes the posting list such that the "true" heap TID from the +original incoming tuple is now contained in the posting list. We make +space in the posting list by removing the heap TID that became the new +item. The size of the posting list won't change, and so the page split +space accounting does not need to care about posting lists. 
Also, overall +space utilization is improved by keeping existing posting lists large. + +The representation of posting lists is identical to the posting lists used +by GIN, so it would be straightforward to apply GIN's varbyte encoding +compression scheme to individual posting lists. Posting list compression +would break the assumptions made by posting list splits about page space +accounting, though, so it's not clear how compression could be integrated +with nbtree. Besides, posting list compression does not offer a compelling +trade-off for nbtree, since in general nbtree is optimized for consistent +performance with many concurrent readers and writers. A major goal of +nbtree's lazy approach to deduplication is to limit the performance impact +of deduplication with random updates. Even concurrent append-only inserts +of the same key value will tend to have inserts of individual index tuples +in an order that doesn't quite match heap TID order. In general, delaying +deduplication avoids many unnecessary posting list splits, and minimizes +page level fragmentation. 
+ Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b84bf1c3df..3d213dfd2d 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -47,21 +47,27 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, OffsetNumber postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); +static void _bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. + * By here, itup is filled in, including the TID. Caller should be + * prepared for us to scribble on 'itup'. * * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this * will allow duplicates. 
Otherwise (UNIQUE_CHECK_YES or @@ -123,6 +129,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, /* PageAddItem will MAXALIGN(), but be consistent */ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = itup_key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; @@ -300,7 +307,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -428,14 +435,36 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, if (!ItemIdIsDead(curitemid)) { ItemPointerData htid; + bool posting; bool all_dead; + bool posting_all_dead; + int npost; + if (_bt_compare(rel, itup_key, page, offset) != 0) break; /* we're past all the equal tuples */ /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + + if (!BTreeTupleIsPosting(curitup)) + { + htid = curitup->t_tid; + posting = false; + posting_all_dead = true; + } + else + { + posting = true; + /* Initial assumption */ + posting_all_dead = true; + } + + npost = 0; + doposttup: + if (posting) + htid = *BTreeTupleGetPostingN(curitup, npost); + /* * If we are doing a recheck, we expect to find the tuple we @@ -446,6 +475,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, ItemPointerCompare(&htid, &itup->t_tid) == 0) { found = true; + posting_all_dead = false; + if (posting) + goto nextpost; } /* @@ -511,8 +543,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. 
*/ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -570,7 +601,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && !posting) { /* * The conflicting tuple (or whole HOT chain) is dead to @@ -589,6 +620,35 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + else if (posting) + { + nextpost: + if (!all_dead) + posting_all_dead = false; + + /* Iterate over single posting list tuple */ + npost++; + if (npost < BTreeTupleGetNPosting(curitup)) + goto doposttup; + + /* + * Mark posting tuple dead if all hot chains whose root is + * contained in posting tuple have tuples that are all + * dead + */ + if (posting_all_dead) + { + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(insertstate->buf, true); + } + + /* Move on to next index tuple */ + } } } @@ -689,6 +749,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber location; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -751,13 +812,26 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, and if the index + * deduplication is both possible and enabled, try deduplication. 
*/ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + } + + if (insertstate->itup_key->dedup_is_possible && + BtreeGetDoDedupOption(rel) && + PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -839,7 +913,38 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Insertion is not prepared for the case where an LP_DEAD posting list + * tuple must be split. In the unlikely event that this happens, call + * _bt_dedup_one_page() to force it to kill all LP_DEAD items. + */ + if (unlikely(insertstate->postingoff == -1)) + { + Assert(insertstate->itup_key->dedup_is_possible); + + /* + * Don't check if the option is enabled, since no actual deduplication + * will be done, just cleanup. 
+ */ + _bt_dedup_one_page(rel, insertstate->buf, heapRel, insertstate->itup, + 0, checkingunique); + Assert(!P_HAS_GARBAGE(lpageop)); + + /* Must reset insertstate ahead of new _bt_binsrch_insert() call */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Might still have to split some other posting list now, but that + * should never be LP_DEAD + */ + Assert(insertstate->postingoff >= 0); + } + + return location; } /* @@ -900,15 +1005,81 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) insertstate->bounds_valid = false; } +/* + * Form a new posting list during a posting split. + * + * If caller determines that its new tuple 'newitem' is a duplicate with a + * heap TID that falls inside the range of an existing posting list tuple + * 'oposting', it must generate a new posting tuple to replace the original. + * The new posting list is guaranteed to be the same size as the original. + * Caller must also change newitem to have the heap TID of the rightmost TID + * in the original posting list. Both steps are always handled by calling + * here. + * + * Returns new posting list palloc()'d in caller's context. Also modifies + * caller's newitem to contain final/effective heap TID, which is what caller + * actually inserts on the page. + * + * Exported for use by recovery. Note that recovery path must recreate the + * same version of newitem that is passed here on the primary, even though + * that differs from the final newitem actually added to the page. This + * optimization avoids explicit WAL-logging of entire posting lists, which + * tend to be rather large. 
+ */
+IndexTuple
+_bt_posting_split(IndexTuple newitem, IndexTuple oposting,
+				  OffsetNumber postingoff)
+{
+	int			nhtids;
+	char	   *replacepos;
+	char	   *rightpos;
+	Size		nbytes;
+	IndexTuple	nposting;
+
+	Assert(BTreeTupleIsPosting(oposting));
+	nhtids = BTreeTupleGetNPosting(oposting);
+	Assert(postingoff < nhtids);
+
+	nposting = CopyIndexTuple(oposting);
+	replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+	rightpos = replacepos + sizeof(ItemPointerData);
+	nbytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+
+	/*
+	 * Move item pointers in posting list to make a gap for the new item's
+	 * heap TID (shift TIDs one place to the right, losing original rightmost
+	 * TID).
+	 */
+	memmove(rightpos, replacepos, nbytes);
+
+	/*
+	 * Fill the gap with the TID of the new item.
+	 */
+	ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+	/*
+	 * Copy original posting list's (oposting's, not nposting's) last TID
+	 * into new item
+	 */
+	ItemPointerCopy(BTreeTupleGetPostingN(oposting, nhtids - 1),
+					&newitem->t_tid);
+	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+							  BTreeTupleGetHeapTID(newitem)) < 0);
+	Assert(BTreeTupleGetNPosting(nposting) == BTreeTupleGetNPosting(oposting));
+
+	return nposting;
+}
+
 /*----------
  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
  *
  *		This recursive procedure does the following things:
  *
+ *			+ if necessary, splits an existing posting list on page.
+ *			  This is only needed when 'postingoff' is non-zero.
  *			+ if necessary, splits the target page, using 'itup_key' for
  *			  suffix truncation on leaf pages (caller passes NULL for
  *			  non-leaf pages).
- *			+ inserts the tuple.
+ *			+ inserts the new tuple (could be from split posting list).
  *			+ if the page was split, pops the parent stack, and finds the
  *			  right place to insert the new child pointer (by walking
  *			  right using information stored in the parent stack).
@@ -918,7 +1089,8 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, - * we will have dropped both the pin and the lock on the buffer. + * we will have dropped both the pin and the lock on the buffer. Caller + * should be prepared for us to scribble on 'itup'. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high @@ -936,11 +1108,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1130,8 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + /* retail insertions of posting list tuples are disallowed */ + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1142,46 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple, so split posting list. + * + * Posting list splits always replace some existing TID in the posting + * list with the new item's heap TID (based on a posting list offset + * from caller) by removing rightmost heap TID from posting list. 
The + * new item's heap TID is swapped with that rightmost heap TID, almost + * as if the tuple inserted never overlapped with a posting list in + * the first place. This allows the insertion and page split code to + * have minimal special case handling of posting lists. + * + * The only extra handling required is to overwrite the original + * posting list with nposting, which is guaranteed to be the same size + * as the original, keeping the page space accounting simple. This + * takes place in either the page insert or page split critical + * section. + */ + Assert(P_ISLEAF(lpageop)); + Assert(!ItemIdIsDead(itemid)); + Assert(postingoff > 0); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* save a copy of itup with unchanged TID to write it into xlog record */ + origitup = CopyIndexTuple(itup); + nposting = _bt_posting_split(itup, oposting, postingoff); + + Assert(BTreeTupleGetNPosting(nposting) == + BTreeTupleGetNPosting(oposting)); + /* Alter new item offset, since effective new item changed */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? 
* @@ -996,7 +1214,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1075,6 +1294,18 @@ _bt_insertonpg(Relation rel, elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); + if (nposting) + { + /* + * Posting list split requires an in-place update of the existing + * posting list + */ + Assert(P_ISLEAF(lpageop)); + Assert(MAXALIGN(IndexTupleSize(oposting)) == + MAXALIGN(IndexTupleSize(nposting))); + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + } + MarkBufferDirty(buf); if (BufferIsValid(metabuf)) @@ -1116,6 +1347,7 @@ _bt_insertonpg(Relation rel, XLogRecPtr recptr; xlrec.offnum = itup_off; + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); @@ -1144,6 +1376,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1152,7 +1385,19 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + /* + * We always write newitem to the page, but when there is an + * original newitem due to a posting list split then we log the + * original item instead. REDO routine must reconstruct the final + * newitem at the same time it reconstructs nposting. 
+ */ + if (postingoff == 0) + XLogRegisterBufData(0, (char *) itup, + IndexTupleSize(itup)); + else + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1439,13 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + pfree(nposting); + pfree(origitup); + } } /* @@ -1209,12 +1461,25 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * newitem and nposting are replacements for orignewitem and the + * existing posting list on the page respectively. These extra + * posting list split details are used here in the same way as they + * are used in the more common case where a posting list split does + * not coincide with a page split. We need to deal with posting list + * splits directly in order to ensure that everything that follows + * from the insert of orignewitem is handled as a single atomic + * operation (though caller's insert of a new pivot/downlink into + * parent page will still be a separate operation). + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. 
*/ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, OffsetNumber postingoff) { Buffer rbuf; Page origpage; @@ -1236,12 +1501,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; + OffsetNumber replacepostingoff = InvalidOffsetNumber; bool newitemonleft, isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* + * Determine offset number of existing posting list on page when a split + * of a posting list needs to take place as the page is split + */ + if (nposting != NULL) + { + Assert(itup_key->heapkeyspace); + replacepostingoff = OffsetNumberPrev(newitemoff); + } + /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back @@ -1273,6 +1549,13 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitemoff == firstright. In all other cases it's clear which side of * the split every tuple goes on from context. newitemonleft is usually * (but not always) redundant information. + * + * Note: In theory, the split point choice logic should operate against a + * version of the page that already replaced the posting list at offset + * replacepostingoff with nposting where applicable. We don't bother with + * that, though. Both versions of the posting list must be the same size, + * and both will have the same base tuple key values, so split point + * choice is never affected. 
*/ firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); @@ -1340,6 +1623,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (firstright == replacepostingoff) + item = nposting; } /* @@ -1373,6 +1659,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (lastleftoff == replacepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1480,8 +1769,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* + * did caller pass new replacement posting list tuple due to posting + * list split? + */ + if (i == replacepostingoff) + { + /* + * swap origpage posting list with post-posting-list-split version + * from caller + */ + Assert(isleaf); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? 
*/ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1650,8 +1954,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = InvalidOffsetNumber; + if (replacepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1670,11 +1978,46 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in record, though. + * + * The details are often slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff isn't set in the WAL record, so + * recovery can't even tell the difference). Otherwise, we set + * postingoff and log orignewitem instead of newitem, despite having + * actually inserted newitem. Recovery must reconstruct nposting and + * newitem by repeating the actions of our caller (i.e. by passing + * original posting list and orignewitem to _bt_posting_split()). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem despite newitem going on the + * right page. 
If XLogInsert decides that it can omit orignewitem due + * to logging a full-page image of the left page, everything still + * works out, since recovery only needs to log orignewitem for items + * on the left page (just like the regular newitem-logged case). */ - if (newitemonleft) - XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + if (newitemonleft || xlrec.postingoff != InvalidOffsetNumber) + { + if (xlrec.postingoff == InvalidOffsetNumber) + { + /* Must WAL-log newitem, since it's on left page */ + Assert(newitemonleft); + Assert(orignewitem == NULL && nposting == NULL); + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + } + else + { + /* Must WAL-log orignewitem following posting list split */ + Assert(newitemonleft || firstright == newitemoff); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + XLogRegisterBufData(0, (char *) orignewitem, + MAXALIGN(IndexTupleSize(orignewitem))); + } + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1834,7 +2177,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2190,6 +2533,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2304,6 +2648,472 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. 
+ * the page (or when deduplication runs). */ } + +/* + * Try to deduplicate items to free at least enough space to avoid a page + * split. This function should be called after LP_DEAD items were removed by + * _bt_vacuum_one_page() to prevent a page split. (We'll have to kill LP_DEAD + * items here when the page's BTP_HAS_GARBAGE hint was not set, but that + * should be rare.) + * + * The strategy for !checkingunique callers is to perform as much + * deduplication as possible to free as much space as possible now, since + * making it harder to set LP_DEAD bits is considered an acceptable price for + * not having to deduplicate the same page many times. It is unlikely that + * the items on the page will have their LP_DEAD bit set in the future, since + * that hasn't happened before now (besides, entire posting lists can still + * have their LP_DEAD bit set). + * + * The strategy for checkingunique callers is rather different, since the + * overall goal is different. Deduplication cooperates with and enhances + * garbage collection, especially the LP_DEAD bit setting that takes place in + * _bt_check_unique(). Deduplication does as little as possible while still + * preventing a page split for caller, since it's less likely that posting + * lists will have their LP_DEAD bit set. Deduplication avoids creating new + * posting lists with only two heap TIDs, and also avoids creating new posting + * lists from an existing posting list. Deduplication is only useful when it + * delays a page split long enough for garbage collection to prevent the page + * split altogether. checkingunique deduplication can make all the difference + * in cases where VACUUM keeps up with dead index tuples, but "recently dead" + * index tuples are still numerous enough to cause page splits that are truly + * unnecessary. + * + * Note: If newitem contains NULL values in key attributes, caller will be + * !checkingunique even when rel is a unique index. 
The page in question will + * usually have many existing items with NULLs. + */ +static void +_bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, bool checkingunique) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + BTPageOpaque oopaque; + BTDedupState *state = NULL; + int natts = IndexRelationGetNumberOfAttributes(rel); + OffsetNumber deletable[MaxIndexTuplesPerPage]; + bool minimal = checkingunique; + int ndeletable = 0; + Size pagesaving = 0; + + oopaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* init deduplication state needed to build posting tuples */ + state = (BTDedupState *) palloc(sizeof(BTDedupState)); + state->rel = rel; + + state->maxitemsize = BTMaxItemSize(page); + state->newitem = newitem; + state->checkingunique = checkingunique; + /* Metadata about current pending posting list */ + state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + state->overlap = false; + /* Metadata about based tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Delete dead tuples if any. We cannot simply skip them in the cycle + * below, because it's necessary to generate special Xlog record + * containing such tuples to compute latestRemovedXid on a standby server + * later. + * + * This should not affect performance, since it only can happen in a rare + * situation when BTP_HAS_GARBAGE flag was not set and _bt_vacuum_one_page + * was not called, or _bt_vacuum_one_page didn't remove all dead items. 
+ */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemid)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + /* + * Skip duplication in rare cases where there were LP_DEAD items + * encountered here when that frees sufficient space for caller to + * avoid a page split + */ + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + if (PageGetFreeSpace(page) >= newitemsz) + { + pfree(state); + return; + } + + /* Continue with deduplication */ + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* Make sure that new page won't have garbage flag set */ + oopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + /* Conservatively size array */ + state->htids = palloc(state->maxitemsize); + + /* + * Iterate over tuples on the page, try to deduplicate them into posting + * lists and insert into new page. NOTE: It's essential to reassess the + * max offset on each iteration, since it will change as items are + * deduplicated. + */ +retry: + offnum = minoff; + while (offnum <= PageGetMaxOffsetNumber(page)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (state->nitems == 0) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > natts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * BTMaxItemSize() limit. Heap TID(s) for itup have been saved in + * state. 
The next iteration will also end up here if it's + * possible to merge the next tuple into the same pending posting + * list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * BTMaxItemSize() limit was reached. + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and update the page, otherwise, just + * reset the state and move on. + */ + pagesaving += _bt_dedup_finish_pending(buffer, state, + RelationNeedsWAL(rel)); + + /* + * When caller is a checkingunique caller and we have deduplicated + * enough to avoid a page split, do minimal deduplication. Don't + * prematurely deduplicate items that could still have their + * LP_DEAD bits set. + */ + if (minimal && pagesaving >= newitemsz) + break; + + /* Continue iteration from base tuple's offnum */ + offnum = state->baseoff; + } + + offnum = OffsetNumberNext(offnum); + } + + /* Handle the last item when pending posting list is not empty */ + if (state->nitems != 0) + pagesaving += _bt_dedup_finish_pending(buffer, state, + RelationNeedsWAL(rel)); + + if (state->checkingunique && pagesaving < newitemsz) + { + /* + * Try again. The second pass over the page may deduplicate items + * that were passed over the first time due to concerns about limiting + * the effectiveness of LP_DEAD bit setting within _bt_check_unique(). + * Note that we will still stop deduplicating as soon as enough space + * has been freed to avoid caller's page split. + * + * FIXME: Don't bother with this when it's clearly a total waste of + * time. Maybe don't do any checkingunique deduplication for the + * rightmost page, either. + */ + state->checkingunique = false; + state->alltupsize = 0; + state->nitems = 0; + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + goto retry; + } + + /* be tidy */ + pfree(state->htids); + pfree(state); +} + +/* + * Create a new pending posting list tuple based on caller's tuple. 
+ * + * Every tuple processed by the deduplication routines either becomes the base + * tuple for a posting list, or gets its heap TID(s) accepted into a pending + * posting list. A tuple that starts out as the base tuple for a posting list + * will only actually be rewritten within _bt_dedup_finish_pending() when + * there was at least one successful call to _bt_dedup_save_htid(). + * + * Exported for use by nbtsort.c and recovery. + */ +void +_bt_dedup_start_pending(BTDedupState *state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + + /* + * Copy heap TIDs from new base tuple for new candidate posting list into + * ipd array. Assume that we'll eventually create a new posting tuple by + * merging later tuples with this existing one, though we may not. + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, base, sizeof(ItemPointerData)); + state->nhtids = 1; + /* Save size of tuple without any posting list */ + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* Save size of tuple without any posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain size of all tuples (including line pointer overhead) to + * calculate space savings on page within _bt_dedup_finish_pending(). + * Also, save number of base tuple logical tuples so that we can save + * cycles in the common case where an existing posting list can't or won't + * be merged with other tuples on the page. 
+ */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->alltupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->interval.baseoff = state->baseoff; + state->overlap = false; + if (state->newitem) + { + /* Might overlap with new item -- mark it as possible if it is */ + if (BTreeTupleGetHeapTID(base) < BTreeTupleGetHeapTID(state->newitem)) + state->overlap = true; + } +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state has + * itup's heap TID(s) saved. When this is false, enlarging the pending + * posting list by the required amount would exceed the maxitemsize limit, so + * caller must finish the pending posting list tuple. (Generally itup becomes + * the base tuple of caller's new pending posting list). + * + * Exported for use by nbtsort.c and recovery. + */ +bool +_bt_dedup_save_htid(BTDedupState *state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over limit + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * + sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxitemsize) + return false; + + /* Don't merge existing posting lists with checkingunique */ + if (state->checkingunique && BTreeTupleIsPosting(state->base)) + return false; + if (state->checkingunique && nhtids > 1) + return false; + + if (state->overlap) + { + if (BTreeTupleGetMaxHeapTID(itup) > BTreeTupleGetHeapTID(state->newitem)) + { + /* + * newitem has heap TID in the range of the would-be new posting + * list. 
Avoid an immediate posting list split for caller. + */ + if (_bt_keep_natts_fast(state->rel, state->newitem, itup) > + IndexRelationGetNumberOfAttributes(state->rel)) + { + state->newitem = NULL; /* avoid unnecessary comparisons */ + return false; + } + } + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->alltupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. This is zero in the case + * where no deduplication was possible. + * + * Exported for use by recovery. + */ +Size +_bt_dedup_finish_pending(Buffer buffer, BTDedupState *state, bool need_wal) +{ + Size spacesaving = 0; + Page page = BufferGetPage(buffer); + int minimum = 2; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->interval.baseoff == state->baseoff); + + /* + * Only create a posting list when at least 3 heap TIDs will appear in the + * checkingunique case (checkingunique strategy won't merge existing + * posting list tuples, so we know that the number of items here must also + * be the total number of heap TIDs). Creating a new posting lists with + * only two heap TIDs won't even save enough space to fit another + * duplicate with the same key as the posting list. This is a bad + * trade-off if there is a chance that the LP_DEAD bit can be set for + * either existing tuple by putting off deduplication. 
+ * + * (Note that a second pass over the page can deduplicate the item if that + * is truly the only way to avoid a page split for checkingunique caller) + */ + Assert(!state->checkingunique || + state->nitems == 1 || state->nhtids == state->nitems); + if (state->checkingunique) + minimum = 3; + + if (state->nitems >= minimum) + { + IndexTuple final; + Size finalsz; + OffsetNumber offnum; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + + /* find all tuples that will be replaced with this new posting tuple */ + for (offnum = state->baseoff; + offnum < state->baseoff + state->nitems; + offnum = OffsetNumberNext(offnum)) + deletable[ndeletable++] = offnum; + + /* Form a tuple with a posting list */ + final = BTreeFormPostingTuple(state->base, state->htids, + state->nhtids); + finalsz = IndexTupleSize(final); + spacesaving = state->alltupsize - (finalsz + sizeof(ItemIdData)); + /* Must have saved some space */ + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + + /* Save final number of items for posting list */ + state->interval.nitems = state->nitems; + + Assert(finalsz <= state->maxitemsize); + Assert(finalsz == MAXALIGN(IndexTupleSize(final))); + + START_CRIT_SECTION(); + + /* Delete items to replace */ + PageIndexMultiDelete(page, deletable, ndeletable); + /* Insert posting tuple */ + if (PageAddItem(page, (Item) final, finalsz, state->baseoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + MarkBufferDirty(buffer); + + /* Log deduplicated items */ + if (need_wal) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.baseoff = state->interval.baseoff; + xlrec_dedup.nitems = state->interval.nitems; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + pfree(final); + } + + 
/* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + + return spacesaving; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 268f869a36..c08f850595 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -42,12 +43,17 @@ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BlockNumber *target, BlockNumber *rightsib); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); +static TransactionId _bt_compute_xid_horizon_for_tuples(Relation rel, + Relation heapRel, + Buffer buf, + OffsetNumber *itemnos, + int nitems); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool dedup_is_possible) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +69,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_dedup_is_possible = dedup_is_possible; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -213,6 +220,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.fastlevel = metad->btm_fastlevel; md.oldest_btpo_xact = oldestBtpoXact; md.last_cleanup_num_heap_tuples = numHeapTuples; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); @@ -394,6 +402,7 @@ _bt_getroot(Relation rel, int access) md.fastlevel = 0; md.oldest_btpo_xact = 
InvalidTransactionId; md.last_cleanup_num_heap_tuples = -1.0; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -683,6 +692,63 @@ _bt_heapkeyspace(Relation rel) return metad->btm_version > BTREE_NOVAC_VERSION; } +/* + * _bt_get_dedupispossible() -- is deduplication possible for the index? + * get information from metapage + */ +bool +_bt_getdedupispossible(Relation rel) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here. (XXX perhaps _bt_getroot() should + * be changed to allow this case.) + * + * FIXME: Think some more about pg_upgrade'd !heapkeyspace indexes + * here, and the need for aa version bump to go with new metapage + * field. + */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return metad->btm_dedup_is_possible;; + } + + /* + * Cache the metapage data for next time + * + * An on-the-fly version upgrade performed by _bt_upgrademetapage() + * can change the nbtree version for an index without invalidating any + * local cache. This is okay because it can only happen when moving + * from version 2 to version 3, both of which are !heapkeyspace + * versions. 
+ */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + return metad->btm_dedup_is_possible; +} + /* * _bt_checkpage() -- Verify that a freshly-read page looks sane. */ @@ -983,14 +1049,52 @@ _bt_page_recyclable(Page page) void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdatable, BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + Size updated_sz = 0; + char *updated_buf = NULL; + + /* XLOG stuff, buffer for updateds */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + updated_sz += MAXALIGN(IndexTupleSize(updated[i])); + + updated_buf = palloc(updated_sz); + for (int i = 0; i < nupdatable; i++) + { + itemsz = IndexTupleSize(updated[i]); + memcpy(updated_buf + offset, (char *) updated[i], itemsz); + offset += MAXALIGN(itemsz); + } + Assert(offset == updated_sz); + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* Handle posting tuples here */ + for (int i = 0; i < nupdatable; i++) + { + /* At first, delete the old tuple. */ + PageIndexTupleDelete(page, updateitemnos[i]); + + itemsz = IndexTupleSize(updated[i]); + itemsz = MAXALIGN(itemsz); + + /* Add tuple with updated ItemPointers to the page. 
*/ + if (PageAddItem(page, (Item) updated[i], itemsz, updateitemnos[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite posting list item in index while doing vacuum"); + } + /* Fix the page */ if (nitems > 0) PageIndexMultiDelete(page, itemnos, nitems); @@ -1020,6 +1124,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.nupdated = nupdatable; + xlrec_vacuum.ndeleted = nitems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1033,6 +1139,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, if (nitems > 0) XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + /* + * Here we should save offnums and updated tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle updated tuples and only after that other deleted items. + */ + if (nupdatable > 0) + { + Assert(updated_buf != NULL); + XLogRegisterBufData(0, (char *) updateitemnos, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updated_buf, updated_sz); + } + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); PageSetLSN(page, recptr); @@ -1041,6 +1160,91 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted. + * + * This is a version of index_compute_xid_horizon_for_tuples() specialized to + * nbtree, which can handle posting lists. 
+ */ +static TransactionId +_bt_compute_xid_horizon_for_tuples(Relation rel, Relation heapRel, + Buffer buf, OffsetNumber *itemnos, + int nitems) +{ + ItemPointer htids; + TransactionId latestRemovedXid = InvalidTransactionId; + Page page = BufferGetPage(buf); + int arraynitems; + int finalnitems; + + /* + * Initial size of array can fit everything when it turns out that there + * are no posting lists + */ + arraynitems = nitems; + htids = (ItemPointer) palloc(sizeof(ItemPointerData) * arraynitems); + + finalnitems = 0; + /* identify what the index tuples about to be deleted point to */ + for (int i = 0; i < nitems; i++) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, itemnos[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + + if (!BTreeTupleIsPosting(itup)) + { + /* Make sure that we have space for additional heap TID */ + if (finalnitems + 1 > arraynitems) + { + arraynitems = arraynitems * 2; + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * arraynitems); + } + + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(&itup->t_tid, &htids[finalnitems]); + finalnitems++; + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + /* Make sure that we have space for additional heap TIDs */ + if (finalnitems + nposting > arraynitems) + { + arraynitems = Max(arraynitems * 2, finalnitems + nposting); + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * arraynitems); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + + Assert(ItemPointerIsValid(htid)); + ItemPointerCopy(htid, &htids[finalnitems]); + finalnitems++; + } + } + } + + Assert(finalnitems >= nitems); + + /* determine the actual xid horizon */ + latestRemovedXid = + table_compute_xid_horizon_for_tuples(heapRel, htids, finalnitems); + + pfree(htids); + + return latestRemovedXid; +} + /* + * Delete item(s) from a btree page during single-page cleanup. 
* @@ -1067,8 +1271,8 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + _bt_compute_xid_horizon_for_tuples(rel, heapRel, buf, + itemnos, nitems); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -2066,6 +2270,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4cfd5289ad..d70607e71a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumposting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -157,10 +159,11 @@ void btbuildempty(Relation index) { Page metapage; + bool dedup_is_possible = _bt_dedup_is_possible(index); /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, dedup_is_possible); /* * Write the page and log it. 
It might seem that an immediate sync would @@ -263,8 +266,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxPostingIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxPostingIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -816,7 +819,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) } else { - StdRdOptions *relopts; + BtreeOptions *relopts; float8 cleanup_scale_factor; float8 prev_num_heap_tuples; @@ -827,7 +830,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) * tuples exceeds vacuum_cleanup_index_scale_factor fraction of * original tuples count. */ - relopts = (StdRdOptions *) info->index->rd_options; + relopts = (BtreeOptions *) info->index->rd_options; cleanup_scale_factor = (relopts && relopts->vacuum_cleanup_index_scale_factor >= 0) ? relopts->vacuum_cleanup_index_scale_factor @@ -1069,7 +1072,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0, + vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } @@ -1188,8 +1192,17 @@ restart: } else if (P_ISLEAF(opaque)) { + /* Deletable item state */ OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + int nhtidsdead; + int nhtidslive; + + /* Updatable item state (for posting lists) */ + IndexTuple updated[MaxOffsetNumber]; + OffsetNumber updatable[MaxOffsetNumber]; + int nupdatable; + OffsetNumber offnum, minoff, maxoff; @@ -1229,6 +1242,10 @@ restart: * callback function. 
*/ ndeletable = 0; + nupdatable = 0; + /* Maintain stats counters for index tuple versions/heap TIDs */ + nhtidsdead = 0; + nhtidslive = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1238,11 +1255,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * During Hot Standby we currently assume that @@ -1265,8 +1280,71 @@ restart: * applies to *any* type of index that marks index tuples as * killed. */ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard heap TID representation */ + ItemPointer htid = &(itup->t_tid); + + if (callback(htid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + ItemPointer newhtids; + int nremaining; + + /* + * Posting list tuple, a physical tuple that represents + * two or more logical tuples, any of which could be an + * index row version that must be removed + */ + newhtids = btreevacuumposting(vstate, itup, &nremaining); + if (newhtids == NULL) + { + /* + * All TIDs/logical tuples from the posting tuple + * remain, so no update or delete required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + IndexTuple updatedtuple; + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * for when we update it in place + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatedtuple = BTreeFormPostingTuple(itup, newhtids, + nremaining); + updated[nupdatable] = updatedtuple; + updatable[nupdatable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + pfree(newhtids); + } + else + { + /* + * All TIDs/logical tuples from the posting list must + * be deleted. 
We'll delete the physical tuple + * completely. + */ + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + + /* Free empty array of live items */ + pfree(newhtids); + } + + nhtidslive += nremaining; + } } } @@ -1274,7 +1352,7 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1290,7 +1368,8 @@ restart: * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. */ - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + updated, nupdatable, vstate->lastBlockVacuumed); /* @@ -1300,7 +1379,7 @@ restart: if (blkno > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = blkno; - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } @@ -1315,6 +1394,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1324,15 +1404,16 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into + * If it's now empty, try to delete; else count the live tuples (live + * heap TIDs in posting lists are counted as live tuples). We don't + * delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; } if (delete_now) @@ -1375,6 +1456,68 @@ restart: } } +/* + * btreevacuumposting() -- determines which logical tuples must remain when + * VACUUMing a posting list tuple. + * + * Returns new palloc'd array of item pointers needed to build replacement + * posting list without the index row versions that are to be deleted. + * + * Note that returned array is NULL in the common case where there is nothing + * to delete in caller's posting list tuple. The number of TIDs that should + * remain in the posting list tuple is set for caller in *nremaining. This is + * also the size of the returned array (though only when array isn't just + * NULL). + */ +static ItemPointer +btreevacuumposting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + Assert(BTreeTupleIsPosting(itup)); + + /* + * Check each tuple in the posting list. Save live tuples into tmpitems, + * though try to avoid memory allocation as an optimization. + */ + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* + * Live heap TID. + * + * Only save live TID when we know that we're going to have to + * kill at least one TID, and have already allocated memory. 
+ */ + if (tmpitems) + tmpitems[live] = items[i]; + live++; + } + + /* Dead heap TID */ + else if (tmpitems == NULL) + { + /* + * Turns out we need to delete one or more dead heap TIDs, so + * start maintaining an array of live TIDs for caller to + * reconstruct smaller replacement posting list tuple + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + /* Copy live heap TIDs from previous loop iterations */ + if (live > 0) + memcpy(tmpitems, items, sizeof(ItemPointerData) * live); + } + } + + *nremaining = live; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 8e512461a0..9db73d070d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -434,7 +442,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. 
+ * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by their + * scantid/new heap TID). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +464,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +521,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +550,68 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) 
+ */ + Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + Assert(!key->nextkey); + Assert(key->scantid != NULL); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + /* + * In the unlikely event that posting list tuple has LP_DEAD bit set, + * signal to caller that it should kill the item and restart its binary + * search. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res >= 1) + low = mid + 1; + else + high = mid; + } + + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +621,18 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * It is generally guaranteed that any possible scankey with scantid set + * will have zero or one tuples in the index that are considered equal + * here. 
* * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +656,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +691,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +806,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. + */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (!BTreeTupleIsPosting(itup) || result <= 0) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1233,6 +1343,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; + inskey.dedup_is_possible = false; inskey.scantid = NULL; inskey.keysz = keysCount; @@ -1451,6 +1562,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.postingTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1485,8 +1597,29 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { /* tuple passes all scan key conditions, so 
remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + if (!BTreeTupleIsPosting(itup)) + { + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * Setup state to return posting list, and save first + * "logical" tuple + */ + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Save additional posting list "logical" tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1652,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxPostingIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1660,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxPostingIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1569,8 +1702,36 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + if (!BTreeTupleIsPosting(itup)) + { + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int i = BTreeTupleGetNPosting(itup) - 1; + + /* + * Setup state to return posting list, and save last + * "logical" tuple from posting list (since it's the first + * that will be returned to scan). 
+ */ + itemIndex--; + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i--), + itup); + + /* + * Return posting list "logical" tuples -- do this in + * descending order, to match overall scan order + */ + for (; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + } + } } if (!continuescan) { @@ -1584,8 +1745,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1759,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1773,59 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). 
+ */ +static void +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* Save a base version of the IndexTuple */ + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += itupsz; + so->currPos.postingTupleOffset = currItem->tupleOffset; + } +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every logical + * tuple that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = so->currPos.postingTupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ab19692006..a138fafeb1 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -287,6 +287,9 @@ static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState *dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void 
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -725,8 +728,8 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) if (level > 0) state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); else - state->btps_full = RelationGetTargetPageFreeSpace(wstate->index, - BTREE_DEFAULT_FILLFACTOR); + state->btps_full = BtreeGetTargetPageFreeSpace(wstate->index, + BTREE_DEFAULT_FILLFACTOR); /* no parent level, yet */ state->btps_next = NULL; @@ -799,7 +802,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -1002,6 +1006,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the minimum key for the new page. */ state->btps_minkey = CopyIndexTuple(oitup); + Assert(BTreeTupleIsPivot(state->btps_minkey)); /* * Set the sibling links for both pages. @@ -1043,6 +1048,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); /* _bt_sortaddtup() will perform full truncation later */ + BTreeTupleClearBtIsPosting(state->btps_minkey); BTreeTupleSetNAtts(state->btps_minkey, 0); } @@ -1057,6 +1063,42 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like nbtinsert.c's _bt_dedup_finish_pending(), but it adds a + * new tuple using _bt_buildadd() and does not maintain the intervals array. 
+ */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState *dstate) +{ + IndexTuple final; + + Assert(dstate->nitems > 0); + if (dstate->nitems == 1) + final = dstate->base; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(dstate->base, + dstate->htids, + dstate->nhtids); + final = postingtuple; + } + + _bt_buildadd(wstate, state, final); + + if (dstate->nitems > 1) + pfree(final); + /* Don't maintain dedup_intervals array, or alltupsize */ + dstate->nhtids = 0; + dstate->nitems = 0; +} + /* * Finish writing out the completed btree. */ @@ -1123,7 +1165,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * by filling in a valid magic number in the metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, rootblkno, rootlevel); + + _bt_initmetapage(metapage, rootblkno, rootlevel, wstate->inskey->dedup_is_possible); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } @@ -1144,6 +1187,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->dedup_is_possible && + BtreeGetDoDedupOption(wstate->index); if (merge) { @@ -1255,9 +1302,96 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } pfree(sortKeys); } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState *dstate; + IndexTuple newbase; + + dstate = (BTDedupState *) palloc(sizeof(BTDedupState)); + dstate->maxitemsize = 0; /* set later */ + dstate->checkingunique = false; /* unused */ + dstate->newitem = NULL; + /* Metadata about current pending posting list */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->overlap = false; + dstate->alltupsize = 0; /* unused */ + /* Metadata about based tuple of 
current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + dstate->maxitemsize = BTMaxItemSize(state->btps_page); + /* Conservatively size array */ + dstate->htids = palloc(dstate->maxitemsize); + + /* + * No previous/base tuple, since itup is the first item + * returned by the tuplesort -- use itup as base tuple of + * first pending posting list for entire index build + */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * BTMaxItemSize() limit. Heap TID(s) for itup have been + * saved in state. The next iteration will also end up here + * if it's possible to merge the next tuple into the same + * pending posting list. 
+ */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * BTMaxItemSize() limit was reached + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + + /* itup starts new pending posting list */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + /* + * Handle the last item (there must be a last item when the tuplesort + * returned one or more tuples) + */ + if (state) + { + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 1c1029b6c4..df976d4b7a 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -167,7 +167,7 @@ _bt_findsplitloc(Relation rel, /* Count up total space in data items before actually scanning 'em */ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); - leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); + leaffillfactor = BtreeGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); @@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel, state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,17 
+462,52 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsubhikey = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ newitemisfirstonright = (firstoldonright == state->newitemoff && !newitemonleft); + /* + * FIXME: Accessing every single tuple like this adds cycles to cases that + * cannot possibly benefit (i.e. cases where we know that there cannot be + * posting lists). Maybe we should add a way to not bother when we are + * certain that this is the case. + * + * We could either have _bt_split() pass us a flag, or invent a page flag + * that indicates that the page might have posting lists, as an + * optimization. There is no shortage of btpo_flags bits for stuff like + * this. + */ if (newitemisfirstonright) + { firstrightitemsz = state->newitemsz; + + /* Calculate posting list overhead, if any */ + if (state->is_leaf && BTreeTupleIsPosting(state->newitem)) + postingsubhikey = IndexTupleSize(state->newitem) - + BTreeTupleGetPostingOffset(state->newitem); + } else + { firstrightitemsz = firstoldonrightsz; + /* Calculate posting list overhead, if any */ + if (state->is_leaf) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsubhikey = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +530,13 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. 
+ * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead + * (while still conservatively assuming that truncation might have to add + * back a single heap TID using the pivot tuple heap TID representation). */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsubhikey) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +733,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index bc855dd25d..6fdd776ea5 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -97,8 +97,6 @@ _bt_mkscankey(Relation rel, IndexTuple itup) indoption = rel->rd_indoption; tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - /* * We'll execute search using scan key constructed on key columns. * Truncated attributes and non-key attributes are omitted from the final @@ -110,9 +108,23 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; + key->scantid = NULL; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? 
- BTreeTupleGetHeapTID(itup) : NULL; + /* get information from relation info or from btree metapage */ + key->dedup_is_possible = (itup == NULL) ? _bt_dedup_is_possible(rel) : + _bt_getdedupispossible(rel); + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + Assert(!itup || !BTreeTupleIsPosting(itup) || key->heapkeyspace); + + /* + * When caller passes a tuple with a heap TID, use it to set scantid. Note + * that this handles posting list tuples by setting scantid to the lowest + * heap TID in the posting list. + */ + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1386,6 +1398,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1547,6 +1560,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1786,10 +1800,35 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* Read-ahead to later kitems */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list 
items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2027,7 +2066,30 @@ BTreeShmemInit(void) bytea * btoptions(Datum reloptions, bool validate) { - return default_reloptions(reloptions, validate, RELOPT_KIND_BTREE); + relopt_value *options; + BtreeOptions *rdopts; + int numoptions; + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(BtreeOptions, fillfactor)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(BtreeOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplication", RELOPT_TYPE_BOOL, offsetof(BtreeOptions, do_deduplication)} + }; + + options = parseRelOptions(reloptions, validate, RELOPT_KIND_BTREE, + &numoptions); + + /* if none set, we're done */ + if (numoptions == 0) + return NULL; + + rdopts = allocateReloptStruct(sizeof(BtreeOptions), options, numoptions); + + fillRelOptions((void *) rdopts, sizeof(BtreeOptions), options, numoptions, + validate, tab, lengthof(tab)); + + pfree(options); + return (bytea *) rdopts; } /* @@ -2140,6 +2202,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + if (keepnatts == natts) + { + /* + * index_truncate_tuple() just returned a copy of the + * original, so make sure that the size of the new pivot tuple + * doesn't have posting list overhead + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + } + + Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2156,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. 
*/ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2163,6 +2245,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. We + * can always truncate away a posting list, though. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + } else { /* @@ -2170,7 +2270,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * It's necessary to add a heap TID attribute to the new pivot tuple. */ Assert(natts == nkeyatts); - newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); + newsize = MAXALIGN(IndexTupleSize(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); pivot = palloc0(newsize); memcpy(pivot, firstright, IndexTupleSize(firstright)); } @@ -2188,6 +2289,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * nbtree (e.g., there is no pg_attribute entry). 
*/ Assert(itup_key->heapkeyspace); + Assert(!BTreeTupleIsPosting(pivot)); pivot->t_info &= ~INDEX_SIZE_MASK; pivot->t_info |= newsize; @@ -2200,7 +2302,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2211,9 +2313,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2226,7 +2331,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. 
Note that @@ -2235,7 +2340,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2316,15 +2422,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal (once detoasted). Similarly, result may - * differ from the _bt_keep_natts result when either tuple has TOASTed datums, - * though this is barely possible in practice. + * unless they're bitwise equal after detoasting. * * These issues must be acceptable to callers, typically because they're only * concerned about making suffix truncation as effective as possible without * leaving excessive amounts of free space on either side of page split. * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts. + * + * When an index only uses opclasses where equality is "precise", this + * function is guaranteed to give the same result as _bt_keep_natts(). This + * makes it safe to use this function to determine whether or not two tuples + * can be folded together into a single posting tuple. Posting list + * deduplication cannot be used with nondeterministic collations for this + * reason. + * + * FIXME: Actually invent the needed "equality-is-precise" opclass + * infrastructure. 
See dedicated -hackers thread: + * + * https://postgr.es/m/CAH2-Wzn3Ee49Gmxb7V1VJ3-AC8fWn-Fr8pfWQebHe8rYRxt5OQ@mail.gmail.com */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2349,8 +2465,38 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) if (isNull1 != isNull2) break; + /* + * XXX: The ideal outcome from the point of view of the posting list + * patch is that the definition of an opclass with "precise equality" + * becomes: "equality operator function must give exactly the same + * answer as datum_image_eq() would, provided that we aren't using a + * nondeterministic collation". (Nondeterministic collations are + * clearly not compatible with deduplication.) + * + * This will be a lot faster than actually using the authoritative + * insertion scankey in some cases. This approach also seems more + * elegant, since suffix truncation gets to follow exactly the same + * definition of "equal" as posting list deduplication -- there is a + * subtle interplay between deduplication and suffix truncation, and + * it would be nice to know for sure that they have exactly the same + * idea about what equality is. + * + * This ideal outcome still avoids problems with TOAST. We cannot + * repeat bugs like the amcheck bug that was fixed in bugfix commit + * eba775345d23d2c999bbb412ae658b6dab36e3e8. datum_image_eq() + * considers binary equality, though only _after_ each datum is + * decompressed. + * + * If this ideal solution isn't possible, then we can fall back on + * defining "precise equality" as: "type's output function must + * produce identical textual output for any two datums that compare + * equal when using a safe/equality-is-precise operator class (unless + * using a nondeterministic collation)". That would mean that we'd + * have to make deduplication call _bt_keep_natts() instead (or some + * other function that uses authoritative insertion scankey). 
+ */ if (!isNull1 && - !datumIsEqual(datum1, datum2, att->attbyval, att->attlen)) + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) break; keepnatts++; @@ -2402,22 +2548,30 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2461,12 +2615,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. 
*/ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && - ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; } else { @@ -2492,7 +2646,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. */ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) return false; /* @@ -2562,11 +2720,119 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, BTMaxItemSizeNoHeapTid(page), RelationGetRelationName(rel)), errdetail("Index row references tuple (%u,%u) in relation \"%s\".", - ItemPointerGetBlockNumber(&newtup->t_tid), - ItemPointerGetOffsetNumber(&newtup->t_tid), + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), RelationGetRelationName(heap)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(heap, RelationGetRelationName(rel)))); } + +/* + * Given a basic tuple that contains key datum and posting list, build a + * posting tuple. Caller's "htids" array must be sorted in ascending order. + * + * Basic tuple can be a posting tuple, but we only use key part of it, all + * ItemPointers must be passed via htids. + * + * If nhtids == 1, just build a non-posting tuple. It is necessary to avoid + * storage overhead after posting tuple was vacuumed. 
+ */
+IndexTuple
+BTreeFormPostingTuple(IndexTuple tuple, ItemPointer htids, int nhtids)
+{
+	uint32		keysize,
+				newsize = 0;
+	IndexTuple	itup;
+
+	/* We only need key part of the tuple */
+	if (BTreeTupleIsPosting(tuple))
+		keysize = BTreeTupleGetPostingOffset(tuple);
+	else
+		keysize = IndexTupleSize(tuple);
+
+	Assert(nhtids > 0);
+
+	/* Add space needed for posting list */
+	if (nhtids > 1)
+		newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nhtids;
+	else
+		newsize = keysize;
+
+	newsize = MAXALIGN(newsize);
+	itup = palloc0(newsize);
+	memcpy(itup, tuple, keysize);
+	itup->t_info &= ~INDEX_SIZE_MASK;
+	itup->t_info |= newsize;
+
+	if (nhtids > 1)
+	{
+		/* Form posting tuple, fill posting fields */
+
+		itup->t_info |= INDEX_ALT_TID_MASK;
+		BTreeSetPostingMeta(itup, nhtids, SHORTALIGN(keysize));
+		/* Copy posting list into the posting tuple */
+		memcpy(BTreeTupleGetPosting(itup), htids,
+			   sizeof(ItemPointerData) * nhtids);
+
+#ifdef USE_ASSERT_CHECKING
+		{
+			/* Assert that htid array is sorted and has unique TIDs */
+			ItemPointerData last;
+			ItemPointer current;
+
+			ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+			for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+			{
+				current = BTreeTupleGetPostingN(itup, i);
+				Assert(ItemPointerCompare(current, &last) > 0);
+				ItemPointerCopy(current, &last);
+			}
+		}
+#endif
+	}
+	else
+	{
+		/* To finish building of a non-posting tuple, copy TID from htids */
+		itup->t_info &= ~INDEX_ALT_TID_MASK;
+		ItemPointerCopy(htids, &itup->t_tid);
+	}
+
+	return itup;
+}
+
+/*
+ * Note: This does not account for pg_upgrade'd !heapkeyspace indexes
+ */
+bool
+_bt_dedup_is_possible(Relation index)
+{
+	int			dedup_is_possible = false;
+
+	if (IndexRelationGetNumberOfAttributes(index) ==
+		IndexRelationGetNumberOfKeyAttributes(index))
+	{
+		int			i;
+
+		dedup_is_possible = true;
+
+		for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++)
+		{
+			Oid			opfamily = index->rd_opfamily[i];
+			Oid			collation = 
index->rd_indcollation[i]; + + /* TODO add adequate check of opclasses and collations */ + elog(DEBUG4, "index %s column i %d opfamilyOid %u collationOid %u", + RelationGetRelationName(index), i, opfamily, collation); + /* NUMERIC BTREE OPFAMILY OID is 1988 */ + if (opfamily == 1988) + { + return false; + } + } + } + + return dedup_is_possible; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dd5315c1aa..747ab4235c 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -21,8 +21,11 @@ #include "access/xlog.h" #include "access/xlogutils.h" #include "storage/procarray.h" +#include "utils/memutils.h" #include "miscadmin.h" +static MemoryContext opCtx; /* working memory for operations */ + /* * _bt_restore_page -- re-enter all the index tuples on a page * @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_dedup_is_possible = xlrec->btm_dedup_is_possible; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -181,9 +185,46 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (xlrec->postingoff == InvalidOffsetNumber) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + + /* + * A posting list split occurred during insertion. + * + * Use _bt_posting_split() to repeat posting list split steps from + * primary. 
Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. + */ + Assert(isleaf); + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* newitem must be mutable copy for _bt_posting_split() */ + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_posting_split(newitem, oposting, + xlrec->postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* insert new item */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add posting split new item"); + } PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -265,20 +306,42 @@ btree_xlog_split(bool onleft, XLogReaderState *record) BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; IndexTuple newitem = NULL, - left_hikey = NULL; + left_hikey = NULL, + nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; Page newlpage; - OffsetNumber leftoff; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; datapos = XLogRecGetBlockData(record, 0, &datalen); - if (onleft) + if (onleft || xlrec->postingoff != 0) { newitem = (IndexTuple) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + /* + * Use _bt_posting_split() to repeat posting list split steps + * from primary + */ + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* newitem must be mutable copy for _bt_posting_split() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(lpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(lpage, 
itemid); + nposting = _bt_posting_split(newitem, oposting, + xlrec->postingoff); + } } /* Extract left hikey and its size (assuming 16-bit alignment) */ @@ -304,8 +367,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record) Size itemsz; IndexTuple item; + /* Add replacement posting list when required */ + if (off == replacepostingoff) + { + Assert(onleft || xlrec->firstright == xlrec->newitemoff); + if (PageAddItem(newlpage, (Item) nposting, + MAXALIGN(IndexTupleSize(nposting)), leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new posting list item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; + } + /* add the new item if it was inserted on left page */ - if (onleft && off == xlrec->newitemoff) + else if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -379,6 +454,83 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } } +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buf; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + /* + * Initialize a temporary empty page and copy all the items to that in + * item number order. 
+ */ + Page page = (Page) BufferGetPage(buf); + OffsetNumber offnum; + BTDedupState *state; + + state = (BTDedupState *) palloc(sizeof(BTDedupState)); + + state->maxitemsize = BTMaxItemSize(page); + state->checkingunique = false; /* unused */ + state->newitem = NULL; + /* Metadata about current pending posting list */ + state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + state->overlap = false; + /* Metadata about based tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + + /* Conservatively size array */ + state->htids = palloc(state->maxitemsize); + + /* + * Iterate over tuples on the page belonging to the interval to + * deduplicate them into a posting list. + */ + for (offnum = xlrec->baseoff; + offnum < xlrec->baseoff + xlrec->nitems; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == xlrec->baseoff) + { + /* + * No previous/base tuple for first data item -- use first + * data item as base tuple of first pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else + { + /* Heap TID(s) for itup will be saved in state */ + if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "could not add heap tid to pending posting list"); + } + } + + Assert(state->nitems == xlrec->nitems); + /* Handle the last item */ + _bt_dedup_finish_pending(buf, state, false); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + static void btree_xlog_vacuum(XLogReaderState *record) { @@ -386,8 +538,8 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); +#ifdef UNUSED /* * This section of code is thought to be no longer 
needed, after analysis @@ -478,14 +630,34 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + IndexTuple updated; + Size itemsz; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updated = (IndexTuple) ((char *) updatedoffsets + + xlrec->nupdated * sizeof(OffsetNumber)); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* Handle posting tuples */ + for (int i = 0; i < xlrec->nupdated; i++) + { + PageIndexTupleDelete(page, updatedoffsets[i]); + + itemsz = MAXALIGN(IndexTupleSize(updated)); + + if (PageAddItem(page, (Item) updated, itemsz, updatedoffsets[i], + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_vacuum: failed to add updated posting list item"); + + updated = (IndexTuple) ((char *) updated + itemsz); + } + } + + if (xlrec->ndeleted) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); } /* @@ -820,7 +992,9 @@ void btree_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + oldCtx = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_BTREE_INSERT_LEAF: @@ -838,6 +1012,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_DEDUP_PAGE: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -863,6 +1040,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + 
MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 4ee6d04a68..1dde2da285 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -30,7 +30,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_insert *xlrec = (xl_btree_insert *) rec; - appendStringInfo(buf, "off %u", xlrec->offnum); + appendStringInfo(buf, "off %u; postingoff %u", + xlrec->offnum, xlrec->postingoff); break; } case XLOG_BTREE_SPLIT_L: @@ -38,16 +39,30 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, + xlrec->firstright, + xlrec->newitemoff, + xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP_PAGE: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "baseoff %u; nitems %u", + xlrec->baseoff, + xlrec->nitems); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "lastBlockVacuumed %u; nupdated %u; ndeleted %u", + xlrec->lastBlockVacuumed, + xlrec->nupdated, + xlrec->ndeleted); break; } case XLOG_BTREE_DELETE: @@ -131,6 +146,9 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_DEDUP_PAGE: + id = "DEDUPLICATE"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4a80e84aa7..593f74c26e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -107,11 +107,43 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last 
cleanup */ + bool btm_dedup_is_possible; /* whether the deduplication can be + * applied to the index */ } BTMetaPageData; #define BTPageGetMeta(p) \ ((BTMetaPageData *) PageGetContents(p)) +/* Storage type for Btree's reloptions */ +typedef struct BtreeOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int fillfactor; + double vacuum_cleanup_index_scale_factor; + bool do_deduplication; +} BtreeOptions; + +/* + * By default deduplication is enabled for non unique indexes + * and disabled for unique ones + * + * XXX: Actually, we use deduplication everywhere for now. Re-review this + * decision later on. + */ +#define BtreeDefaultDoDedup(relation) \ + (relation->rd_index->indisunique ? true : true) + +#define BtreeGetDoDedupOption(relation) \ + ((relation)->rd_options ? \ + ((BtreeOptions *) (relation)->rd_options)->do_deduplication : BtreeDefaultDoDedup(relation)) + +#define BtreeGetFillFactor(relation, defaultff) \ + ((relation)->rd_options ? \ + ((BtreeOptions *) (relation)->rd_options)->fillfactor : (defaultff)) + +#define BtreeGetTargetPageFreeSpace(relation, defaultff) \ + (BLCKSZ * (100 - BtreeGetFillFactor(relation, defaultff)) / 100) + /* * The current Btree version is 4. That's what you'll get when you create * a new index. @@ -234,8 +266,7 @@ typedef struct BTMetaPageData * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * * All other types of index tuples ("pivot" tuples) only have key columns, * since pivot tuples only exist to represent how the key space is @@ -252,6 +283,38 @@ typedef struct BTMetaPageData * omitted rather than truncated, since its representation is different to * the non-pivot representation.) 
* + * Non-pivot posting tuple format: + *	t_tid | t_info | key values | INCLUDE columns, if any | posting_list[] + * + * In order to store duplicated keys more effectively, we use a special format + * of tuples - posting tuples. posting_list is an array of ItemPointerData. + * + * Deduplication never applies to unique indexes or indexes with INCLUDEd + * columns. + * + * To distinguish posting tuples we use INDEX_ALT_TID_MASK flag in t_info and + * BT_IS_POSTING flag in t_tid. + * These flags redefine the content of the posting tuple's tid: + * - t_tid.ip_blkid contains offset of the posting list. + * - t_tid offset field contains number of posting items this tuple contains + * + * The 12 least significant offset bits from t_tid are used to represent + * the number of posting items in posting tuples, leaving 4 status + * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for + * future use. + * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting + * tuples, which is constrained by BTMaxItemSize. + * + * If a page contains so many duplicates that they do not fit into one posting + * tuple (bounded by BTMaxItemSize), the page may contain several posting + * tuples with the same key. + * Also, a page can contain both posting and non-posting tuples with the same key. + * Currently, posting tuples always contain at least two TIDs in the posting + * list. + * + * Posting tuples always have the same number of attributes as the index has + * generally. + * * Pivot tuple format: * *  t_tid | t_info | key values | [heap TID] @@ -281,23 +344,152 @@ typedef struct BTMetaPageData * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. + * BT_IS_POSTING bit must be unset for pivot tuples, since we use it + * to distinguish posting tuples from pivot tuples. 
* * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes).  They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. + * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF +#define BT_N_POSTING_OFFSET_MASK 0x0FFF #define BT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 -/* Get/set downlink block number */ +/* + * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples + * that can fit on one btree leaf page. + * + * Btree leaf pages may contain posting tuples, which store duplicates + * in a more effective way, so MaxPostingIndexTuplesPerPage is larger than + * MaxIndexTuplesPerPage. + * + * Each leaf page must contain at least three items, so estimate it as + * if we have three posting tuples with minimal size keys. + */ +#define MaxPostingIndexTuplesPerPage \ +	((int) ((BLCKSZ - SizeOfPageHeaderData - \ +			3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \ +			(sizeof(ItemPointerData))) + +/* + * State used to represent a pending posting list during deduplication. + * + * Each entry represents a group of consecutive items from the page, starting + * from page offset number 'baseoff', which is the offset number of the "base" + * tuple on the page undergoing deduplication. 
'nitems' is the total number + * of items from the page that will be merged to make a new posting tuple. + * + * Note: 'nitems' means the number of physical index tuples/line pointers on + * the page, starting with and including the item at offset number 'baseoff' + * (so nitems should be at least 2 when interval is used).  These existing + * tuples may be posting list tuples or regular tuples. + */ +typedef struct BTDedupInterval +{ +	OffsetNumber baseoff; +	OffsetNumber nitems; +} BTDedupInterval; + +/* + * Btree-private state needed to build posting tuples.  htids is an array of + * ItemPointers for the pending posting list. + * + * Iterating over tuples during index build or applying deduplication to a + * single page, we remember a "base" tuple, then compare the next one with it. + * If tuples are equal, save their TIDs in the posting list. + */ +typedef struct BTDedupState +{ +	Relation	rel; +	/* Deduplication status info for entire page/operation */ +	Size		maxitemsize;	/* BTMaxItemSize() limit for page */ +	IndexTuple	newitem; +	bool		checkingunique; + +	/* Metadata about current pending posting list */ +	ItemPointer htids;			/* Heap TIDs in pending posting list */ +	int			nhtids;			/* # valid heap TIDs in htids array */ +	int			nitems;			/* See BTDedupInterval definition */ +	Size		alltupsize;		/* Includes line pointer overhead */ +	bool		overlap;		/* Avoid overlapping posting lists? */ + +	/* Metadata about base tuple of current pending posting list */ +	IndexTuple	base;			/* Used to form new posting list */ +	OffsetNumber baseoff;		/* page offset of base */ +	Size		basetupsize;	/* base size without posting list */ + +	/* +	 * Pending posting list.  Contains information about a group of +	 * consecutive items that will be deduplicated by creating a new posting +	 * list tuple. +	 */ +	BTDedupInterval interval; +} BTDedupState; + +/* + * N.B.: BTreeTupleIsPivot() should only be used in code that deals with + * heapkeyspace indexes specifically. 
BTreeTupleIsPosting() works with all + * nbtree indexes, though. + */ +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + Assert(BTreeTupleIsPosting(itup)); \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (off)); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointer) ((char*) (itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (BTreeTupleGetPosting(itup) + (n)) + +/* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) #define BTreeInnerTupleSetDownLink(itup, blkno) \ @@ 
-326,40 +518,73 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) + +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int n) +{ + Assert(!BTreeTupleIsPosting(itup)); + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, n & BT_N_KEYS_OFFSET_MASK); +} /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Get tiebreaker heap TID attribute, if any. Works with both pivot and + * non-pivot tuples, despite differences in how heap TID is represented. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? 
*/ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &(itup->t_tid); +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. Works with + * non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + return (ItemPointer) (BTreeTupleGetPosting(itup) + + (BTreeTupleGetNPosting(itup) - 1)); + + return &(itup->t_tid); +} + /* * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * representation */ #define BTreeTupleSetAltHeapTID(itup) \ do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(BTreeTupleIsPivot(itup)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -472,6 +697,7 @@ typedef struct BTScanInsertData bool anynullkeys; bool nextkey; bool pivotsearch; + bool dedup_is_possible; ItemPointer scantid; /* tiebreaker for scankeys */ int keysz; /* Size of scankeys array */ ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ @@ -499,6 +725,13 @@ typedef struct BTInsertStateData /* Buffer containing leaf page we're likely to insert itup on */ Buffer buf; + /* + * if _bt_binsrch_insert() found the location inside existing posting + * list, save the position inside the list. This will be -1 in rare cases + * where the overlapping posting list is LP_DEAD. + */ + int postingoff; + /* * Cache of bounds within the current buffer. Only used for insertions * where _bt_check_unique is called. 
See _bt_binsrch_insert and @@ -534,7 +767,9 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a version of the + * tuple that does not include the posting list, allowing the same key to be + * returned for each logical tuple associated with the posting list. */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -563,9 +798,13 @@ typedef struct BTScanPosData /* * If we are doing an index-only scan, nextTupleOffset is the first free - * location in the associated tuple storage workspace. + * location in the associated tuple storage workspace. Posting list + * tuples need postingTupleOffset to store the current location of the + * tuple that is returned multiple times (once per heap TID in posting + * list). 
*/ int nextTupleOffset; + int postingTupleOffset; /* * The items array is always ordered in index order (ie, increasing @@ -578,7 +817,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -730,8 +969,15 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel); +extern IndexTuple _bt_posting_split(IndexTuple newitem, IndexTuple oposting, + OffsetNumber postingoff); extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); +extern void _bt_dedup_start_pending(BTDedupState *state, IndexTuple base, + OffsetNumber base_off); +extern bool _bt_dedup_save_htid(BTDedupState *state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Buffer buffer, BTDedupState *state, + bool need_wal); /* * prototypes for functions in nbtsplitloc.c @@ -743,7 +989,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool dedup_is_possible); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); @@ -751,6 +998,7 @@ extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); extern bool _bt_heapkeyspace(Relation rel); +extern bool _bt_getdedupispossible(Relation rel); extern void _bt_checkpage(Relation rel, Buffer buf); extern 
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -762,6 +1010,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdateable, BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf); @@ -812,6 +1062,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointer htids, + int nhtids); +extern bool _bt_dedup_is_possible(Relation index); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 91b9ee00cf..71f6568234 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_DEDUP_PAGE 0x50 /* deduplicate tuples on leaf page */ +/* 0x60 is unused */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -53,6 +54,7 @@ typedef struct xl_btree_metadata uint32 fastlevel; TransactionId oldest_btpo_xact; float8 last_cleanup_num_heap_tuples; + bool btm_dedup_is_possible; } xl_btree_metadata; /* @@ -61,16 +63,21 @@ typedef struct xl_btree_metadata * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. 
* Note that INSERT_META implies it's not a leaf page. * - * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 0: original page (data contains the inserted tuple); + *				 if postingoff is set, this started out as an insertion + *				 into an existing posting tuple at the offset before + *				 offnum (i.e. it's a posting list split).  (REDO will + *				 have to update the split posting list, too.) * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META * Backup Blk 2: xl_btree_metadata, if INSERT_META */ typedef struct xl_btree_insert { 	OffsetNumber offnum; +	OffsetNumber postingoff; } xl_btree_insert; -#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) +#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, postingoff) + sizeof(OffsetNumber)) /* * On insert with split, we save all the items going into the right sibling @@ -91,9 +98,19 @@ typedef struct xl_btree_insert * * Backup Blk 0: original page / new left page * - * The left page's data portion contains the new item, if it's the _L variant. - * An IndexTuple representing the high key of the left page must follow with - * either variant. + * The left page's data portion contains the new item, if it's the _L variant + * (though _R variant page split records with a posting list split sometimes + * need to include newitem).  An IndexTuple representing the high key of the + * left page must follow in all cases. + * + * The newitem is actually an "original" newitem when a posting list split + * occurs that requires that the original posting list be updated in passing. + * Recovery recognizes this case when postingoff is set, and must use the + * posting offset to do an in-place update of the existing posting list that + * was actually split, and change the newitem to the "final" newitem.  This + * corresponds to the xl_btree_insert postingoff-is-set case. 
postingoff + * won't be set when a posting list split occurs where both original posting + * list and newitem go on the right page. * * Backup Blk 1: new right page * @@ -111,10 +128,26 @@ typedef struct xl_btree_split { 	uint32		level;			/* tree level of page being split */ 	OffsetNumber firstright;	/* first item moved to right page */ -	OffsetNumber newitemoff;	/* new item's offset (useful for _L variant) */ +	OffsetNumber newitemoff;	/* new item's offset */ +	OffsetNumber postingoff;	/* offset inside orig posting tuple */ } xl_btree_split; -#define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(OffsetNumber)) + +/* + * When a page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents the interval that describes the posting tuple + * that should be added to the page. + */ +typedef struct xl_btree_dedup +{ +	OffsetNumber baseoff; +	OffsetNumber nitems; +} xl_btree_dedup; + +#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nitems) + sizeof(OffsetNumber)) /* * This is what we need to know about delete of individual leaf index tuples. @@ -166,16 +199,27 @@ typedef struct xl_btree_reuse_page * block numbers aren't given. * * Note that the *last* WAL record in any vacuum of an index is allowed to - * have a zero length array of offsets. Earlier records must have at least one. + * have a zero length array of target offsets (i.e. no deletes or updates). + * Earlier records must have at least one. */ typedef struct xl_btree_vacuum { 	BlockNumber lastBlockVacuumed; -	/* TARGET OFFSET NUMBERS FOLLOW */ +	/* +	 * This field helps us to find the beginning of the updated versions of tuples, +	 * which follow the array of offset numbers, needed when a posting list is +	 * vacuumed without killing all of its logical tuples. 
+ */ + uint32 nupdated; + uint32 ndeleted; + + /* UPDATED TARGET OFFSET NUMBERS FOLLOW (if any) */ + /* UPDATED TUPLES TO ADD BACK FOLLOW (if any) */ + /* DELETED TARGET OFFSET NUMBERS FOLLOW (if any) */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber)) /* * This is what we need to know about marking an empty branch for deletion. @@ -256,6 +300,8 @@ typedef struct xl_btree_newroot extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTXLOG_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3c0db2ccf5..2b8c6c7fc8 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) diff --git a/src/tools/valgrind.supp 
b/src/tools/valgrind.supp index ec47a228ae..71a03e39d3 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -212,3 +212,24 @@ Memcheck:Cond fun:PyObject_Realloc } + +# Temporarily work around bug in datum_image_eq's handling of the cstring +# (typLen == -2) case. datumIsEqual() is not affected, but also doesn't handle +# TOAST'ed values correctly. +# +# FIXME: Remove both suppressions when bug is fixed on master branch +{ + temporary_workaround_1 + Memcheck:Addr1 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} + +{ + temporary_workaround_8 + Memcheck:Addr8 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} -- 2.17.1