From 1837d2404ebfadeed824e904ffe4c1588b14a49b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 27 Nov 2018 16:54:17 -0800 Subject: [PATCH v9 7/7] POC: Add dynamic prefix truncation to nbtree. There is an extremely rare and subtle race condition in this commit, that seems avoidable by rethinking the exact approach taken to implementing page deletion. Note that assert-enabled builds don't trust dynamic prefix truncation, and instead verify that it's correct. That doesn't seem to cause any trouble in practice, though only because the race is remarkably narrow. --- src/backend/access/nbtree/README | 53 +++++++++++++++++++++++ src/backend/access/nbtree/nbtsearch.c | 62 ++++++++++++++++++++++++++- src/backend/access/nbtree/nbtutils.c | 2 + src/include/access/nbtree.h | 11 +++++ 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 700b940b79..83112fb61c 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -747,6 +747,59 @@ groups of duplicates align with page boundaries). In any case, getting better locality of access for index scans is more important than getting optimal space utilization. +Notes about dynamic prefix truncation +------------------------------------- + +We don't implement canonical prefix B-Trees from the Bayer & Unterauer +paper (only "simple prefix B-Trees"/suffix truncation), as truncating a +prefix from keys on the leaf level is not compatible with index-only +scans, and would probably negatively impact the performance of selective +index scans. We can nevertheless exploit the fact that there is often +redundancy in prefixes that are common to all possible tuples on a page, +especially at the leaf level. + +Binary searches on internal pages can eliminate earlier attributes from +consideration from their remaining search interval, and from the entire +subtree that the search is about to descend to the root of. 
Insertion +scan keys store mutable state that remembers which attribute future +comparisons can skip straight to. This remains valid throughout the +entire lifetime of the insertion scan key, without any need to treat +concurrent page splits and page deletions as special cases. This is +called dynamic prefix truncation. + +Recall that an insertion scan key is only used to find the initial leaf +page of an index scan. In general, it can be thought of as a structure +that searches for one exact location equal to or just after some existing +leaf level non-pivot tuple. The only exception is insertions into unique +indexes, where the exact position searched for varies over time to suit +the purposes of unique checking (a scantid is filled-in after uniqueness +is established). That special case is the main reason why we don't +continue to eliminate attributes at leaf level binary searches (we could +be more selective, but that doesn't seem worthwhile -- note that we can +still skip attributes at the leaf level in all cases). The limited and +well-defined purpose of insertion scan keys makes it safe to eliminate +whole attributes from consideration. + +Concurrent page splits cannot invalidate the skip hint, because pivot +tuples in the parent are to the left of values in child pages; the new +high key in the left half of the split must still have equal attributes +before the attribute that we skip straight to, so the decision about +whether or not we move right is still correct. + +Concurrent deletion of a child page cannot invalidate the skip hint, +either. Upon finding an ignorable page we'll move right, and find a page +whose keyspace is _smaller_ than expected. If the page to the right of a +child page is concurrently deleted, we'll keep its original downlink as +the child's high key, so nothing changes. + +XXX: What about the case where the page to the left of a child page we +land on is concurrently deleted? 
Doesn't that have basically the same +race condition as the one described at length in contrib/amcheck's +bt_downlink_check() function? It seems like there might have been some +wisdom in Lanin & Shasha's idea of making the key space move left rather +than move right during page deletion; that makes page deletion the exact +opposite of a page split, which we don't quite manage. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c1a483e8d1..105e1f4d87 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -365,7 +365,10 @@ _bt_binsrch(Relation rel, savehigh; int32 result, cmpval; - bool isleaf; + bool isleaf, + saveskipatt; + int lastcomparedattlow; + int lastcomparedatthigh; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -430,6 +433,12 @@ _bt_binsrch(Relation rel, cmpval = key->nextkey ? 0 : 1; /* select comparison value */ savehigh = high; + /* Set up state for skipping comparisons of entire attributes */ + key->lastcomparedatt = + lastcomparedattlow = + lastcomparedatthigh = + key->skiptoatt; + saveskipatt = !isleaf && key->heapkeyspace; while (high > low) { OffsetNumber mid = low + ((high - low) / 2); @@ -442,10 +451,14 @@ _bt_binsrch(Relation rel, result = _bt_nonpivot_compare(rel, key, page, mid); if (result >= cmpval) + { low = mid + 1; + lastcomparedattlow = key->lastcomparedatt; + } else { high = mid; + lastcomparedatthigh = key->lastcomparedatt; /* * high can only be reused by more restrictive binary search when @@ -454,6 +467,31 @@ _bt_binsrch(Relation rel, if (result != 0) savehigh = high; } + + if (saveskipatt) + { + int skiptoatt; + + /* + * On an internal page, caller is about to descend a downlink + * between two "separator keys" -- one in the pivot tuple that + * contains the downlink, and the other to its right (may be the + * separator in the high key). 
Since these separators provide + * bounds on the keyspace of the subtree whose root is the child + * page we're about to descend to, we may be able to safely skip + * comparisons of earlier attributes in that subtree. We can even + * safely eliminate earlier attributes from the remaining search + * interval of the current page. Remember the attribute after the + * last attribute that is definitely equal to the insertion scan + * key in the case of both separators. This should never decrease + * throughout the entire lifetime of any insertion scan key. + * + * This is explained at length in the nbtree README. + */ + skiptoatt = Min(lastcomparedattlow, lastcomparedatthigh); + Assert(key->skiptoatt <= skiptoatt); + key->skiptoatt = skiptoatt; + } } if (key->savebinsrch) @@ -522,7 +560,10 @@ _bt_compare(Relation rel, * still required. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + { + key->lastcomparedatt = key->skiptoatt; return 1; + } itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); ntupatts = BTreeTupleGetNAtts(itup, rel); @@ -591,8 +632,14 @@ _bt_tuple_compare(Relation rel, */ ncmpkey = Min(ntupatts, key->keysz); + /* Don't skip attributes on assert-enabled builds */ +#ifndef USE_ASSERT_CHECKING + scankey = key->scankeys + (key->skiptoatt - 1); + for (i = key->skiptoatt; i <= ncmpkey; i++) +#else scankey = key->scankeys; for (i = 1; i <= ncmpkey; i++) +#endif { Datum datum; bool isNull; @@ -636,6 +683,11 @@ _bt_tuple_compare(Relation rel, INVERT_COMPARE_RESULT(result); } + /* Assert that key->skiptoatt is correct */ + Assert(i >= key->skiptoatt || result == 0); + /* Record for caller that this attribute was compared */ + key->lastcomparedatt = i; + /* if the keys are unequal, return the difference */ if (result != 0) return result; @@ -643,6 +695,12 @@ _bt_tuple_compare(Relation rel, scankey++; } + /* + * Record for caller that comparison is resolved at a truncated/negative + * infinity attribute, or heap TID attribute + */ + 
key->lastcomparedatt = ncmpkey + 1;
+
 	/*
 	 * Use the number of attributes as a tie-breaker, in order to treat
 	 * truncated attributes in index as minus infinity
@@ -1253,6 +1311,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	key.heapkeyspace = _bt_heapkeyspace(rel);
 	key.savebinsrch = key.restorebinsrch = false;
 	key.low = key.high = InvalidOffsetNumber;
+	key.lastcomparedatt = 1;
+	key.skiptoatt = 1;
 	key.nextkey = nextkey;
 	key.keysz = keysCount;
 	key.scantid = NULL;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index a4964dc22c..3637459902 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -111,6 +111,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup, bool assumeheapkeyspace)
 	res->heapkeyspace = assumeheapkeyspace || _bt_heapkeyspace(rel);
 	res->savebinsrch = res->restorebinsrch = false;
 	res->low = res->high = InvalidOffsetNumber;
+	res->lastcomparedatt = 1;
+	res->skiptoatt = 1;
 	res->nextkey = false;
 	res->keysz = Min(indnkeyatts, tupnatts);
 	res->scantid = res->heapkeyspace ? BTreeTupleGetHeapTID(itup) : NULL;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d7fa9e8c49..a2d9675259 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -407,6 +407,17 @@ typedef BTStackData *BTStack;
 
 typedef struct BTScanInsertData
 {
+	/*
+	 * Mutable state used by _bt_binsrch() for "dynamic prefix
+	 * truncation", an optimization that allows later _bt_search() comparisons
+	 * to skip earlier attributes that can no longer be unequal to scankey
+	 * values.  Used by both _bt_first() and _bt_doinsert().  "skiptoatt" may
+	 * increase when a descent of a tree eliminates additional whole
+	 * attributes from consideration.  It can never decrease.
+	 */
+	int			lastcomparedatt;
+	int			skiptoatt;
+
 	/*
 	 * Mutable state used by _bt_binsrch() to inexpensively repeat a binary
 	 * search on the leaf level when only scantid has changed.  Only used for
-- 
2.17.1