From e9135038d429fe8dc47db746e9fa04627fcd604c Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 1 Oct 2018 16:48:08 -0700 Subject: [PATCH v10 5/7] Add split-at-new-tuple page split optimization. Add additional heuristics to the algorithm for locating an optimal split location. New logic identifies localized monotonically increasing values by recognizing cases where a newly inserted tuple has a heap TID that's slightly greater than that of the existing tuple to the immediate left, but isn't just a duplicate. It can greatly help space utilization to split between two groups of localized monotonically increasing values. Without this patch, affected cases will reliably leave leaf pages no more than about 50% full. 50/50 page splits are only appropriate with a pattern of truly random insertions. The optimization is very similar to the long established fillfactor optimization used during rightmost page splits, where we usually leave the new left side of the split 90% full. Split-at-new-tuple page splits target essentially the same case. The splits targeted are those at the rightmost point of a localized grouping of values, rather than those at the rightmost point of the entire key space. This enhancement is very effective at avoiding index bloat when initial bulk INSERTs for the TPC-C benchmark are run, and throughout the TPC-C benchmark. The TPC-C issue has been independently observed and reported on [1]. 
Evidently, the primary keys for all of the largest indexes in the TPC-C schema are populated through localized, monotonically increasing values: Master ====== order_line_pkey: 774 MB stock_pkey: 181 MB idx_customer_name: 107 MB oorder_pkey: 78 MB customer_pkey: 75 MB oorder_o_w_id_o_d_id_o_c_id_o_id_key: 60 MB new_order_pkey: 22 MB item_pkey: 2216 kB district_pkey: 40 kB warehouse_pkey: 24 kB Patch series, up to and including this commit ============================================= order_line_pkey: 451 MB stock_pkey: 114 MB idx_customer_name: 105 MB oorder_pkey: 45 MB customer_pkey: 48 MB oorder_o_w_id_o_d_id_o_c_id_o_id_key: 61 MB new_order_pkey: 13 MB item_pkey: 2216 kB district_pkey: 40 kB warehouse_pkey: 24 kB Without this patch, but with all previous patches in the series, a much more modest reduction in the volume of bloat occurs when the same test case is run. There is a reduction in the size of the largest index (the order line primary key) of ~5% of its original size, whereas we see a reduction of ~42% here. The problem can easily be recreated by bulk loading using benchmarkSQL (a fair use TPC-C implementation) while avoiding building indexes with CREATE INDEX [2]. Note that the patch series generally has less of an advantage over master if the indexes are initially built with CREATE INDEX (use my fork of BenchmarkSQL [3] to run a TPC-C benchmark while avoiding having CREATE INDEX mask the problems on the master branch). 
[1] https://www.commandprompt.com/blog/postgres_autovacuum_bloat_tpc-c [2] https://bitbucket.org/openscg/benchmarksql/issues/6/making-it-easier-to-recreate-postgres-tpc [3] https://github.com/petergeoghegan/benchmarksql/tree/nbtree-customizations --- src/backend/access/nbtree/nbtsplitloc.c | 164 ++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 86cde0206c..c707e2f4c6 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -62,6 +62,9 @@ static OffsetNumber _bt_dofindsplitloc(Relation rel, Page page, static int _bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int dataitemstoleft, Size firstoldonrightsz); +static bool _bt_splitatnewitem(Relation rel, Page page, int leaffillfactor, + OffsetNumber newitemoff, IndexTuple newitem, + double *propfullonleft); static OffsetNumber _bt_bestsplitloc(Relation rel, Page page, FindSplitData *state, int perfectpenalty, @@ -72,6 +75,7 @@ static int _bt_perfect_penalty(Relation rel, Page page, FindSplitData *state, SplitMode *secondmode); static int _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff, IndexTuple newitem, SplitPoint *split, bool is_leaf); +static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); /* @@ -243,6 +247,12 @@ _bt_dofindsplitloc(Relation rel, state.propfullonleft = leaffillfactor / 100.0; state.is_weighted = true; } + else if (_bt_splitatnewitem(rel, page, leaffillfactor, newitemoff, + newitem, &state.propfullonleft)) + { + /* propfullonleft was set for us */ + state.is_weighted = true; + } else { /* propfullonleft won't be used, but be tidy */ @@ -540,6 +550,132 @@ _bt_checksplitloc(FindSplitData *state, return INT_MAX; } +/* + * Subroutine to determine whether or not the page should be split at the + * point that the new/incoming item would have been inserted, leaving the + * 
incoming tuple as the last tuple on the new left page. When the new item + * is at the first or last offset, a fillfactor is applied so that space + * utilization is comparable to the traditional rightmost split case. + * + * This routine is primarily concerned with composite indexes that consist + * of one or more leading columns that describe some grouping, plus a + * trailing, monotonically increasing column. This usage pattern is + * prevalent in many real world applications. Consider the example of a + * composite index on (supplier_id, invoice_id), where there are a small, + * nearly-fixed number of suppliers, and invoice_id is a monotonically + * increasing identifier (it doesn't matter whether or not suppliers are + * assigned invoice_id values from the same counter, or their own counter). + * Without this optimization, approximately 50% of space in leaf pages will + * be wasted by unweighted/50:50 page splits. With this optimization, space + * utilization will be close to optimal. There may be excessive amounts of + * free space remaining on right pages where only one supplier is + * represented if the supplier has few distinct invoice_id values, but that + * problem should be self-limiting. + * + * Secondarily, DESC-ordered insertions are recognized here, though not for + * single attribute indexes, where explicitly using DESC ordering doesn't + * make sense. It seems worthwhile to try to get rightmost style space + * utilization for cases like explicitly-DESC date columns. + * + * Caller uses propfullonleft rather than using the new item offset directly + * because not all offsets will be deemed legal as split points. 
+ */ +static bool +_bt_splitatnewitem(Relation rel, Page page, int leaffillfactor, + OffsetNumber newitemoff, IndexTuple newitem, + double *propfullonleft) +{ + OffsetNumber maxoff; + int16 nkeyatts; + ItemId itemid; + IndexTuple tup; + Size tupspace; + Size hikeysize; + int keepnatts; + + maxoff = PageGetMaxOffsetNumber(page); + + /* Proceed only when items on page look fairly short */ + if (maxoff < MaxIndexTuplesPerPage / 2) + return false; + + nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Single key indexes not considered here */ + if (nkeyatts == 1) + return false; + + /* + * Assume that the optimization won't be useful unless tuples are of + * uniform size, with the exception of the high key, which may already be + * truncated. + */ + Assert(!P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page))); + tupspace = ((PageHeader) page)->pd_special - ((PageHeader) page)->pd_upper; + itemid = PageGetItemId(page, P_HIKEY); + hikeysize = ItemIdGetLength(itemid); + if (IndexTupleSize(newitem) * (maxoff - 1) != tupspace - hikeysize) + return false; + + /* + * When heap TIDs appear in DESC order, consider left-heavy split. + * + * Accept left-heavy split when new item, which will be inserted at first + * data offset, has adjacent TID to extant item at that position. This is + * considered equivalent to a rightmost split, so apply flipped-around + * fillfactor. 
+ */ + if (newitemoff == P_FIRSTKEY) + { + itemid = PageGetItemId(page, P_FIRSTKEY); + tup = (IndexTuple) PageGetItem(page, itemid); + keepnatts = _bt_keep_natts_fast(rel, tup, newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + *propfullonleft = (double) Max(100 - leaffillfactor, + BTREE_MIN_FILLFACTOR) / 100.0; + return true; + } + + return false; + } + + /* + * At least the first attribute must be equal, but new item cannot be a + * simple duplicate of item that belongs to its immediate left + */ + if (newitemoff > maxoff) + { + itemid = PageGetItemId(page, maxoff); + tup = (IndexTuple) PageGetItem(page, itemid); + keepnatts = _bt_keep_natts_fast(rel, tup, newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + *propfullonleft = (double) leaffillfactor / 100.0; + return true; + } + + return false; + } + + /* When item isn't first or last on origpage, consider heap TID too */ + itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff)); + tup = (IndexTuple) PageGetItem(page, itemid); + if (!_bt_adjacenthtid(&tup->t_tid, &newitem->t_tid)) + return false; + keepnatts = _bt_keep_natts_fast(rel, tup, newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + *propfullonleft = (double) newitemoff / (((double) maxoff + 1)); + return true; + } + + return false; +} + /* * Subroutine to find the "best" split point among an array of acceptable * candidate split points that split without there being an excessively high @@ -820,3 +956,31 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff, Assert(lastleft != firstright); return _bt_keep_natts_fast(rel, lastleft, firstright); } + +/* + * Subroutine for determining if two heap TIDs are "adjacent". + * + * Adjacent means that the high TID is very likely to have been inserted into + * heap relation immediately after the low TID, probably by the same + * transaction. 
+ */ +static bool +_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) +{ + BlockNumber lowblk, + highblk; + + lowblk = ItemPointerGetBlockNumber(lowhtid); + highblk = ItemPointerGetBlockNumber(highhtid); + + /* Make optimistic assumption of adjacency when heap blocks match */ + if (lowblk == highblk) + return true; + + /* When heap block one up, second offset should be FirstOffsetNumber */ + if (lowblk + 1 == highblk && + ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber) + return true; + + return false; +} -- 2.17.1