From 7a0dd644a1b26cc3c80469aea0e2c4edc3d86f8a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v11 5/7] Add split-at-new-tuple page split optimization.

Add additional heuristics to the algorithm for locating an optimal split
location.  New logic identifies localized monotonically increasing values by
recognizing cases where a newly inserted tuple has a heap TID that's slightly
greater than that of the existing tuple to the immediate left, but isn't just
a duplicate.  It can greatly help space utilization to split between two
groups of localized monotonically increasing values.

Without this patch, affected cases will reliably leave leaf pages no more
than about 50% full.  50/50 page splits are only appropriate with a pattern
of truly random insertions.  The optimization is very similar to the long
established fillfactor optimization used during rightmost page splits, where
we usually leave the new left side of the split 90% full.  Split-at-new-tuple
page splits target essentially the same case.  The splits targeted are those
at the rightmost point of a localized grouping of values, rather than those
at the rightmost point of the entire key space.

This enhancement is very effective at avoiding index bloat when initial bulk
INSERTs for the TPC-C benchmark are run, and throughout the TPC-C benchmark.
The TPC-C issue has been independently observed and reported on [1].
Evidently, the primary keys for all of the largest indexes in the TPC-C
schema are populated through localized, monotonically increasing values:

Master
======

order_line_pkey: 774 MB
stock_pkey: 181 MB
idx_customer_name: 107 MB
oorder_pkey: 78 MB
customer_pkey: 75 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 60 MB
new_order_pkey: 22 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Patch series, up to and including this commit
=============================================

order_line_pkey: 451 MB
stock_pkey: 114 MB
idx_customer_name: 105 MB
oorder_pkey: 45 MB
customer_pkey: 48 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 61 MB
new_order_pkey: 13 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Without this patch, but with all previous patches in the series, a much more
modest reduction in the volume of bloat occurs when the same test case is
run: the size of the largest index (the order line primary key) is reduced by
~5% of its original size, whereas we see a reduction of ~42% here.

The problem can easily be recreated by bulk loading using BenchmarkSQL (a
fair-use TPC-C implementation) while avoiding building indexes with CREATE
INDEX [2].  Note that the patch series generally has less of an advantage
over master if the indexes are initially built with CREATE INDEX (use my fork
of BenchmarkSQL [3] to run a TPC-C benchmark while avoiding having CREATE
INDEX mask the problems on the master branch).
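To make the effect of the heuristics concrete, here is a minimal,
illustrative-only C sketch (not part of the patch) of the proportion of the
original page that the new logic aims to leave filled on the left half
("propfullonleft") in each of the three cases it recognizes.  The constants
are assumptions chosen for the example: a leaf fillfactor of 90, a
BTREE_MIN_FILLFACTOR value of 10, a page already holding maxoff = 200 items,
and an interior insertion at offset 150.

    #include <stdio.h>

    #define Max(x, y)            ((x) > (y) ? (x) : (y))
    #define LEAF_FILLFACTOR      90   /* assumed default leaf fillfactor */
    #define BTREE_MIN_FILLFACTOR 10   /* assumed value of the nbtree constant */

    int
    main(void)
    {
        int     maxoff = 200;     /* assumed number of items already on the page */
        int     newitemoff = 150; /* assumed interior insertion offset */

        /* New item after the last offset: infer an ascending insertion pattern */
        double  ascending = (double) LEAF_FILLFACTOR / 100.0;

        /* New item at the first offset: infer a descending insertion pattern */
        double  descending = (double) Max(100 - LEAF_FILLFACTOR,
                                          BTREE_MIN_FILLFACTOR) / 100.0;

        /* Interior insertion with an adjacent heap TID: split at the new item */
        double  interior = (double) newitemoff / ((double) maxoff + 1);

        printf("ascending %.2f, descending %.2f, interior %.2f\n",
               ascending, descending, interior);
        return 0;
    }

With these assumptions the sketch prints 0.90, 0.10 and 0.75: the ascending
and interior cases pack the new left page densely, while the descending case
leaves it mostly empty for the smaller values expected to arrive next.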
[1] https://www.commandprompt.com/blog/postgres_autovacuum_bloat_tpc-c
[2] https://bitbucket.org/openscg/benchmarksql/issues/6/making-it-easier-to-recreate-postgres-tpc
[3] https://github.com/petergeoghegan/benchmarksql/tree/nbtree-customizations
---
 src/backend/access/nbtree/nbtsplitloc.c | 184 ++++++++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index 7f337bac55..3edf97bfeb 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -62,6 +62,9 @@ static OffsetNumber _bt_dofindsplitloc(Relation rel, Page page,
 static int	_bt_checksplitloc(FindSplitData *state,
 				  OffsetNumber firstoldonright, bool newitemonleft,
 				  int dataitemstoleft, Size firstoldonrightsz);
+static bool _bt_splitatnewitem(Relation rel, Page page, int leaffillfactor,
+				   OffsetNumber newitemoff, IndexTuple newitem,
+				   double *propfullonleft);
 static OffsetNumber _bt_bestsplitloc(Relation rel, Page page,
 				  FindSplitData *state,
 				  int perfectpenalty,
@@ -72,6 +75,7 @@ static int _bt_perfect_penalty(Relation rel, Page page, FindSplitData *state,
 				  SplitMode *secondmode);
 static int _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 				  IndexTuple newitem, SplitPoint *split, bool is_leaf);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 
 
 /*
@@ -243,6 +247,12 @@ _bt_dofindsplitloc(Relation rel,
 		state.propfullonleft = leaffillfactor / 100.0;
 		state.is_weighted = true;
 	}
+	else if (_bt_splitatnewitem(rel, page, leaffillfactor, newitemoff,
+								newitem, &state.propfullonleft))
+	{
+		/* propfullonleft was set for us */
+		state.is_weighted = true;
+	}
 	else
 	{
 		/* propfullonleft won't be used, but be tidy */
@@ -540,6 +550,152 @@ _bt_checksplitloc(FindSplitData *state,
 	return INT_MAX;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split at the
+ * point that the new/incoming item would have been inserted, leaving the
+ * incoming tuple as the last tuple on the new left page.  When the new item
+ * is at the first or last offset, a fillfactor is applied so that space
+ * utilization is comparable to the traditional rightmost split case.
+ *
+ * This routine targets splits in composite indexes that consist of one or
+ * more leading columns that describe some grouping, plus a trailing column
+ * with ascending (or descending) values.  This pattern is prevalent in real
+ * world applications.  Consider the example of a composite index on
+ * (supplier_id, invoice_id), where there are a small, nearly-fixed number of
+ * suppliers, and invoice_id is an identifier assigned in ascending order (it
+ * doesn't matter whether or not suppliers are assigned invoice_id values
+ * from the same counter, or their own counter).  Without this optimization,
+ * approximately 50% of space in leaf pages will be wasted by
+ * unweighted/50:50 page splits.  With this optimization, space utilization
+ * will be close to that of a similar index where all tuple insertions
+ * modify the current rightmost leaf page in the index.
+ *
+ * This optimization may leave extra free space remaining on the rightmost
+ * page of a "most significant column" grouping of tuples if that grouping
+ * never ends up having future insertions that use the free space.  Testing
+ * has shown the effect to be self-limiting; a future grouping that becomes
+ * the "nearest on the right" grouping of the affected grouping usually puts
+ * the extra free space to good use instead.
+ *
+ * Caller uses propfullonleft rather than using the new item offset directly
+ * because not all offsets will be deemed legal as split points.  This also
+ * allows us to apply leaf fillfactor in the common case where the new
+ * insertion is after the last offset (or at the first offset).
+ */
+static bool
+_bt_splitatnewitem(Relation rel, Page page, int leaffillfactor,
+				   OffsetNumber newitemoff, IndexTuple newitem,
+				   double *propfullonleft)
+{
+	OffsetNumber maxoff;
+	int16		nkeyatts;
+	ItemId		itemid;
+	IndexTuple	tup;
+	Size		tupspace;
+	Size		hikeysize;
+	int			keepnatts;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Proceed only when items on page look fairly short */
+	if (maxoff < MaxIndexTuplesPerPage / 2)
+		return false;
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+	/* Single key indexes not considered here */
+	if (nkeyatts == 1)
+		return false;
+
+	/*
+	 * Avoid applying the optimization when tuples are not all of uniform
+	 * size, with the exception of the high key (the existing high key may
+	 * be smaller due to truncation).  Surmise that the page has equisized
+	 * tuples when the page layout is consistent with having maxoff-1
+	 * non-pivot tuples that are all the same size as the newly inserted
+	 * tuple.
+	 */
+	tupspace = ((PageHeader) page)->pd_special - ((PageHeader) page)->pd_upper;
+	Assert(!P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)));
+	itemid = PageGetItemId(page, P_HIKEY);
+	hikeysize = ItemIdGetLength(itemid);
+	if (IndexTupleSize(newitem) * (maxoff - 1) != tupspace - hikeysize)
+		return false;
+
+	/*
+	 * At least the first attribute's value must be equal to the
+	 * corresponding value in the antecedent tuple to apply the
+	 * optimization.  The new item cannot be a duplicate, either.
+	 */
+	if (newitemoff == P_FIRSTKEY)
+	{
+		/* Try to infer descending insertion pattern */
+		itemid = PageGetItemId(page, P_FIRSTKEY);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+		keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+		{
+			*propfullonleft = (double) Max(100 - leaffillfactor,
+										   BTREE_MIN_FILLFACTOR) / 100.0;
+			return true;
+		}
+
+		return false;
+	}
+	else if (newitemoff > maxoff)
+	{
+		/* Try to infer ascending insertion pattern */
+		itemid = PageGetItemId(page, maxoff);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+		keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+		{
+			*propfullonleft = (double) leaffillfactor / 100.0;
+			return true;
+		}
+
+		return false;
+	}
+
+	/*
+	 * When the item isn't first or last on the page, try to infer an
+	 * ascending insertion pattern.  We try to split at the precise point of
+	 * the insertion here, rather than applying leaf fillfactor.
+	 *
+	 * "Low cardinality leading column, high cardinality suffix column"
+	 * indexes with a random insertion pattern (e.g. an index on
+	 * '(country_id, event_uuid)') may sometimes end up having the
+	 * optimization applied instead of getting a 50:50 (unweighted) page
+	 * split.  This is suboptimal.
+	 *
+	 * We're willing to accept that outcome when an incoming/new tuple is
+	 * either to the left or to the right of all existing items on the page,
+	 * since that's expected for less than 1% of all page splits that occur
+	 * in the index's lifetime (assuming default BLCKSZ).
+	 * More care must be taken here, where we consider splits involving the
+	 * new item being inserted at neither edge of the page: we proceed only
+	 * when the new item's heap TID is "adjacent" to the heap TID of the
+	 * existing tuple to the immediate left of the offset for the new item.
+	 * Heap TID adjacency strongly suggests that the item just to the left
+	 * was inserted very recently.
+	 */
+	itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff));
+	tup = (IndexTuple) PageGetItem(page, itemid);
+	if (!_bt_adjacenthtid(&tup->t_tid, &newitem->t_tid))
+		return false;
+	/* Also check the usual conditions */
+	keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+	if (keepnatts > 1 && keepnatts <= nkeyatts)
+	{
+		*propfullonleft = (double) newitemoff / ((double) maxoff + 1);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
@@ -820,3 +976,31 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 	Assert(lastleft != firstright);
 	return _bt_keep_natts_fast(rel, lastleft, firstright);
 }
+
+/*
+ * Subroutine for determining whether two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted
+ * into the heap relation immediately after the low TID, probably by the
+ * same transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+
+	/* Make optimistic assumption of adjacency when heap blocks match */
+	if (lowblk == highblk)
+		return true;
+
+	/* When heap block is one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk &&
+		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
-- 
2.17.1
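
As a supplementary illustration of the heap TID adjacency test added by the
patch, the following standalone C sketch mirrors the logic of
_bt_adjacenthtid() using plain block/offset pairs instead of PostgreSQL's
ItemPointerData.  The struct, function name, constant, and the tiny test
harness are invented for the example; this is not code from the patch.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FIRST_OFFSET_NUMBER 1   /* stands in for FirstOffsetNumber */

    typedef struct
    {
        uint32_t    block;          /* heap block number */
        uint16_t    offset;         /* 1-based offset within the block */
    } HeapTid;

    /* Mirrors _bt_adjacenthtid(): is "high" likely inserted right after "low"? */
    static bool
    adjacent_htid(HeapTid low, HeapTid high)
    {
        /* Optimistically assume adjacency when the heap blocks match */
        if (low.block == high.block)
            return true;

        /* Heap block one up: the high TID must be the first item of its block */
        if (low.block + 1 == high.block && high.offset == FIRST_OFFSET_NUMBER)
            return true;

        return false;
    }

    int
    main(void)
    {
        HeapTid     a = {100, 17};
        HeapTid     b = {100, 18};  /* same block: adjacent */
        HeapTid     c = {101, 1};   /* next block, first offset: adjacent */
        HeapTid     d = {101, 5};   /* next block, later offset: not adjacent */

        printf("%d %d %d\n", adjacent_htid(a, b), adjacent_htid(a, c),
               adjacent_htid(a, d));
        return 0;
    }

With these made-up TIDs the sketch prints "1 1 0", matching the two ways the
patch treats TIDs as adjacent: same heap block, or the immediately following
heap block with the high TID at its first offset.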