/*-------------------------------------------------------------------------
 *
 * nbtsplitloc.c
 *	  Choose split point code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsplitloc.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "storage/lmgr.h"

/*
 * One feasible split point, as recorded by _bt_checksplitloc().  The split
 * falls between lastleft_tuple and firstright_tuple.
 */
typedef struct
{
	/* FindSplitData candidate split */
	int			leftfree;		/* space left free on left half by this split */
	int			rightfree;		/* space left free on right half */
	OffsetNumber firstoldonright;	/* first pre-existing tuple on right half */
	bool		newitemonleft;	/* new item goes to the left half? */
	IndexTuple	lastleft_tuple;	/* last tuple on the left half */
	IndexTuple	firstright_tuple;	/* first tuple on the right half */
}			SplitPoint;

/*
 * State used while choosing a split point.  _bt_findsplitloc() fills this
 * in; _bt_checksplitloc() appends feasible splits to 'candidates', and
 * _bt_findbestsplitloc() searches those for the best one.
 */
typedef struct
{
	/* context data for _bt_checksplitloc */
	Relation	rel;			/* index relation being split */
	bool		is_leaf;		/* T if splitting a leaf page */
	OffsetNumber newitemoff;	/* where the new item is to be inserted */
								/* NOTE(review): not read by the split-choice
								 * routines in this file — confirm it's needed */
	int			leftspace;		/* space available for items on left page */
	int			rightspace;		/* space available for items on right page */
	int			dataitemstotal; /* space taken by all items, old and new */

	int			ncandidates;	/* current number of split candidates */
	SplitPoint *candidates;		/* sorted by offset number */

	/* context data for _bt_findbestsplitloc */
	int			optimal_natts;	/* fewest key attrs any candidate must keep */
	bool		all_duplicates;	/* page holds nothing but duplicates? */
	double		goodenough;		/* good enough left/right space delta */
	double		propfullonleft; /* want propfullonleft * leftfree on left */
	bool		is_weighted;	/* T if propfullonleft used by split */
	SplitPoint *best_candidate; /* best candidate seen so far, or NULL */
	double		best_delta;		/* (weighted) space delta of best_candidate */
} FindSplitData;

/* Local routines: record one candidate split; search candidates for best */
static void _bt_checksplitloc(FindSplitData *state, int olddataitemstoleft,
				  OffsetNumber firstoldonright, bool newitemonleft,
				  IndexTuple lastleftitem, IndexTuple firstrightitem,
				  Size firstrightitemsz);
static void _bt_findbestsplitloc(FindSplitData *state, int left, int right, int level);

/*
 *	_bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The main goal here is to equalize the free space that will be on each
 * split page, *after accounting for the inserted tuple*.  (If we fail to
 * account for it, we might find ourselves with too little room on the page
 * that it needs to go into!)
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of.  (This could be
 * maxoff+1 if the tuple is to go at the end.)
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page, plus a boolean indicating whether the new tuple goes on
 * the left or right page.  The bool is necessary to disambiguate the case
 * where firstright == newitemoff.
 *
 * The high key for the left page is formed using the first item on the
 * right page, which may seem to be contrary to Lehman & Yao's approach of
 * using the left page's last item as its new high key.  It isn't, though;
 * suffix truncation will leave the left page's high key equal to the last
 * item on the left page when two tuples with equal key values enclose the
 * split point.  It's convenient to always express a split point as a
 * firstright offset due to internal page splits, which leave us with a
 * right half whose first item becomes a negative infinity item through
 * truncation to 0 attributes.  In effect, internal page splits store
 * firstright's "separator" key at the end of the left page (as left's new
 * high key), and store its downlink at the start of the right page.  In
 * other words, internal page splits conceptually split in the middle of the
 * firstright tuple, not on either side of it.  Crucially, when splitting
 * either a leaf page or an internal page, the new high key will be strictly
 * less than the first item on the right page in all cases, despite the fact
 * that we start with the assumption that firstright becomes the new high
 * key.
 */
OffsetNumber
_bt_findsplitloc(Relation rel,
				 Page page,
				 OffsetNumber newitemoff,
				 Size newitemsz,
				 IndexTuple newitem,
				 bool *newitemonleft)
{
	BTPageOpaque opaque;
	OffsetNumber offnum;
	OffsetNumber firstoff;
	OffsetNumber maxoff;
	ItemId		itemid;
	FindSplitData state;
	int			leftspace,
				rightspace,
				olddataitemstotal,
				dataitemstoleft,
				leaffillfactor;
	SplitPoint *candidates;
	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
	OffsetNumber finalfirstright;
	IndexTuple	previtem;

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Total free space available on a btree page, after fixed overhead */
	leftspace = rightspace =
		PageGetPageSize(page) - SizeOfPageHeaderData -
		MAXALIGN(sizeof(BTPageOpaqueData));

	/* The right page will have the same high key as the old page */
	if (!P_RIGHTMOST(opaque))
	{
		itemid = PageGetItemId(page, P_HIKEY);
		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
							 sizeof(ItemIdData));
	}

	/* Count up total space in data items without actually scanning 'em */
	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
	leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);

	/*
	 * Worst case is one candidate per loop position (maxoff - firstoff of
	 * them), plus one extra at newitemoff (tried both ways) or one for the
	 * new-item-goes-last case, so maxoff + 1 entries always suffice.
	 */
	candidates = (SplitPoint *) palloc((maxoff + 1) * sizeof(SplitPoint));

	state.rel = rel;
	state.is_leaf = P_ISLEAF(opaque);
	state.newitemoff = newitemoff;	/* previously left uninitialized */
	state.leftspace = leftspace;
	state.rightspace = rightspace;
	state.dataitemstotal = olddataitemstotal + newitemsz;
	state.candidates = candidates;
	state.ncandidates = 0;

	/*
	 * Scan through the data items and calculate space usage for a split at
	 * each possible position.  All the possible splits are recorded in
	 * 'state.candidates' array.  Every candidate leaves at least one item on
	 * each half, so we begin by "consuming" the leftmost item (which is the
	 * new item, when it goes first) before considering any split point.
	 */
	firstoff = P_FIRSTDATAKEY(opaque);
	if (newitemoff == firstoff)
	{
		IndexTuple	firstolditem;
		Size		firstolditemsz;

		previtem = newitem;
		dataitemstoleft = newitemsz;

		/*
		 * The loop below starts at firstoff + 1, so when the new item goes
		 * first we must deal with the old item at firstoff here: consider
		 * the split that leaves only the new item on the left page, and
		 * account for the old item's space.  (Previously the first old item
		 * was skipped entirely, so that split point was never considered and
		 * every later candidate's left-side space accounting came up short
		 * by one item.)
		 */
		itemid = PageGetItemId(page, firstoff);
		firstolditem = (IndexTuple) PageGetItem(page, itemid);
		firstolditemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		_bt_checksplitloc(&state, dataitemstoleft, firstoff, true,
						  previtem, firstolditem, firstolditemsz);
		dataitemstoleft += firstolditemsz;
		previtem = firstolditem;
	}
	else
	{
		itemid = PageGetItemId(page, firstoff);
		previtem = (IndexTuple) PageGetItem(page, itemid);
		dataitemstoleft = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
	}

	for (offnum = firstoff + 1;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		IndexTuple	item;
		Size		itemsz;

		itemid = PageGetItemId(page, offnum);
		item = (IndexTuple) PageGetItem(page, itemid);
		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		/*
		 * Will the new item go to left or right of split?
		 */
		if (offnum < newitemoff)
		{
			_bt_checksplitloc(&state, dataitemstoleft, offnum, false, previtem, item, itemsz);
			dataitemstoleft += itemsz;
			previtem = item;
		}
		else if (offnum == newitemoff)
		{
			/* need to try it both ways! */
			_bt_checksplitloc(&state, dataitemstoleft, offnum, false, previtem, newitem, newitemsz);
			dataitemstoleft += newitemsz;
			previtem = newitem;

			_bt_checksplitloc(&state, dataitemstoleft, offnum, true, previtem, item, itemsz);
			dataitemstoleft += itemsz;
			previtem = item;
		}
		else
		{
			_bt_checksplitloc(&state, dataitemstoleft, offnum, true, previtem, item, itemsz);
			dataitemstoleft += itemsz;
			previtem = item;
		}
	}

	/*
	 * If the new item goes as the last item, check for splitting so that all
	 * the old items go to the left page and the new item goes to the right
	 * page.  Since no old item goes to the right half, firstoldonright is
	 * newitemoff (== maxoff + 1 here), per the convention documented above
	 * _bt_checksplitloc.  (Passing maxoff, as was done before, claimed that
	 * the last old item goes right while its space was counted on the left.)
	 */
	if (newitemoff > maxoff)
		_bt_checksplitloc(&state, olddataitemstotal, newitemoff, false,
						  previtem, newitem, newitemsz);

	/*
	 * I believe it is not possible to fail to find a feasible split, but
	 * defend against it before touching the (then empty) candidates array
	 * below; previously this case read candidates[-1].
	 */
	if (state.ncandidates == 0)
		elog(ERROR, "could not find a feasible split point for index \"%s\"",
			 RelationGetRelationName(rel));

	/*
	 * Try to select a point where a distinguishing attribute appears earlier
	 * in the new high key for the left side of the split, in order to
	 * maximize the number of trailing attributes that can be truncated away.
	 * This is even useful with pages that only have a single (non-TID)
	 * attribute, since it's helpful to avoid appending an explicit heap TID
	 * attribute to the new pivot tuple (high key/downlink) when it cannot
	 * actually be truncated.  Note that it is always assumed that caller goes
	 * on to perform truncation, even with pg_upgrade'd indexes where that
	 * isn't actually the case.  There is still a modest benefit to choosing a
	 * split location while weighing suffix truncation: the resulting
	 * (untruncated) pivot tuples are nevertheless more predictive of future
	 * space utilization.
	 *
	 * Among all possible split points with the maximum amount of suffix
	 * truncation, we choose a split point that balances the left and right
	 * pages the best.  If the page is the rightmost page on its level,
	 * however, we instead try to arrange to leave the left split page
	 * fillfactor% full.  In this way, when we are inserting successively
	 * increasing keys (consider sequences, timestamps, etc) we will end up
	 * with a tree whose pages are about fillfactor% full, instead of the 50%
	 * full result that we'd get without this special case.  This is the same
	 * as nbtsort.c produces for a newly-created tree.  Note that leaf and
	 * nonleaf pages use different fillfactors.
	 *
	 * If the page is entirely full of duplicates, we try to leave the left
	 * split page even more full than in the fillfactor% rightmost page case.
	 * This maximizes space utilization in cases where tuples with the same
	 * attribute values span across many pages.  Newly inserted duplicates
	 * will tend to have higher heap TID values, so we'll end up splitting to
	 * the right in the manner of ascending insertions of monotonically
	 * increasing values.  See nbtree/README for more information about suffix
	 * truncation, and how a split point is chosen.
	 */

	/*
	 * Compute how many attributes we need to keep, in the most distinguishing
	 * possible split among all the candidate splits.  We do this by comparing
	 * the leftmost candidate's last-left item, with the rightmost candidate's
	 * first-right item.  The assumption here is that any split point in
	 * between those must keep at least as many keys.
	 *
	 * XXX: That's not quite true, because _bt_keep_natts_fast() uses binary
	 * comparison, and thus isn't totally accurate!
	 */
	state.optimal_natts = _bt_keep_natts_fast(rel,
											  candidates[0].lastleft_tuple,
											  candidates[state.ncandidates - 1].firstright_tuple);
	if (!state.is_leaf)
	{
		if (P_RIGHTMOST(opaque))
		{
			state.propfullonleft = BTREE_NONLEAF_FILLFACTOR / 100.0;
			state.is_weighted = true;
		}
		else
		{
			state.propfullonleft = 0.5;
			state.is_weighted = false;
		}
	}
	else if (state.optimal_natts > indnkeyatts)
	{
		/* The page is full of duplicates. */
		state.propfullonleft = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
		state.is_weighted = true;
	}
	else if (P_RIGHTMOST(opaque))
	{
		state.propfullonleft = leaffillfactor / 100.0;
		state.is_weighted = true;
	}
	else
	{
		state.propfullonleft = 0.5;
		state.is_weighted = false;
	}
	state.all_duplicates = (state.optimal_natts > indnkeyatts);

	/*
	 * Finding the best possible split would require checking all the possible
	 * split points, because of the high-key and left-key special cases.
	 * That's probably more work than it's worth; instead, stop as soon as we
	 * find a sufficiently "good-enough" split, where good-enough is defined
	 * as an imbalance in free space of no more than pagesize/16
	 * (arbitrary...) This should let us stop near the middle on most pages,
	 * instead of plowing through all candidates.
	 */
	state.goodenough = leftspace / 16;

	/*
	 * Find the best (or good enough) split among the candidates.
	 */
	state.best_candidate = NULL;
	state.best_delta = INT_MAX;
	_bt_findbestsplitloc(&state, 0, state.ncandidates - 1, 0);

	/*
	 * We checked above that at least one candidate exists, and the search
	 * accepts any candidate that improves on INT_MAX, so this should never
	 * fire; keep it as a backstop.
	 */
	if (!state.best_candidate)
		elog(ERROR, "could not find a feasible split point for index \"%s\"",
			 RelationGetRelationName(rel));

	finalfirstright = state.best_candidate->firstoldonright;
	*newitemonleft = state.best_candidate->newitemonleft;

	pfree(state.candidates);

	return finalfirstright;
}

/*
 * Subroutine to analyze a particular possible split choice (ie, firstright
 * and newitemonleft settings), and record it in state->candidates, if a
 * split is possible at this point.
 *
 * firstoldonright is the offset of the first item on the original page
 * that goes to the right page.  firstoldonright can be > max offset, which
 * means that all the old items go to the left page and only the new item goes
 * to the right page.
 *
 * lastleftitem and firstrightitem are the tuples, between which the split
 * is made.  firstrightitemsz is the size of 'firstrightitem'.
 *
 * dataitemstoleft is the total size of all items that go to the left page,
 * i.e. everything in front of 'firstoldonright' (plus the new item, when
 * newitemonleft is true).
 *
 * Nothing is returned; if both halves would fit, the split is appended to
 * state->candidates.
 */
static void
_bt_checksplitloc(FindSplitData *state,
				  int dataitemstoleft,
				  OffsetNumber firstoldonright,
				  bool newitemonleft,
				  IndexTuple lastleftitem,
				  IndexTuple firstrightitem,
				  Size firstrightitemsz)
{
	int			leftfree;
	int			rightfree;
	SplitPoint *cand;

	/*
	 * Free space that would remain on each half: the left page holds
	 * dataitemstoleft worth of items, the right page holds everything else.
	 */
	leftfree = state->leftspace - dataitemstoleft;
	rightfree = state->rightspace - (state->dataitemstotal - dataitemstoleft);

	/*
	 * The first item on the right page becomes the high key of the left page;
	 * therefore it counts against left space as well as right space (we
	 * cannot assume that suffix truncation will make it any smaller).  When
	 * index has included attributes, then those attributes of left page high
	 * key will be truncated leaving that page with slightly more free space.
	 * However, that shouldn't affect our ability to find valid split
	 * location, since we err in the direction of being pessimistic about free
	 * space on the left half.  Besides, even when suffix truncation of
	 * non-TID attributes occurs, there often won't be an entire MAXALIGN()
	 * quantum in pivot space savings.
	 *
	 * On the leaf level, assume that suffix truncation cannot avoid adding a
	 * heap TID to the left half's new high key.  In practice the new high key
	 * will often be smaller and will rarely be larger, but conservatively
	 * assume the worst case.
	 *
	 * On internal levels, by contrast, the key data of the first right item
	 * can be discarded from its new (downlink) form on the right page.
	 */
	if (state->is_leaf)
		leftfree -= (int) (firstrightitemsz +
						   MAXALIGN(sizeof(ItemPointerData)));
	else
	{
		leftfree -= (int) firstrightitemsz;
		rightfree += (int) firstrightitemsz -
			(int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
	}

	/* If either half would overflow, this split is infeasible; skip it */
	if (leftfree < 0 || rightfree < 0)
		return;

	/* Feasible: record it as a candidate */
	cand = &state->candidates[state->ncandidates++];
	cand->leftfree = leftfree;
	cand->rightfree = rightfree;
	cand->firstoldonright = firstoldonright;
	cand->newitemonleft = newitemonleft;
	cand->lastleft_tuple = lastleftitem;
	cand->firstright_tuple = firstrightitem;
}

/*
 * Recursive helper function, to find the best split location among the
 * candidates in given range (inclusive).
 *
 * The best candidate is one that needs the smallest number of "distinguishing"
 * keys.  The caller already calculated that, in 'state.optimal_natts'.
 * We find the candidate with the smallest (or "good enough") delta value,
 * among the ones with the optimal number of distinguishing keys.
 *
 * We try to optimize for both properties at the same time.  First, we use
 * _bt_keep_natts_fast() to search for the position, or positions, where
 * there are enough distinguishing keys.  We do that recursively, like in a
 * binary search, and whenever we find a branch with duplicates, we skip
 * processing that half.  If a page only has a few places that satisfy the
 * "distinguishing keys" requirement, that zooms in quite quickly to
 * those locations.
 *
 * _bt_keep_natts_fast() is relatively expensive, though, so once we have
 * "zoomed in" to a small range (32 items or less), we scan the range
 * sequentially, looking for the smallest delta first, and only checking
 * _bt_keep_natts_fast() if the current candidate's delta is smaller than
 * the previous best.  That avoids wasting a lot of effort when almost
 * all split points satisfy the distinguishing keys requirement.
 */
static void
_bt_findbestsplitloc(FindSplitData *state, int left, int right, int level)
{
	SplitPoint *left_candidate = &state->candidates[left];
	SplitPoint *right_candidate = &state->candidates[right];
	int			center;
	int			this_natts;

	/*
	 * First check if there is any split point in this range, with the optimal
	 * distinguishness.  If not, skip this range.
	 *
	 * For example, if we found out earlier that there is a split point on the
	 * page such that the left and right halves would differ on the first key
	 * column, but all the values in the range we're considering have the same
	 * value on the first column, then there must be a better split point
	 * elsewhere.
	 */
	if (level > 0 && !state->all_duplicates)
	{
		this_natts = _bt_keep_natts_fast(state->rel,
										 left_candidate->lastleft_tuple,
										 right_candidate->firstright_tuple);

		if (this_natts > state->optimal_natts)
			return;
	}

	/*
	 * Once we have a sufficiently small range left, scan all the entries,
	 * looking for the split with best balance between the pages (i.e.
	 * smallest delta)
	 */
	if (right - left < 32)
	{
		int			i;

		for (i = left; i <= right; i++)
		{
			SplitPoint *candidate = &state->candidates[i];
			int			leftfree = candidate->leftfree;
			int			rightfree = candidate->rightfree;
			double		delta;

			/* Weighted delta is zero when leftfree/rightfree hit the goal */
			if (state->is_weighted)
				delta = state->propfullonleft * leftfree -
					(1.0 - state->propfullonleft) * rightfree;
			else
				delta = leftfree - rightfree;

			if (delta < 0)
				delta = -delta;

			if (delta < state->best_delta)
			{
				/*
				 * This candidate is better balanced than the previous best.
				 * But is it optimally distinguishing?  (Checked lazily, here,
				 * because _bt_keep_natts_fast() is relatively expensive.)
				 */
				bool		distinguishing_enough;

				if (state->all_duplicates)
					distinguishing_enough = true;
				else
				{
					this_natts = _bt_keep_natts_fast(state->rel,
													 candidate->lastleft_tuple,
													 candidate->firstright_tuple);
					distinguishing_enough = (this_natts <= state->optimal_natts);
				}

				if (distinguishing_enough)
				{
					state->best_delta = delta;
					state->best_candidate = candidate;

					/* Good enough already?  Then stop searching. */
					if (state->best_delta <= state->goodenough)
						return;
				}
			}
		}
	}
	else
	{
		/*
		 * Recurse into both halves.  As an optimization, use the goal
		 * fillfactor as a guide on which half to recurse first.  For example,
		 * if we're aiming for a 50/50 split, try the splits around the
		 * midpoint first.  If we manage to find a "good enough" split early,
		 * we can skip scanning the rest.  This is approximate, as we look for
		 * the midpoint of the split candidates, rather than free space, but
		 * it's close in the common case that the tuples are roughly the same
		 * size.
		 */
		center = (left + right) / 2;

		if (state->ncandidates * state->propfullonleft < center)
		{
			_bt_findbestsplitloc(state, left, center, level + 1);

			if (state->best_delta <= state->goodenough)
				return;

			_bt_findbestsplitloc(state, center + 1, right, level + 1);
		}
		else
		{
			_bt_findbestsplitloc(state, center + 1, right, level + 1);

			if (state->best_delta <= state->goodenough)
				return;

			_bt_findbestsplitloc(state, left, center, level + 1);
		}
	}
}
