From 7a0dd644a1b26cc3c80469aea0e2c4edc3d86f8a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v11 5/7] Add split-at-new-tuple page split optimization.

Add additional heuristics to the algorithm for locating an optimal split
location.  New logic identifies localized monotonically increasing values by
recognizing cases where a newly inserted tuple has a heap TID that's slightly
greater than that of the existing tuple to the immediate left, but isn't just
a duplicate.  It can greatly help space utilization to split between two
groups of localized monotonically increasing values.

Without this patch, affected cases will reliably leave leaf pages no more
than about 50% full.  50/50 page splits are only appropriate with a pattern
of truly random insertions.  The optimization is very similar to the long
established fillfactor optimization used during rightmost page splits, where
we usually leave the new left side of the split 90% full.  Split-at-new-tuple
page splits target essentially the same case.  The splits targeted are those
at the rightmost point of a localized grouping of values, rather than those
at the rightmost point of the entire key space.

This enhancement is very effective at avoiding index bloat when initial bulk
INSERTs for the TPC-C benchmark are run, and throughout the TPC-C benchmark.
The TPC-C issue has been independently observed and reported on [1].
Evidently, the primary keys for all of the largest indexes in the TPC-C
schema are populated through localized, monotonically increasing values:

Master
======

order_line_pkey: 774 MB
stock_pkey: 181 MB
idx_customer_name: 107 MB
oorder_pkey: 78 MB
customer_pkey: 75 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 60 MB
new_order_pkey: 22 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Patch series, up to and including this commit
=============================================

order_line_pkey: 451 MB
stock_pkey: 114 MB
idx_customer_name: 105 MB
oorder_pkey: 45 MB
customer_pkey: 48 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 61 MB
new_order_pkey: 13 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Without this patch, but with all previous patches in the series, a much more
modest reduction in the volume of bloat occurs when the same test case is
run: the size of the largest index (the order line primary key) is reduced by
~5% of its original size, whereas we see a reduction of ~42% here.

The problem can easily be recreated by bulk loading using BenchmarkSQL (a
fair-use TPC-C implementation) while avoiding building indexes with CREATE
INDEX [2].  Note that the patch series generally has less of an advantage
over master if the indexes are initially built with CREATE INDEX (use my fork
of BenchmarkSQL [3] to run a TPC-C benchmark while avoiding having CREATE
INDEX mask the problems on the master branch).
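To make the effect of the heuristics concrete, here is a minimal,
illustrative-only C sketch (not part of the patch) of the proportion of the
original page that the new logic aims to leave filled on the left half
("propfullonleft") in each of the three cases it recognizes.  The constants
are assumptions chosen for the example: a leaf fillfactor of 90, a
BTREE_MIN_FILLFACTOR value of 10, a page already holding maxoff = 200 items,
and an interior insertion at offset 150.

    #include <stdio.h>

    #define Max(x, y)            ((x) > (y) ? (x) : (y))
    #define LEAF_FILLFACTOR      90   /* assumed default leaf fillfactor */
    #define BTREE_MIN_FILLFACTOR 10   /* assumed value of the nbtree constant */

    int
    main(void)
    {
        int     maxoff = 200;     /* assumed number of items already on the page */
        int     newitemoff = 150; /* assumed interior insertion offset */

        /* New item after the last offset: infer an ascending insertion pattern */
        double  ascending = (double) LEAF_FILLFACTOR / 100.0;

        /* New item at the first offset: infer a descending insertion pattern */
        double  descending = (double) Max(100 - LEAF_FILLFACTOR,
                                          BTREE_MIN_FILLFACTOR) / 100.0;

        /* Interior insertion with an adjacent heap TID: split at the new item */
        double  interior = (double) newitemoff / ((double) maxoff + 1);

        printf("ascending %.2f, descending %.2f, interior %.2f\n",
               ascending, descending, interior);
        return 0;
    }

With these assumptions the sketch prints 0.90, 0.10 and 0.75: the ascending
and interior cases pack the new left page densely, while the descending case
leaves it mostly empty for the smaller values expected to arrive next.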
[1] https://www.commandprompt.com/blog/postgres_autovacuum_bloat_tpc-c
[2] https://bitbucket.org/openscg/benchmarksql/issues/6/making-it-easier-to-recreate-postgres-tpc
[3] https://github.com/petergeoghegan/benchmarksql/tree/nbtree-customizations
---
 src/backend/access/nbtree/nbtsplitloc.c | 184 ++++++++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index 7f337bac55..3edf97bfeb 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -62,6 +62,9 @@ static OffsetNumber _bt_dofindsplitloc(Relation rel, Page page,
 static int	_bt_checksplitloc(FindSplitData *state,
 				  OffsetNumber firstoldonright, bool newitemonleft,
 				  int dataitemstoleft, Size firstoldonrightsz);
+static bool _bt_splitatnewitem(Relation rel, Page page, int leaffillfactor,
+				   OffsetNumber newitemoff, IndexTuple newitem,
+				   double *propfullonleft);
 static OffsetNumber _bt_bestsplitloc(Relation rel, Page page,
 				  FindSplitData *state,
 				  int perfectpenalty,
@@ -72,6 +75,7 @@ static int _bt_perfect_penalty(Relation rel, Page page, FindSplitData *state,
 				  SplitMode *secondmode);
 static int _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 				  IndexTuple newitem, SplitPoint *split, bool is_leaf);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 
 
 /*
@@ -243,6 +247,12 @@ _bt_dofindsplitloc(Relation rel,
 		state.propfullonleft = leaffillfactor / 100.0;
 		state.is_weighted = true;
 	}
+	else if (_bt_splitatnewitem(rel, page, leaffillfactor, newitemoff,
+								newitem, &state.propfullonleft))
+	{
+		/* propfullonleft was set for us */
+		state.is_weighted = true;
+	}
 	else
 	{
 		/* propfullonleft won't be used, but be tidy */
@@ -540,6 +550,152 @@ _bt_checksplitloc(FindSplitData *state,
 	return INT_MAX;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split at the
+ * point that the new/incoming item would have been inserted, leaving the
+ * incoming tuple as the last tuple on the new left page.  When the new item
+ * is at the first or last offset, a fillfactor is applied so that space
+ * utilization is comparable to the traditional rightmost split case.
+ *
+ * This routine targets splits in composite indexes that consist of one or
+ * more leading columns that describe some grouping, plus a trailing column
+ * with ascending (or descending) values.  This pattern is prevalent in real
+ * world applications.  Consider the example of a composite index on
+ * (supplier_id, invoice_id), where there are a small, nearly-fixed number of
+ * suppliers, and invoice_id is an identifier assigned in ascending order (it
+ * doesn't matter whether or not suppliers are assigned invoice_id values
+ * from the same counter, or their own counter).  Without this optimization,
+ * approximately 50% of space in leaf pages will be wasted by
+ * unweighted/50:50 page splits.  With this optimization, space utilization
+ * will be close to that of a similar index where all tuple insertions
+ * modify the current rightmost leaf page in the index.
+ *
+ * This optimization may leave extra free space remaining on the rightmost
+ * page of a "most significant column" grouping of tuples if that grouping
+ * never ends up having future insertions that use the free space.  Testing
+ * has shown the effect to be self-limiting; a future grouping that becomes
+ * the "nearest on the right" grouping of the affected grouping usually puts
+ * the extra free space to good use instead.
+ *
+ * Caller uses propfullonleft rather than using the new item offset directly
+ * because not all offsets will be deemed legal as split points.  This also
+ * allows us to apply leaf fillfactor in the common case where the new
+ * insertion is after the last offset (or at the first offset).
+ */
+static bool
+_bt_splitatnewitem(Relation rel, Page page, int leaffillfactor,
+				   OffsetNumber newitemoff, IndexTuple newitem,
+				   double *propfullonleft)
+{
+	OffsetNumber maxoff;
+	int16		nkeyatts;
+	ItemId		itemid;
+	IndexTuple	tup;
+	Size		tupspace;
+	Size		hikeysize;
+	int			keepnatts;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Proceed only when items on page look fairly short */
+	if (maxoff < MaxIndexTuplesPerPage / 2)
+		return false;
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+	/* Single key indexes not considered here */
+	if (nkeyatts == 1)
+		return false;
+
+	/*
+	 * Avoid applying the optimization when tuples are not all of uniform
+	 * size, with the exception of the high key (the existing high key may
+	 * be smaller due to truncation).  Surmise that the page has equisized
+	 * tuples when the page layout is consistent with having maxoff-1
+	 * non-pivot tuples that are all the same size as the newly inserted
+	 * tuple.
+	 */
+	tupspace = ((PageHeader) page)->pd_special - ((PageHeader) page)->pd_upper;
+	Assert(!P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)));
+	itemid = PageGetItemId(page, P_HIKEY);
+	hikeysize = ItemIdGetLength(itemid);
+	if (IndexTupleSize(newitem) * (maxoff - 1) != tupspace - hikeysize)
+		return false;
+
+	/*
+	 * At least the first attribute's value must be equal to the
+	 * corresponding value in the antecedent tuple to apply the
+	 * optimization.  The new item cannot be a duplicate, either.
+	 */
+	if (newitemoff == P_FIRSTKEY)
+	{
+		/* Try to infer descending insertion pattern */
+		itemid = PageGetItemId(page, P_FIRSTKEY);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+		keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+		{
+			*propfullonleft = (double) Max(100 - leaffillfactor,
+										   BTREE_MIN_FILLFACTOR) / 100.0;
+			return true;
+		}
+
+		return false;
+	}
+	else if (newitemoff > maxoff)
+	{
+		/* Try to infer ascending insertion pattern */
+		itemid = PageGetItemId(page, maxoff);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+		keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+		{
+			*propfullonleft = (double) leaffillfactor / 100.0;
+			return true;
+		}
+
+		return false;
+	}
+
+	/*
+	 * When the item isn't first or last on the page, try to infer an
+	 * ascending insertion pattern.  We try to split at the precise point of
+	 * the insertion here, rather than applying leaf fillfactor.
+	 *
+	 * "Low cardinality leading column, high cardinality suffix column"
+	 * indexes with a random insertion pattern (e.g. an index on
+	 * '(country_id, event_uuid)') may sometimes end up having the
+	 * optimization applied instead of getting a 50:50 (unweighted) page
+	 * split.  This is suboptimal.
+	 *
+	 * We're willing to accept that outcome when an incoming/new tuple is
+	 * either to the left or to the right of all existing items on the page,
+	 * since that's expected for less than 1% of all page splits that occur
+	 * in the index's lifetime (assuming default BLCKSZ).
+	 * More care must be taken here, where we consider splits involving the
+	 * new item being inserted at neither edge of the page: we proceed only
+	 * when the new item's heap TID is "adjacent" to the heap TID of the
+	 * existing tuple to the immediate left of the offset for the new item.
+	 * Heap TID adjacency strongly suggests that the item just to the left
+	 * was inserted very recently.
+	 */
+	itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff));
+	tup = (IndexTuple) PageGetItem(page, itemid);
+	if (!_bt_adjacenthtid(&tup->t_tid, &newitem->t_tid))
+		return false;
+	/* Also check the usual conditions */
+	keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+	if (keepnatts > 1 && keepnatts <= nkeyatts)
+	{
+		*propfullonleft = (double) newitemoff / ((double) maxoff + 1);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
@@ -820,3 +976,31 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 	Assert(lastleft != firstright);
 	return _bt_keep_natts_fast(rel, lastleft, firstright);
 }
+
+/*
+ * Subroutine for determining whether two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted
+ * into the heap relation immediately after the low TID, probably by the
+ * same transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+
+	/* Make optimistic assumption of adjacency when heap blocks match */
+	if (lowblk == highblk)
+		return true;
+
+	/* When heap block is one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk &&
+		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
-- 
2.17.1
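
As a supplementary illustration of the heap TID adjacency test added by the
patch, the following standalone C sketch mirrors the logic of
_bt_adjacenthtid() using plain block/offset pairs instead of PostgreSQL's
ItemPointerData.  The struct, function name, constant, and the tiny test
harness are invented for the example; this is not code from the patch.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FIRST_OFFSET_NUMBER 1   /* stands in for FirstOffsetNumber */

    typedef struct
    {
        uint32_t    block;          /* heap block number */
        uint16_t    offset;         /* 1-based offset within the block */
    } HeapTid;

    /* Mirrors _bt_adjacenthtid(): is "high" likely inserted right after "low"? */
    static bool
    adjacent_htid(HeapTid low, HeapTid high)
    {
        /* Optimistically assume adjacency when the heap blocks match */
        if (low.block == high.block)
            return true;

        /* Heap block one up: the high TID must be the first item of its block */
        if (low.block + 1 == high.block && high.offset == FIRST_OFFSET_NUMBER)
            return true;

        return false;
    }

    int
    main(void)
    {
        HeapTid     a = {100, 17};
        HeapTid     b = {100, 18};  /* same block: adjacent */
        HeapTid     c = {101, 1};   /* next block, first offset: adjacent */
        HeapTid     d = {101, 5};   /* next block, later offset: not adjacent */

        printf("%d %d %d\n", adjacent_htid(a, b), adjacent_htid(a, c),
               adjacent_htid(a, d));
        return 0;
    }

With these made-up TIDs the sketch prints "1 1 0", matching the two ways the
patch treats TIDs as adjacent: same heap block, or the immediately following
heap block with the high TID at its first offset.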