From 4fd6fa5c21b79f56f5d3f8f8881778a3d8fb82c5 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 25 Sep 2019 10:08:53 -0700 Subject: [PATCH v20 1/2] Add deduplication to nbtree --- contrib/amcheck/verify_nbtree.c | 164 ++++- src/backend/access/common/reloptions.c | 11 +- src/backend/access/index/genam.c | 4 + src/backend/access/nbtree/README | 74 +- src/backend/access/nbtree/nbtinsert.c | 860 +++++++++++++++++++++++- src/backend/access/nbtree/nbtpage.c | 211 +++++- src/backend/access/nbtree/nbtree.c | 175 ++++- src/backend/access/nbtree/nbtsearch.c | 244 ++++++- src/backend/access/nbtree/nbtsort.c | 144 +++- src/backend/access/nbtree/nbtsplitloc.c | 49 +- src/backend/access/nbtree/nbtutils.c | 326 ++++++++- src/backend/access/nbtree/nbtxlog.c | 222 +++++- src/backend/access/rmgrdesc/nbtdesc.c | 28 +- src/include/access/nbtree.h | 319 ++++++++- src/include/access/nbtxlog.h | 68 +- src/include/access/rmgrlist.h | 2 +- src/tools/valgrind.supp | 21 + 17 files changed, 2732 insertions(+), 190 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 05e7d678ed..bdb0ede577 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_logical_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -419,12 +420,13 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. 
That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxPostingIndexTuplesPerPage / 3 "logical" tuples. heapallindexed + * verification fingerprints posting list heap TIDs as plain non-pivot + * tuples, complete with index keys. This allows its heap scan to + * behave as if posting lists do not exist. */ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxPostingIndexTuplesPerPage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +926,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -994,29 +997,73 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. 
*/ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { char *itid, *htid; + ItemPointer tid = BTreeTupleGetHeapTID(itup); itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. + */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1074,12 +1121,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - 
bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_logical_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1154,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,6 +1195,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1150,6 +1221,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1164,11 +1236,13 @@ bt_target_page_check(BtreeCheckState *state) *htid, *nitid, *nhtid; + ItemPointer tid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1251,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1189,10 +1265,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1953,10 +2029,10 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). 
- * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. */ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2045,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2110,30 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "logical" tuple for nth posting list entry. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple logical index tuples are folded together into one + * physical posting list index tuple when convenient. + * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "logical" + * tuples. Each logical tuple must be fingerprinted separately -- there must + * be one logical tuple for each corresponding Bloom filter probe during the + * heap scan. + * + * Note: Caller needs to call bt_normalize_tuple() with returned tuple. 
+ */
+static inline IndexTuple
+bt_posting_logical_tuple(IndexTuple itup, int n)
+{
+	Assert(BTreeTupleIsPosting(itup));
+
+	/* Returns non-posting-list tuple */
+	return BTreeFormPostingTuple(itup, BTreeTupleGetPostingN(itup, n), 1);
+}
+
 /*
  * Search for itup in index, starting from fast root page.  itup must be a
  * non-pivot tuple.  This is only supported with heapkeyspace indexes, since
@@ -2087,6 +2190,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		insertstate.itup = itup;
 		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
 		insertstate.itup_key = key;
+		insertstate.postingoff = 0;
 		insertstate.bounds_valid = false;
 		insertstate.buf = lbuf;
 
@@ -2094,7 +2198,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		offnum = _bt_binsrch_insert(state->rel, &insertstate);
 		/* Compare first >= matching item on leaf page, if any */
 		page = BufferGetPage(lbuf);
+		/* Should match on first heap TID when tuple has a posting list */
 		if (offnum <= PageGetMaxOffsetNumber(page) &&
+			insertstate.postingoff <= 0 &&
 			_bt_compare(state->rel, key, page, offnum) == 0)
 			exists = true;
 		_bt_relbuf(state->rel, lbuf);
@@ -2560,14 +2666,18 @@ static inline ItemPointer
 BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
 							bool nonpivot)
 {
-	ItemPointer result = BTreeTupleGetHeapTID(itup);
+	ItemPointer result;
 	BlockNumber targetblock = state->targetblock;
 
-	if (result == NULL && nonpivot)
+	/* Shouldn't be called with !heapkeyspace index */
+	Assert(state->heapkeyspace);
+	if (BTreeTupleIsPivot(itup) == nonpivot)
 		ereport(ERROR,
 				(errcode(ERRCODE_INDEX_CORRUPTED),
 				 errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
 						targetblock, RelationGetRelationName(state->rel))));
 
+	result = BTreeTupleGetHeapTID(itup);
+
 	return result;
 }
 
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index b5072c00fe..e6448e4a86 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c @@ -158,6 +158,15 @@ static relopt_bool boolRelOpts[] = }, true }, + { + { + "deduplication", + "Enables deduplication on btree index leaf pages", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + true + }, /* list terminator */ {{NULL}} }; @@ -1513,8 +1522,6 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) offsetof(StdRdOptions, user_catalog_table)}, {"parallel_workers", RELOPT_TYPE_INT, offsetof(StdRdOptions, parallel_workers)}, - {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}, {"vacuum_index_cleanup", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, vacuum_index_cleanup)}, {"vacuum_truncate", RELOPT_TYPE_BOOL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 2599b5d342..6e1dc596e1 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 6db203e75c..54cb9db49d 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). 
Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -710,6 +713,75 @@ the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid or at least delay page splits. Deduplication alters +the physical representation of tuples without changing the logical contents +of the index, and without adding overhead to read queries. Non-pivot +tuples are folded together into a single physical tuple with a posting list +(a simple array of heap TIDs with the standard item pointer format). +Deduplication is always applied lazily, at the point where it would +otherwise be necessary to perform a page split. It occurs only when +LP_DEAD items have been removed, as our last line of defense against +splitting a leaf page. We can set the LP_DEAD bit with posting list +tuples, though only when all table tuples are known dead. (Bitmap scans +cannot perform LP_DEAD bit setting, and are the common case with indexes +that contain lots of duplicates, so this downside is considered +acceptable.) + +Large groups of logical duplicates tend to appear together on the same leaf +page due to the special duplicate logic used when choosing a split point. +This facilitates lazy/dynamic deduplication. Deduplication can reliably +deduplicate a large localized group of duplicates before it can span +multiple leaf pages. 
Posting list tuples are subject to the same 1/3 of a +page restriction as any other tuple. + +Lazy deduplication allows the page space accounting used during page splits +to have absolutely minimal special case logic for posting lists. A posting +list can be thought of as extra payload that suffix truncation will +reliably truncate away as needed during page splits, just like non-key +columns from an INCLUDE index tuple. An incoming tuple (which might cause +a page split) can always be thought of as a non-posting-list tuple that +must be inserted alongside existing items, without needing to consider +deduplication. Most of the time, that's what actually happens: incoming +tuples are either not duplicates, or are duplicates with a heap TID that +doesn't overlap with any existing posting list tuple. When the incoming +tuple really does overlap with an existing posting list, a posting list +split is performed. Posting list splits work in a way that more or less +preserves the illusion that all incoming tuples do not need to be merged +with any existing posting list tuple. + +Posting list splits work by "overriding" the details of the incoming tuple. +The heap TID of the incoming tuple is altered to make it match the +rightmost heap TID from the existing/originally overlapping posting list. +The offset number that the new/incoming tuple is to be inserted at is +incremented so that it will be inserted to the right of the existing +posting list. The insertion (or page split) operation that completes the +insert does one extra step: an in-place update of the posting list. The +update changes the posting list such that the "true" heap TID from the +original incoming tuple is now contained in the posting list. We make +space in the posting list by removing the heap TID that became the new +item. The size of the posting list won't change, and so the page split +space accounting does not need to care about posting lists. 
Also, overall +space utilization is improved by keeping existing posting lists large. + +The representation of posting lists is identical to the posting lists used +by GIN, so it would be straightforward to apply GIN's varbyte encoding +compression scheme to individual posting lists. Posting list compression +would break the assumptions made by posting list splits about page space +accounting, though, so it's not clear how compression could be integrated +with nbtree. Besides, posting list compression does not offer a compelling +trade-off for nbtree, since in general nbtree is optimized for consistent +performance with many concurrent readers and writers. A major goal of +nbtree's lazy approach to deduplication is to limit the performance impact +of deduplication with random updates. Even concurrent append-only inserts +of the same key value will tend to have inserts of individual index tuples +in an order that doesn't quite match heap TID order. In general, delaying +deduplication avoids many unnecessary posting list splits, and minimizes +page level fragmentation. 
+ Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b84bf1c3df..3d213dfd2d 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -47,21 +47,27 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, OffsetNumber postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); +static void _bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. + * By here, itup is filled in, including the TID. Caller should be + * prepared for us to scribble on 'itup'. * * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this * will allow duplicates. 
Otherwise (UNIQUE_CHECK_YES or @@ -123,6 +129,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, /* PageAddItem will MAXALIGN(), but be consistent */ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = itup_key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; @@ -300,7 +307,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -428,14 +435,36 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, if (!ItemIdIsDead(curitemid)) { ItemPointerData htid; + bool posting; bool all_dead; + bool posting_all_dead; + int npost; + if (_bt_compare(rel, itup_key, page, offset) != 0) break; /* we're past all the equal tuples */ /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + + if (!BTreeTupleIsPosting(curitup)) + { + htid = curitup->t_tid; + posting = false; + posting_all_dead = true; + } + else + { + posting = true; + /* Initial assumption */ + posting_all_dead = true; + } + + npost = 0; + doposttup: + if (posting) + htid = *BTreeTupleGetPostingN(curitup, npost); + /* * If we are doing a recheck, we expect to find the tuple we @@ -446,6 +475,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, ItemPointerCompare(&htid, &itup->t_tid) == 0) { found = true; + posting_all_dead = false; + if (posting) + goto nextpost; } /* @@ -511,8 +543,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. 
*/ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -570,7 +601,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && !posting) { /* * The conflicting tuple (or whole HOT chain) is dead to @@ -589,6 +620,35 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + else if (posting) + { + nextpost: + if (!all_dead) + posting_all_dead = false; + + /* Iterate over single posting list tuple */ + npost++; + if (npost < BTreeTupleGetNPosting(curitup)) + goto doposttup; + + /* + * Mark posting tuple dead if all hot chains whose root is + * contained in posting tuple have tuples that are all + * dead + */ + if (posting_all_dead) + { + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(insertstate->buf, true); + } + + /* Move on to next index tuple */ + } } } @@ -689,6 +749,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber location; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -751,13 +812,26 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, and if the index + * deduplication is both possible and enabled, try deduplication. 
*/ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + } + + if (insertstate->itup_key->dedup_is_possible && + BtreeGetDoDedupOption(rel) && + PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -839,7 +913,38 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Insertion is not prepared for the case where an LP_DEAD posting list + * tuple must be split. In the unlikely event that this happens, call + * _bt_dedup_one_page() to force it to kill all LP_DEAD items. + */ + if (unlikely(insertstate->postingoff == -1)) + { + Assert(insertstate->itup_key->dedup_is_possible); + + /* + * Don't check if the option is enabled, since no actual deduplication + * will be done, just cleanup. 
+ */ + _bt_dedup_one_page(rel, insertstate->buf, heapRel, insertstate->itup, + 0, checkingunique); + Assert(!P_HAS_GARBAGE(lpageop)); + + /* Must reset insertstate ahead of new _bt_binsrch_insert() call */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Might still have to split some other posting list now, but that + * should never be LP_DEAD + */ + Assert(insertstate->postingoff >= 0); + } + + return location; } /* @@ -900,15 +1005,81 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) insertstate->bounds_valid = false; } +/* + * Form a new posting list during a posting split. + * + * If caller determines that its new tuple 'newitem' is a duplicate with a + * heap TID that falls inside the range of an existing posting list tuple + * 'oposting', it must generate a new posting tuple to replace the original. + * The new posting list is guaranteed to be the same size as the original. + * Caller must also change newitem to have the heap TID of the rightmost TID + * in the original posting list. Both steps are always handled by calling + * here. + * + * Returns new posting list palloc()'d in caller's context. Also modifies + * caller's newitem to contain final/effective heap TID, which is what caller + * actually inserts on the page. + * + * Exported for use by recovery. Note that recovery path must recreate the + * same version of newitem that is passed here on the primary, even though + * that differs from the final newitem actually added to the page. This + * optimization avoids explicit WAL-logging of entire posting lists, which + * tend to be rather large. 
+ */
+IndexTuple
+_bt_posting_split(IndexTuple newitem, IndexTuple oposting,
+				  OffsetNumber postingoff)
+{
+	int			nhtids;
+	char	   *replacepos;
+	char	   *rightpos;
+	Size		nbytes;
+	IndexTuple	nposting;
+
+	Assert(BTreeTupleIsPosting(oposting));
+	nhtids = BTreeTupleGetNPosting(oposting);
+	Assert(postingoff < nhtids);
+
+	nposting = CopyIndexTuple(oposting);
+	replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+	rightpos = replacepos + sizeof(ItemPointerData);
+	nbytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+
+	/*
+	 * Move item pointers in posting list to make a gap for the new item's
+	 * heap TID (shift TIDs one place to the right, losing original rightmost
+	 * TID).
+	 */
+	memmove(rightpos, replacepos, nbytes);
+
+	/*
+	 * Fill the gap with the TID of the new item.
+	 */
+	ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+	/*
+	 * Copy original posting list's (oposting's, not nposting's) last TID
+	 * into new item
+	 */
+	ItemPointerCopy(BTreeTupleGetPostingN(oposting, nhtids - 1),
+					&newitem->t_tid);
+	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+							  BTreeTupleGetHeapTID(newitem)) < 0);
+	Assert(BTreeTupleGetNPosting(nposting) == BTreeTupleGetNPosting(oposting));
+
+	return nposting;
+}
+
 /*----------
  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
  *
  *		This recursive procedure does the following things:
  *
+ *			+ if necessary, splits an existing posting list on page.
+ *			  This is only needed when 'postingoff' is non-zero.
  *			+ if necessary, splits the target page, using 'itup_key' for
  *			  suffix truncation on leaf pages (caller passes NULL for
  *			  non-leaf pages).
- *			+ inserts the tuple.
+ *			+ inserts the new tuple (could be from split posting list).
  *			+ if the page was split, pops the parent stack, and finds the
  *			  right place to insert the new child pointer (by walking
  *			  right using information stored in the parent stack).
@@ -918,7 +1089,8 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, - * we will have dropped both the pin and the lock on the buffer. + * we will have dropped both the pin and the lock on the buffer. Caller + * should be prepared for us to scribble on 'itup'. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high @@ -936,11 +1108,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1130,8 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + /* retail insertions of posting list tuples are disallowed */ + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1142,46 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple, so split posting list. + * + * Posting list splits always replace some existing TID in the posting + * list with the new item's heap TID (based on a posting list offset + * from caller) by removing rightmost heap TID from posting list. 
The + * new item's heap TID is swapped with that rightmost heap TID, almost + * as if the tuple inserted never overlapped with a posting list in + * the first place. This allows the insertion and page split code to + * have minimal special case handling of posting lists. + * + * The only extra handling required is to overwrite the original + * posting list with nposting, which is guaranteed to be the same size + * as the original, keeping the page space accounting simple. This + * takes place in either the page insert or page split critical + * section. + */ + Assert(P_ISLEAF(lpageop)); + Assert(!ItemIdIsDead(itemid)); + Assert(postingoff > 0); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* save a copy of itup with unchanged TID to write it into xlog record */ + origitup = CopyIndexTuple(itup); + nposting = _bt_posting_split(itup, oposting, postingoff); + + Assert(BTreeTupleGetNPosting(nposting) == + BTreeTupleGetNPosting(oposting)); + /* Alter new item offset, since effective new item changed */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? 
* @@ -996,7 +1214,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1075,6 +1294,18 @@ _bt_insertonpg(Relation rel, elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); + if (nposting) + { + /* + * Posting list split requires an in-place update of the existing + * posting list + */ + Assert(P_ISLEAF(lpageop)); + Assert(MAXALIGN(IndexTupleSize(oposting)) == + MAXALIGN(IndexTupleSize(nposting))); + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + } + MarkBufferDirty(buf); if (BufferIsValid(metabuf)) @@ -1116,6 +1347,7 @@ _bt_insertonpg(Relation rel, XLogRecPtr recptr; xlrec.offnum = itup_off; + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); @@ -1144,6 +1376,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1152,7 +1385,19 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + /* + * We always write newitem to the page, but when there is an + * original newitem due to a posting list split then we log the + * original item instead. REDO routine must reconstruct the final + * newitem at the same time it reconstructs nposting. 
+ */ + if (postingoff == 0) + XLogRegisterBufData(0, (char *) itup, + IndexTupleSize(itup)); + else + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1439,13 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + pfree(nposting); + pfree(origitup); + } } /* @@ -1209,12 +1461,25 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * newitem and nposting are replacements for orignewitem and the + * existing posting list on the page respectively. These extra + * posting list split details are used here in the same way as they + * are used in the more common case where a posting list split does + * not coincide with a page split. We need to deal with posting list + * splits directly in order to ensure that everything that follows + * from the insert of orignewitem is handled as a single atomic + * operation (though caller's insert of a new pivot/downlink into + * parent page will still be a separate operation). + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. 
*/ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, OffsetNumber postingoff) { Buffer rbuf; Page origpage; @@ -1236,12 +1501,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; + OffsetNumber replacepostingoff = InvalidOffsetNumber; bool newitemonleft, isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* + * Determine offset number of existing posting list on page when a split + * of a posting list needs to take place as the page is split + */ + if (nposting != NULL) + { + Assert(itup_key->heapkeyspace); + replacepostingoff = OffsetNumberPrev(newitemoff); + } + /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back @@ -1273,6 +1549,13 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitemoff == firstright. In all other cases it's clear which side of * the split every tuple goes on from context. newitemonleft is usually * (but not always) redundant information. + * + * Note: In theory, the split point choice logic should operate against a + * version of the page that already replaced the posting list at offset + * replacepostingoff with nposting where applicable. We don't bother with + * that, though. Both versions of the posting list must be the same size, + * and both will have the same base tuple key values, so split point + * choice is never affected. 
*/ firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); @@ -1340,6 +1623,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (firstright == replacepostingoff) + item = nposting; } /* @@ -1373,6 +1659,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (lastleftoff == replacepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1480,8 +1769,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* + * did caller pass new replacement posting list tuple due to posting + * list split? + */ + if (i == replacepostingoff) + { + /* + * swap origpage posting list with post-posting-list-split version + * from caller + */ + Assert(isleaf); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? 
*/ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1650,8 +1954,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = InvalidOffsetNumber; + if (replacepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1670,11 +1978,46 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in record, though. + * + * The details are often slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff isn't set in the WAL record, so + * recovery can't even tell the difference). Otherwise, we set + * postingoff and log orignewitem instead of newitem, despite having + * actually inserted newitem. Recovery must reconstruct nposting and + * newitem by repeating the actions of our caller (i.e. by passing + * original posting list and orignewitem to _bt_posting_split()). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem despite newitem going on the + * right page. 
If XLogInsert decides that it can omit orignewitem due + * to logging a full-page image of the left page, everything still + * works out, since recovery only needs to log orignewitem for items + * on the left page (just like the regular newitem-logged case). */ - if (newitemonleft) - XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + if (newitemonleft || xlrec.postingoff != InvalidOffsetNumber) + { + if (xlrec.postingoff == InvalidOffsetNumber) + { + /* Must WAL-log newitem, since it's on left page */ + Assert(newitemonleft); + Assert(orignewitem == NULL && nposting == NULL); + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + } + else + { + /* Must WAL-log orignewitem following posting list split */ + Assert(newitemonleft || firstright == newitemoff); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + XLogRegisterBufData(0, (char *) orignewitem, + MAXALIGN(IndexTupleSize(orignewitem))); + } + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1834,7 +2177,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2190,6 +2533,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2304,6 +2648,472 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. 
+ * the page (or when deduplication runs). */ } + +/* + * Try to deduplicate items to free at least enough space to avoid a page + * split. This function should be called after LP_DEAD items were removed by + * _bt_vacuum_one_page() to prevent a page split. (We'll have to kill LP_DEAD + * items here when the page's BTP_HAS_GARBAGE hint was not set, but that + * should be rare.) + * + * The strategy for !checkingunique callers is to perform as much + * deduplication as possible to free as much space as possible now, since + * making it harder to set LP_DEAD bits is considered an acceptable price for + * not having to deduplicate the same page many times. It is unlikely that + * the items on the page will have their LP_DEAD bit set in the future, since + * that hasn't happened before now (besides, entire posting lists can still + * have their LP_DEAD bit set). + * + * The strategy for checkingunique callers is rather different, since the + * overall goal is different. Deduplication cooperates with and enhances + * garbage collection, especially the LP_DEAD bit setting that takes place in + * _bt_check_unique(). Deduplication does as little as possible while still + * preventing a page split for caller, since it's less likely that posting + * lists will have their LP_DEAD bit set. Deduplication avoids creating new + * posting lists with only two heap TIDs, and also avoids creating new posting + * lists from an existing posting list. Deduplication is only useful when it + * delays a page split long enough for garbage collection to prevent the page + * split altogether. checkingunique deduplication can make all the difference + * in cases where VACUUM keeps up with dead index tuples, but "recently dead" + * index tuples are still numerous enough to cause page splits that are truly + * unnecessary. + * + * Note: If newitem contains NULL values in key attributes, caller will be + * !checkingunique even when rel is a unique index. 
The page in question will + * usually have many existing items with NULLs. + */ +static void +_bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, bool checkingunique) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + BTPageOpaque oopaque; + BTDedupState *state = NULL; + int natts = IndexRelationGetNumberOfAttributes(rel); + OffsetNumber deletable[MaxIndexTuplesPerPage]; + bool minimal = checkingunique; + int ndeletable = 0; + Size pagesaving = 0; + + oopaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* init deduplication state needed to build posting tuples */ + state = (BTDedupState *) palloc(sizeof(BTDedupState)); + state->rel = rel; + + state->maxitemsize = BTMaxItemSize(page); + state->newitem = newitem; + state->checkingunique = checkingunique; + /* Metadata about current pending posting list */ + state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + state->overlap = false; + /* Metadata about based tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Delete dead tuples if any. We cannot simply skip them in the cycle + * below, because it's necessary to generate special Xlog record + * containing such tuples to compute latestRemovedXid on a standby server + * later. + * + * This should not affect performance, since it only can happen in a rare + * situation when BTP_HAS_GARBAGE flag was not set and _bt_vacuum_one_page + * was not called, or _bt_vacuum_one_page didn't remove all dead items. 
+ */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemid)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + /* + * Skip duplication in rare cases where there were LP_DEAD items + * encountered here when that frees sufficient space for caller to + * avoid a page split + */ + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + if (PageGetFreeSpace(page) >= newitemsz) + { + pfree(state); + return; + } + + /* Continue with deduplication */ + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* Make sure that new page won't have garbage flag set */ + oopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + /* Conservatively size array */ + state->htids = palloc(state->maxitemsize); + + /* + * Iterate over tuples on the page, try to deduplicate them into posting + * lists and insert into new page. NOTE: It's essential to reassess the + * max offset on each iteration, since it will change as items are + * deduplicated. + */ +retry: + offnum = minoff; + while (offnum <= PageGetMaxOffsetNumber(page)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (state->nitems == 0) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > natts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * BTMaxItemSize() limit. Heap TID(s) for itup have been saved in + * state. 
The next iteration will also end up here if it's + * possible to merge the next tuple into the same pending posting + * list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * BTMaxItemSize() limit was reached. + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and update the page, otherwise, just + * reset the state and move on. + */ + pagesaving += _bt_dedup_finish_pending(buffer, state, + RelationNeedsWAL(rel)); + + /* + * When caller is a checkingunique caller and we have deduplicated + * enough to avoid a page split, do minimal deduplication. Don't + * prematurely deduplicate items that could still have their + * LP_DEAD bits set. + */ + if (minimal && pagesaving >= newitemsz) + break; + + /* Continue iteration from base tuple's offnum */ + offnum = state->baseoff; + } + + offnum = OffsetNumberNext(offnum); + } + + /* Handle the last item when pending posting list is not empty */ + if (state->nitems != 0) + pagesaving += _bt_dedup_finish_pending(buffer, state, + RelationNeedsWAL(rel)); + + if (state->checkingunique && pagesaving < newitemsz) + { + /* + * Try again. The second pass over the page may deduplicate items + * that were passed over the first time due to concerns about limiting + * the effectiveness of LP_DEAD bit setting within _bt_check_unique(). + * Note that we will still stop deduplicating as soon as enough space + * has been freed to avoid caller's page split. + * + * FIXME: Don't bother with this when it's clearly a total waste of + * time. Maybe don't do any checkingunique deduplication for the + * rightmost page, either. + */ + state->checkingunique = false; + state->alltupsize = 0; + state->nitems = 0; + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + goto retry; + } + + /* be tidy */ + pfree(state->htids); + pfree(state); +} + +/* + * Create a new pending posting list tuple based on caller's tuple. 
+ * + * Every tuple processed by the deduplication routines either becomes the base + * tuple for a posting list, or gets its heap TID(s) accepted into a pending + * posting list. A tuple that starts out as the base tuple for a posting list + * will only actually be rewritten within _bt_dedup_finish_pending() when + * there was at least one successful call to _bt_dedup_save_htid(). + * + * Exported for use by nbtsort.c and recovery. + */ +void +_bt_dedup_start_pending(BTDedupState *state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + + /* + * Copy heap TIDs from new base tuple for new candidate posting list into + * ipd array. Assume that we'll eventually create a new posting tuple by + * merging later tuples with this existing one, though we may not. + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, base, sizeof(ItemPointerData)); + state->nhtids = 1; + /* Save size of tuple without any posting list */ + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* Save size of tuple without any posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain size of all tuples (including line pointer overhead) to + * calculate space savings on page within _bt_dedup_finish_pending(). + * Also, save number of base tuple logical tuples so that we can save + * cycles in the common case where an existing posting list can't or won't + * be merged with other tuples on the page. 
+ */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->alltupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->interval.baseoff = state->baseoff; + state->overlap = false; + if (state->newitem) + { + /* Might overlap with new item -- mark it as possible if it is */ + if (BTreeTupleGetHeapTID(base) < BTreeTupleGetHeapTID(state->newitem)) + state->overlap = true; + } +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state has + * itup's heap TID(s) saved. When this is false, enlarging the pending + * posting list by the required amount would exceed the maxitemsize limit, so + * caller must finish the pending posting list tuple. (Generally itup becomes + * the base tuple of caller's new pending posting list). + * + * Exported for use by nbtsort.c and recovery. + */ +bool +_bt_dedup_save_htid(BTDedupState *state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over limit + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * + sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxitemsize) + return false; + + /* Don't merge existing posting lists with checkingunique */ + if (state->checkingunique && BTreeTupleIsPosting(state->base)) + return false; + if (state->checkingunique && nhtids > 1) + return false; + + if (state->overlap) + { + if (BTreeTupleGetMaxHeapTID(itup) > BTreeTupleGetHeapTID(state->newitem)) + { + /* + * newitem has heap TID in the range of the would-be new posting + * list. 
Avoid an immediate posting list split for caller. + */ + if (_bt_keep_natts_fast(state->rel, state->newitem, itup) > + IndexRelationGetNumberOfAttributes(state->rel)) + { + state->newitem = NULL; /* avoid unnecessary comparisons */ + return false; + } + } + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->alltupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. This is zero in the case + * where no deduplication was possible. + * + * Exported for use by recovery. + */ +Size +_bt_dedup_finish_pending(Buffer buffer, BTDedupState *state, bool need_wal) +{ + Size spacesaving = 0; + Page page = BufferGetPage(buffer); + int minimum = 2; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->interval.baseoff == state->baseoff); + + /* + * Only create a posting list when at least 3 heap TIDs will appear in the + * checkingunique case (checkingunique strategy won't merge existing + * posting list tuples, so we know that the number of items here must also + * be the total number of heap TIDs). Creating a new posting lists with + * only two heap TIDs won't even save enough space to fit another + * duplicate with the same key as the posting list. This is a bad + * trade-off if there is a chance that the LP_DEAD bit can be set for + * either existing tuple by putting off deduplication. 
+ * + * (Note that a second pass over the page can deduplicate the item if that + * is truly the only way to avoid a page split for checkingunique caller) + */ + Assert(!state->checkingunique || + state->nitems == 1 || state->nhtids == state->nitems); + if (state->checkingunique) + minimum = 3; + + if (state->nitems >= minimum) + { + IndexTuple final; + Size finalsz; + OffsetNumber offnum; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + + /* find all tuples that will be replaced with this new posting tuple */ + for (offnum = state->baseoff; + offnum < state->baseoff + state->nitems; + offnum = OffsetNumberNext(offnum)) + deletable[ndeletable++] = offnum; + + /* Form a tuple with a posting list */ + final = BTreeFormPostingTuple(state->base, state->htids, + state->nhtids); + finalsz = IndexTupleSize(final); + spacesaving = state->alltupsize - (finalsz + sizeof(ItemIdData)); + /* Must have saved some space */ + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + + /* Save final number of items for posting list */ + state->interval.nitems = state->nitems; + + Assert(finalsz <= state->maxitemsize); + Assert(finalsz == MAXALIGN(IndexTupleSize(final))); + + START_CRIT_SECTION(); + + /* Delete items to replace */ + PageIndexMultiDelete(page, deletable, ndeletable); + /* Insert posting tuple */ + if (PageAddItem(page, (Item) final, finalsz, state->baseoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + MarkBufferDirty(buffer); + + /* Log deduplicated items */ + if (need_wal) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.baseoff = state->interval.baseoff; + xlrec_dedup.nitems = state->interval.nitems; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + pfree(final); + } + + 
/* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + + return spacesaving; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 268f869a36..c08f850595 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -42,12 +43,17 @@ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BlockNumber *target, BlockNumber *rightsib); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); +static TransactionId _bt_compute_xid_horizon_for_tuples(Relation rel, + Relation heapRel, + Buffer buf, + OffsetNumber *itemnos, + int nitems); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool dedup_is_possible) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +69,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_dedup_is_possible = dedup_is_possible; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -213,6 +220,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.fastlevel = metad->btm_fastlevel; md.oldest_btpo_xact = oldestBtpoXact; md.last_cleanup_num_heap_tuples = numHeapTuples; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); @@ -394,6 +402,7 @@ _bt_getroot(Relation rel, int access) md.fastlevel = 0; md.oldest_btpo_xact = 
InvalidTransactionId; md.last_cleanup_num_heap_tuples = -1.0; + md.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -683,6 +692,63 @@ _bt_heapkeyspace(Relation rel) return metad->btm_version > BTREE_NOVAC_VERSION; } +/* + * _bt_get_dedupispossible() -- is deduplication possible for the index? + * get information from metapage + */ +bool +_bt_getdedupispossible(Relation rel) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here. (XXX perhaps _bt_getroot() should + * be changed to allow this case.) + * + * FIXME: Think some more about pg_upgrade'd !heapkeyspace indexes + * here, and the need for aa version bump to go with new metapage + * field. + */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return metad->btm_dedup_is_possible;; + } + + /* + * Cache the metapage data for next time + * + * An on-the-fly version upgrade performed by _bt_upgrademetapage() + * can change the nbtree version for an index without invalidating any + * local cache. This is okay because it can only happen when moving + * from version 2 to version 3, both of which are !heapkeyspace + * versions. 
+ */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + return metad->btm_dedup_is_possible; +} + /* * _bt_checkpage() -- Verify that a freshly-read page looks sane. */ @@ -983,14 +1049,52 @@ _bt_page_recyclable(Page page) void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdatable, BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + Size updated_sz = 0; + char *updated_buf = NULL; + + /* XLOG stuff, buffer for updateds */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + updated_sz += MAXALIGN(IndexTupleSize(updated[i])); + + updated_buf = palloc(updated_sz); + for (int i = 0; i < nupdatable; i++) + { + itemsz = IndexTupleSize(updated[i]); + memcpy(updated_buf + offset, (char *) updated[i], itemsz); + offset += MAXALIGN(itemsz); + } + Assert(offset == updated_sz); + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* Handle posting tuples here */ + for (int i = 0; i < nupdatable; i++) + { + /* At first, delete the old tuple. */ + PageIndexTupleDelete(page, updateitemnos[i]); + + itemsz = IndexTupleSize(updated[i]); + itemsz = MAXALIGN(itemsz); + + /* Add tuple with updated ItemPointers to the page. 
*/ + if (PageAddItem(page, (Item) updated[i], itemsz, updateitemnos[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite posting list item in index while doing vacuum"); + } + /* Fix the page */ if (nitems > 0) PageIndexMultiDelete(page, itemnos, nitems); @@ -1020,6 +1124,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.nupdated = nupdatable; + xlrec_vacuum.ndeleted = nitems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1033,6 +1139,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, if (nitems > 0) XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + /* + * Here we should save offnums and updated tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle updated tuples and only after that other deleted items. + */ + if (nupdatable > 0) + { + Assert(updated_buf != NULL); + XLogRegisterBufData(0, (char *) updateitemnos, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updated_buf, updated_sz); + } + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); PageSetLSN(page, recptr); @@ -1041,6 +1160,91 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted. + * + * This is a version of index_compute_xid_horizon_for_tuples() specialized to + * nbtree, which can handle posting lists. 
+ */ +static TransactionId +_bt_compute_xid_horizon_for_tuples(Relation rel, Relation heapRel, + Buffer buf, OffsetNumber *itemnos, + int nitems) +{ + ItemPointer htids; + TransactionId latestRemovedXid = InvalidTransactionId; + Page page = BufferGetPage(buf); + int arraynitems; + int finalnitems; + + /* + * Initial size of array can fit everything when it turns out that there + * are no posting lists + */ + arraynitems = nitems; + htids = (ItemPointer) palloc(sizeof(ItemPointerData) * arraynitems); + + finalnitems = 0; + /* identify what the index tuples about to be deleted point to */ + for (int i = 0; i < nitems; i++) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, itemnos[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + + if (!BTreeTupleIsPosting(itup)) + { + /* Make sure that we have space for additional heap TID */ + if (finalnitems + 1 > arraynitems) + { + arraynitems = arraynitems * 2; + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * arraynitems); + } + + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(&itup->t_tid, &htids[finalnitems]); + finalnitems++; + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + /* Make sure that we have space for additional heap TIDs */ + if (finalnitems + nposting > arraynitems) + { + arraynitems = Max(arraynitems * 2, finalnitems + nposting); + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * arraynitems); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + + Assert(ItemPointerIsValid(htid)); + ItemPointerCopy(htid, &htids[finalnitems]); + finalnitems++; + } + } + } + + Assert(finalnitems >= nitems); + + /* determine the actual xid horizon */ + latestRemovedXid = + table_compute_xid_horizon_for_tuples(heapRel, htids, finalnitems); + + pfree(htids); + + return latestRemovedXid; +} + /* + * Delete item(s) from a btree page during single-page cleanup. 
* @@ -1067,8 +1271,8 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + _bt_compute_xid_horizon_for_tuples(rel, heapRel, buf, + itemnos, nitems); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -2066,6 +2270,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_dedup_is_possible = metad->btm_dedup_is_possible; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4cfd5289ad..d70607e71a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumposting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -157,10 +159,11 @@ void btbuildempty(Relation index) { Page metapage; + bool dedup_is_possible = _bt_dedup_is_possible(index); /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, dedup_is_possible); /* * Write the page and log it. 
It might seem that an immediate sync would @@ -263,8 +266,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxPostingIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxPostingIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -816,7 +819,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) } else { - StdRdOptions *relopts; + BtreeOptions *relopts; float8 cleanup_scale_factor; float8 prev_num_heap_tuples; @@ -827,7 +830,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) * tuples exceeds vacuum_cleanup_index_scale_factor fraction of * original tuples count. */ - relopts = (StdRdOptions *) info->index->rd_options; + relopts = (BtreeOptions *) info->index->rd_options; cleanup_scale_factor = (relopts && relopts->vacuum_cleanup_index_scale_factor >= 0) ? relopts->vacuum_cleanup_index_scale_factor @@ -1069,7 +1072,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0, + vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } @@ -1188,8 +1192,17 @@ restart: } else if (P_ISLEAF(opaque)) { + /* Deletable item state */ OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + int nhtidsdead; + int nhtidslive; + + /* Updatable item state (for posting lists) */ + IndexTuple updated[MaxOffsetNumber]; + OffsetNumber updatable[MaxOffsetNumber]; + int nupdatable; + OffsetNumber offnum, minoff, maxoff; @@ -1229,6 +1242,10 @@ restart: * callback function. 
*/ ndeletable = 0; + nupdatable = 0; + /* Maintain stats counters for index tuple versions/heap TIDs */ + nhtidsdead = 0; + nhtidslive = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1238,11 +1255,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * During Hot Standby we currently assume that @@ -1265,8 +1280,71 @@ restart: * applies to *any* type of index that marks index tuples as * killed. */ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard heap TID representation */ + ItemPointer htid = &(itup->t_tid); + + if (callback(htid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + ItemPointer newhtids; + int nremaining; + + /* + * Posting list tuple, a physical tuple that represents + * two or more logical tuples, any of which could be an + * index row version that must be removed + */ + newhtids = btreevacuumposting(vstate, itup, &nremaining); + if (newhtids == NULL) + { + /* + * All TIDs/logical tuples from the posting tuple + * remain, so no update or delete required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + IndexTuple updatedtuple; + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * for when we update it in place + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatedtuple = BTreeFormPostingTuple(itup, newhtids, + nremaining); + updated[nupdatable] = updatedtuple; + updatable[nupdatable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + pfree(newhtids); + } + else + { + /* + * All TIDs/logical tuples from the posting list must + * be deleted. 
We'll delete the physical tuple + * completely. + */ + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + + /* Free empty array of live items */ + pfree(newhtids); + } + + nhtidslive += nremaining; + } } } @@ -1274,7 +1352,7 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1290,7 +1368,8 @@ restart: * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. */ - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + updated, nupdatable, vstate->lastBlockVacuumed); /* @@ -1300,7 +1379,7 @@ restart: if (blkno > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = blkno; - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } @@ -1315,6 +1394,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1324,15 +1404,16 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into + * If it's now empty, try to delete; else count the live tuples (live + * heap TIDs in posting lists are counted as live tuples). We don't + * delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; } if (delete_now) @@ -1375,6 +1456,68 @@ restart: } } +/* + * btreevacuumposting() -- determines which logical tuples must remain when + * VACUUMing a posting list tuple. + * + * Returns new palloc'd array of item pointers needed to build replacement + * posting list without the index row versions that are to be deleted. + * + * Note that returned array is NULL in the common case where there is nothing + * to delete in caller's posting list tuple. The number of TIDs that should + * remain in the posting list tuple is set for caller in *nremaining. This is + * also the size of the returned array (though only when array isn't just + * NULL). + */ +static ItemPointer +btreevacuumposting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + Assert(BTreeTupleIsPosting(itup)); + + /* + * Check each tuple in the posting list. Save live tuples into tmpitems, + * though try to avoid memory allocation as an optimization. + */ + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* + * Live heap TID. + * + * Only save live TID when we know that we're going to have to + * kill at least one TID, and have already allocated memory. 
+ */ + if (tmpitems) + tmpitems[live] = items[i]; + live++; + } + + /* Dead heap TID */ + else if (tmpitems == NULL) + { + /* + * Turns out we need to delete one or more dead heap TIDs, so + * start maintaining an array of live TIDs for caller to + * reconstruct smaller replacement posting list tuple + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + /* Copy live heap TIDs from previous loop iterations */ + if (live > 0) + memcpy(tmpitems, items, sizeof(ItemPointerData) * live); + } + } + + *nremaining = live; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 8e512461a0..9db73d070d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -434,7 +442,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. 
+ * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by their + * scantid/new heap TID). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +464,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +521,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +550,68 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) 
+ */ + Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + Assert(!key->nextkey); + Assert(key->scantid != NULL); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + /* + * In the unlikely event that posting list tuple has LP_DEAD bit set, + * signal to caller that it should kill the item and restart its binary + * search. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res >= 1) + low = mid + 1; + else + high = mid; + } + + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +621,18 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * It is generally guaranteed that any possible scankey with scantid set + * will have zero or one tuples in the index that are considered equal + * here. 
* * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +656,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +691,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +806,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. + */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (!BTreeTupleIsPosting(itup) || result <= 0) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1233,6 +1343,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; + inskey.dedup_is_possible = false; inskey.scantid = NULL; inskey.keysz = keysCount; @@ -1451,6 +1562,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.postingTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1485,8 +1597,29 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { /* tuple passes all scan key conditions, so 
remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + if (!BTreeTupleIsPosting(itup)) + { + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * Setup state to return posting list, and save first + * "logical" tuple + */ + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Save additional posting list "logical" tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1652,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxPostingIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1660,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxPostingIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1569,8 +1702,36 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + if (!BTreeTupleIsPosting(itup)) + { + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int i = BTreeTupleGetNPosting(itup) - 1; + + /* + * Setup state to return posting list, and save last + * "logical" tuple from posting list (since it's the first + * that will be returned to scan). 
+ */ + itemIndex--; + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i--), + itup); + + /* + * Return posting list "logical" tuples -- do this in + * descending order, to match overall scan order + */ + for (; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + } + } } if (!continuescan) { @@ -1584,8 +1745,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1759,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1773,59 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). 
+ */ +static void +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* Save a base version of the IndexTuple */ + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += itupsz; + so->currPos.postingTupleOffset = currItem->tupleOffset; + } +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every logical + * tuple that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = so->currPos.postingTupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ab19692006..a138fafeb1 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -287,6 +287,9 @@ static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState *dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void 
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -725,8 +728,8 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) if (level > 0) state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); else - state->btps_full = RelationGetTargetPageFreeSpace(wstate->index, - BTREE_DEFAULT_FILLFACTOR); + state->btps_full = BtreeGetTargetPageFreeSpace(wstate->index, + BTREE_DEFAULT_FILLFACTOR); /* no parent level, yet */ state->btps_next = NULL; @@ -799,7 +802,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -1002,6 +1006,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the minimum key for the new page. */ state->btps_minkey = CopyIndexTuple(oitup); + Assert(BTreeTupleIsPivot(state->btps_minkey)); /* * Set the sibling links for both pages. @@ -1043,6 +1048,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); /* _bt_sortaddtup() will perform full truncation later */ + BTreeTupleClearBtIsPosting(state->btps_minkey); BTreeTupleSetNAtts(state->btps_minkey, 0); } @@ -1057,6 +1063,42 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like nbtinsert.c's _bt_dedup_finish_pending(), but it adds a + * new tuple using _bt_buildadd() and does not maintain the intervals array. 
+ */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState *dstate) +{ + IndexTuple final; + + Assert(dstate->nitems > 0); + if (dstate->nitems == 1) + final = dstate->base; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(dstate->base, + dstate->htids, + dstate->nhtids); + final = postingtuple; + } + + _bt_buildadd(wstate, state, final); + + if (dstate->nitems > 1) + pfree(final); + /* Don't maintain dedup_intervals array, or alltupsize */ + dstate->nhtids = 0; + dstate->nitems = 0; +} + /* * Finish writing out the completed btree. */ @@ -1123,7 +1165,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * by filling in a valid magic number in the metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, rootblkno, rootlevel); + + _bt_initmetapage(metapage, rootblkno, rootlevel, wstate->inskey->dedup_is_possible); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } @@ -1144,6 +1187,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->dedup_is_possible && + BtreeGetDoDedupOption(wstate->index); if (merge) { @@ -1255,9 +1302,96 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } pfree(sortKeys); } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState *dstate; + IndexTuple newbase; + + dstate = (BTDedupState *) palloc(sizeof(BTDedupState)); + dstate->maxitemsize = 0; /* set later */ + dstate->checkingunique = false; /* unused */ + dstate->newitem = NULL; + /* Metadata about current pending posting list */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->overlap = false; + dstate->alltupsize = 0; /* unused */ + /* Metadata about based tuple of 
current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + dstate->maxitemsize = BTMaxItemSize(state->btps_page); + /* Conservatively size array */ + dstate->htids = palloc(dstate->maxitemsize); + + /* + * No previous/base tuple, since itup is the first item + * returned by the tuplesort -- use itup as base tuple of + * first pending posting list for entire index build + */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * BTMaxItemSize() limit. Heap TID(s) for itup have been + * saved in state. The next iteration will also end up here + * if it's possible to merge the next tuple into the same + * pending posting list. 
+ */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * BTMaxItemSize() limit was reached + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + + /* itup starts new pending posting list */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + /* + * Handle the last item (there must be a last item when the tuplesort + * returned one or more tuples) + */ + if (state) + { + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 1c1029b6c4..df976d4b7a 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -167,7 +167,7 @@ _bt_findsplitloc(Relation rel, /* Count up total space in data items before actually scanning 'em */ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); - leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); + leaffillfactor = BtreeGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); @@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel, state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,17 
+462,52 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsubhikey = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ newitemisfirstonright = (firstoldonright == state->newitemoff && !newitemonleft); + /* + * FIXME: Accessing every single tuple like this adds cycles to cases that + * cannot possibly benefit (i.e. cases where we know that there cannot be + * posting lists). Maybe we should add a way to not bother when we are + * certain that this is the case. + * + * We could either have _bt_split() pass us a flag, or invent a page flag + * that indicates that the page might have posting lists, as an + * optimization. There is no shortage of btpo_flags bits for stuff like + * this. + */ if (newitemisfirstonright) + { firstrightitemsz = state->newitemsz; + + /* Calculate posting list overhead, if any */ + if (state->is_leaf && BTreeTupleIsPosting(state->newitem)) + postingsubhikey = IndexTupleSize(state->newitem) - + BTreeTupleGetPostingOffset(state->newitem); + } else + { firstrightitemsz = firstoldonrightsz; + /* Calculate posting list overhead, if any */ + if (state->is_leaf) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsubhikey = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +530,13 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. 
+ * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead + * (while still conservatively assuming that truncation might have to add + * back a single heap TID using the pivot tuple heap TID representation). */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsubhikey) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +733,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index bc855dd25d..6fdd776ea5 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -97,8 +97,6 @@ _bt_mkscankey(Relation rel, IndexTuple itup) indoption = rel->rd_indoption; tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - /* * We'll execute search using scan key constructed on key columns. * Truncated attributes and non-key attributes are omitted from the final @@ -110,9 +108,23 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; + key->scantid = NULL; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? 
- BTreeTupleGetHeapTID(itup) : NULL; + /* get information from relation info or from btree metapage */ + key->dedup_is_possible = (itup == NULL) ? _bt_dedup_is_possible(rel) : + _bt_getdedupispossible(rel); + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + Assert(!itup || !BTreeTupleIsPosting(itup) || key->heapkeyspace); + + /* + * When caller passes a tuple with a heap TID, use it to set scantid. Note + * that this handles posting list tuples by setting scantid to the lowest + * heap TID in the posting list. + */ + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1386,6 +1398,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1547,6 +1560,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1786,10 +1800,35 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* Read-ahead to later kitems */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list 
items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2027,7 +2066,30 @@ BTreeShmemInit(void) bytea * btoptions(Datum reloptions, bool validate) { - return default_reloptions(reloptions, validate, RELOPT_KIND_BTREE); + relopt_value *options; + BtreeOptions *rdopts; + int numoptions; + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(BtreeOptions, fillfactor)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(BtreeOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplication", RELOPT_TYPE_BOOL, offsetof(BtreeOptions, do_deduplication)} + }; + + options = parseRelOptions(reloptions, validate, RELOPT_KIND_BTREE, + &numoptions); + + /* if none set, we're done */ + if (numoptions == 0) + return NULL; + + rdopts = allocateReloptStruct(sizeof(BtreeOptions), options, numoptions); + + fillRelOptions((void *) rdopts, sizeof(BtreeOptions), options, numoptions, + validate, tab, lengthof(tab)); + + pfree(options); + return (bytea *) rdopts; } /* @@ -2140,6 +2202,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + if (keepnatts == natts) + { + /* + * index_truncate_tuple() just returned a copy of the + * original, so make sure that the size of the new pivot tuple + * doesn't have posting list overhead + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + } + + Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2156,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. 
*/ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2163,6 +2245,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. We + * can always truncate away a posting list, though. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + } else { /* @@ -2170,7 +2270,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * It's necessary to add a heap TID attribute to the new pivot tuple. */ Assert(natts == nkeyatts); - newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); + newsize = MAXALIGN(IndexTupleSize(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); pivot = palloc0(newsize); memcpy(pivot, firstright, IndexTupleSize(firstright)); } @@ -2188,6 +2289,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * nbtree (e.g., there is no pg_attribute entry). 
*/ Assert(itup_key->heapkeyspace); + Assert(!BTreeTupleIsPosting(pivot)); pivot->t_info &= ~INDEX_SIZE_MASK; pivot->t_info |= newsize; @@ -2200,7 +2302,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2211,9 +2313,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2226,7 +2331,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. 
Note that @@ -2235,7 +2340,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2316,15 +2422,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal (once detoasted). Similarly, result may - * differ from the _bt_keep_natts result when either tuple has TOASTed datums, - * though this is barely possible in practice. + * unless they're bitwise equal after detoasting. * * These issues must be acceptable to callers, typically because they're only * concerned about making suffix truncation as effective as possible without * leaving excessive amounts of free space on either side of page split. * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts. + * + * When an index only uses opclasses where equality is "precise", this + * function is guaranteed to give the same result as _bt_keep_natts(). This + * makes it safe to use this function to determine whether or not two tuples + * can be folded together into a single posting tuple. Posting list + * deduplication cannot be used with nondeterministic collations for this + * reason. + * + * FIXME: Actually invent the needed "equality-is-precise" opclass + * infrastructure. 
See dedicated -hackers thread: + * + * https://postgr.es/m/CAH2-Wzn3Ee49Gmxb7V1VJ3-AC8fWn-Fr8pfWQebHe8rYRxt5OQ@mail.gmail.com */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2349,8 +2465,38 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) if (isNull1 != isNull2) break; + /* + * XXX: The ideal outcome from the point of view of the posting list + * patch is that the definition of an opclass with "precise equality" + * becomes: "equality operator function must give exactly the same + * answer as datum_image_eq() would, provided that we aren't using a + * nondeterministic collation". (Nondeterministic collations are + * clearly not compatible with deduplication.) + * + * This will be a lot faster than actually using the authoritative + * insertion scankey in some cases. This approach also seems more + * elegant, since suffix truncation gets to follow exactly the same + * definition of "equal" as posting list deduplication -- there is a + * subtle interplay between deduplication and suffix truncation, and + * it would be nice to know for sure that they have exactly the same + * idea about what equality is. + * + * This ideal outcome still avoids problems with TOAST. We cannot + * repeat bugs like the amcheck bug that was fixed in bugfix commit + * eba775345d23d2c999bbb412ae658b6dab36e3e8. datum_image_eq() + * considers binary equality, though only _after_ each datum is + * decompressed. + * + * If this ideal solution isn't possible, then we can fall back on + * defining "precise equality" as: "type's output function must + * produce identical textual output for any two datums that compare + * equal when using a safe/equality-is-precise operator class (unless + * using a nondeterministic collation)". That would mean that we'd + * have to make deduplication call _bt_keep_natts() instead (or some + * other function that uses authoritative insertion scankey). 
+ */ if (!isNull1 && - !datumIsEqual(datum1, datum2, att->attbyval, att->attlen)) + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) break; keepnatts++; @@ -2402,22 +2548,30 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2461,12 +2615,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. 
*/ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && - ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; } else { @@ -2492,7 +2646,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. */ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) return false; /* @@ -2562,11 +2720,119 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, BTMaxItemSizeNoHeapTid(page), RelationGetRelationName(rel)), errdetail("Index row references tuple (%u,%u) in relation \"%s\".", - ItemPointerGetBlockNumber(&newtup->t_tid), - ItemPointerGetOffsetNumber(&newtup->t_tid), + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), RelationGetRelationName(heap)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(heap, RelationGetRelationName(rel)))); } + +/* + * Given a basic tuple that contains key datum and posting list, build a + * posting tuple. Caller's "htids" array must be sorted in ascending order. + * + * Basic tuple can be a posting tuple, but we only use key part of it, all + * ItemPointers must be passed via htids. + * + * If nhtids == 1, just build a non-posting tuple. It is necessary to avoid + * storage overhead after posting tuple was vacuumed. 
+ */
+IndexTuple
+BTreeFormPostingTuple(IndexTuple tuple, ItemPointer htids, int nhtids)
+{
+	uint32		keysize,
+				newsize = 0;
+	IndexTuple	itup;
+
+	/* We only need key part of the tuple */
+	if (BTreeTupleIsPosting(tuple))
+		keysize = BTreeTupleGetPostingOffset(tuple);
+	else
+		keysize = IndexTupleSize(tuple);
+
+	Assert(nhtids > 0);
+
+	/* Add space needed for posting list */
+	if (nhtids > 1)
+		newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nhtids;
+	else
+		newsize = keysize;
+
+	newsize = MAXALIGN(newsize);
+	itup = palloc0(newsize);
+	memcpy(itup, tuple, keysize);
+	itup->t_info &= ~INDEX_SIZE_MASK;
+	itup->t_info |= newsize;
+
+	if (nhtids > 1)
+	{
+		/* Form posting tuple, fill posting fields */
+
+		itup->t_info |= INDEX_ALT_TID_MASK;
+		BTreeSetPostingMeta(itup, nhtids, SHORTALIGN(keysize));
+		/* Copy posting list into the posting tuple */
+		memcpy(BTreeTupleGetPosting(itup), htids,
+			   sizeof(ItemPointerData) * nhtids);
+
+#ifdef USE_ASSERT_CHECKING
+		{
+			/* Assert that htid array is sorted and has unique TIDs */
+			ItemPointerData last;
+			ItemPointer current;
+
+			ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+			for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+			{
+				current = BTreeTupleGetPostingN(itup, i);
+				Assert(ItemPointerCompare(current, &last) > 0);
+				ItemPointerCopy(current, &last);
+			}
+		}
+#endif
+	}
+	else
+	{
+		/* To finish building of a non-posting tuple, copy TID from htids */
+		itup->t_info &= ~INDEX_ALT_TID_MASK;
+		ItemPointerCopy(htids, &itup->t_tid);
+	}
+
+	return itup;
+}
+
+/*
+ * Note: This does not account for pg_upgrade'd !heapkeyspace indexes
+ */
+bool
+_bt_dedup_is_possible(Relation index)
+{
+	int			dedup_is_possible = false;
+
+	if (IndexRelationGetNumberOfAttributes(index) ==
+		IndexRelationGetNumberOfKeyAttributes(index))
+	{
+		int			i;
+
+		dedup_is_possible = true;
+
+		for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++)
+		{
+			Oid			opfamily = index->rd_opfamily[i];
+			Oid			collation = 
index->rd_indcollation[i]; + + /* TODO add adequate check of opclasses and collations */ + elog(DEBUG4, "index %s column i %d opfamilyOid %u collationOid %u", + RelationGetRelationName(index), i, opfamily, collation); + /* NUMERIC BTREE OPFAMILY OID is 1988 */ + if (opfamily == 1988) + { + return false; + } + } + } + + return dedup_is_possible; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dd5315c1aa..747ab4235c 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -21,8 +21,11 @@ #include "access/xlog.h" #include "access/xlogutils.h" #include "storage/procarray.h" +#include "utils/memutils.h" #include "miscadmin.h" +static MemoryContext opCtx; /* working memory for operations */ + /* * _bt_restore_page -- re-enter all the index tuples on a page * @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_dedup_is_possible = xlrec->btm_dedup_is_possible; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -181,9 +185,46 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (xlrec->postingoff == InvalidOffsetNumber) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + + /* + * A posting list split occurred during insertion. + * + * Use _bt_posting_split() to repeat posting list split steps from + * primary. 
Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. + */ + Assert(isleaf); + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* newitem must be mutable copy for _bt_posting_split() */ + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_posting_split(newitem, oposting, + xlrec->postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* insert new item */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add posting split new item"); + } PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -265,20 +306,42 @@ btree_xlog_split(bool onleft, XLogReaderState *record) BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; IndexTuple newitem = NULL, - left_hikey = NULL; + left_hikey = NULL, + nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; Page newlpage; - OffsetNumber leftoff; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; datapos = XLogRecGetBlockData(record, 0, &datalen); - if (onleft) + if (onleft || xlrec->postingoff != 0) { newitem = (IndexTuple) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + /* + * Use _bt_posting_split() to repeat posting list split steps + * from primary + */ + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* newitem must be mutable copy for _bt_posting_split() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(lpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(lpage, 
itemid); + nposting = _bt_posting_split(newitem, oposting, + xlrec->postingoff); + } } /* Extract left hikey and its size (assuming 16-bit alignment) */ @@ -304,8 +367,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record) Size itemsz; IndexTuple item; + /* Add replacement posting list when required */ + if (off == replacepostingoff) + { + Assert(onleft || xlrec->firstright == xlrec->newitemoff); + if (PageAddItem(newlpage, (Item) nposting, + MAXALIGN(IndexTupleSize(nposting)), leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new posting list item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; + } + /* add the new item if it was inserted on left page */ - if (onleft && off == xlrec->newitemoff) + else if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -379,6 +454,83 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } } +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buf; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + /* + * Initialize a temporary empty page and copy all the items to that in + * item number order. 
+ */ + Page page = (Page) BufferGetPage(buf); + OffsetNumber offnum; + BTDedupState *state; + + state = (BTDedupState *) palloc(sizeof(BTDedupState)); + + state->maxitemsize = BTMaxItemSize(page); + state->checkingunique = false; /* unused */ + state->newitem = NULL; + /* Metadata about current pending posting list */ + state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + state->overlap = false; + /* Metadata about based tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + + /* Conservatively size array */ + state->htids = palloc(state->maxitemsize); + + /* + * Iterate over tuples on the page belonging to the interval to + * deduplicate them into a posting list. + */ + for (offnum = xlrec->baseoff; + offnum < xlrec->baseoff + xlrec->nitems; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == xlrec->baseoff) + { + /* + * No previous/base tuple for first data item -- use first + * data item as base tuple of first pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else + { + /* Heap TID(s) for itup will be saved in state */ + if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "could not add heap tid to pending posting list"); + } + } + + Assert(state->nitems == xlrec->nitems); + /* Handle the last item */ + _bt_dedup_finish_pending(buf, state, false); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + static void btree_xlog_vacuum(XLogReaderState *record) { @@ -386,8 +538,8 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); +#ifdef UNUSED /* * This section of code is thought to be no longer 
needed, after analysis @@ -478,14 +630,34 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + IndexTuple updated; + Size itemsz; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updated = (IndexTuple) ((char *) updatedoffsets + + xlrec->nupdated * sizeof(OffsetNumber)); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* Handle posting tuples */ + for (int i = 0; i < xlrec->nupdated; i++) + { + PageIndexTupleDelete(page, updatedoffsets[i]); + + itemsz = MAXALIGN(IndexTupleSize(updated)); + + if (PageAddItem(page, (Item) updated, itemsz, updatedoffsets[i], + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_vacuum: failed to add updated posting list item"); + + updated = (IndexTuple) ((char *) updated + itemsz); + } + } + + if (xlrec->ndeleted) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); } /* @@ -820,7 +992,9 @@ void btree_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + oldCtx = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_BTREE_INSERT_LEAF: @@ -838,6 +1012,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_DEDUP_PAGE: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -863,6 +1040,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + 
MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 4ee6d04a68..1dde2da285 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -30,7 +30,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_insert *xlrec = (xl_btree_insert *) rec; - appendStringInfo(buf, "off %u", xlrec->offnum); + appendStringInfo(buf, "off %u; postingoff %u", + xlrec->offnum, xlrec->postingoff); break; } case XLOG_BTREE_SPLIT_L: @@ -38,16 +39,30 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, + xlrec->firstright, + xlrec->newitemoff, + xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP_PAGE: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "baseoff %u; nitems %u", + xlrec->baseoff, + xlrec->nitems); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "lastBlockVacuumed %u; nupdated %u; ndeleted %u", + xlrec->lastBlockVacuumed, + xlrec->nupdated, + xlrec->ndeleted); break; } case XLOG_BTREE_DELETE: @@ -131,6 +146,9 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_DEDUP_PAGE: + id = "DEDUPLICATE"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4a80e84aa7..593f74c26e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -107,11 +107,43 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last 
cleanup */ + bool btm_dedup_is_possible; /* whether the deduplication can be + * applied to the index */ } BTMetaPageData; #define BTPageGetMeta(p) \ ((BTMetaPageData *) PageGetContents(p)) +/* Storage type for Btree's reloptions */ +typedef struct BtreeOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int fillfactor; + double vacuum_cleanup_index_scale_factor; + bool do_deduplication; +} BtreeOptions; + +/* + * By default deduplication is enabled for non unique indexes + * and disabled for unique ones + * + * XXX: Actually, we use deduplication everywhere for now. Re-review this + * decision later on. + */ +#define BtreeDefaultDoDedup(relation) \ + (relation->rd_index->indisunique ? true : true) + +#define BtreeGetDoDedupOption(relation) \ + ((relation)->rd_options ? \ + ((BtreeOptions *) (relation)->rd_options)->do_deduplication : BtreeDefaultDoDedup(relation)) + +#define BtreeGetFillFactor(relation, defaultff) \ + ((relation)->rd_options ? \ + ((BtreeOptions *) (relation)->rd_options)->fillfactor : (defaultff)) + +#define BtreeGetTargetPageFreeSpace(relation, defaultff) \ + (BLCKSZ * (100 - BtreeGetFillFactor(relation, defaultff)) / 100) + /* * The current Btree version is 4. That's what you'll get when you create * a new index. @@ -234,8 +266,7 @@ typedef struct BTMetaPageData * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * * All other types of index tuples ("pivot" tuples) only have key columns, * since pivot tuples only exist to represent how the key space is @@ -252,6 +283,38 @@ typedef struct BTMetaPageData * omitted rather than truncated, since its representation is different to * the non-pivot representation.) 
* + * Non-pivot posting tuple format: + *	t_tid | t_info | key values | INCLUDE columns, if any | posting_list[] + * + * In order to store duplicated keys more effectively, we use a special format + * of tuples - posting tuples. posting_list is an array of ItemPointerData. + * + * Deduplication never applies to unique indexes or indexes with INCLUDEd + * columns. + * + * To distinguish posting tuples we use INDEX_ALT_TID_MASK flag in t_info and + * BT_IS_POSTING flag in t_tid. + * These flags redefine the content of the posting tuple's tid: + * - t_tid.ip_blkid contains offset of the posting list. + * - t_tid offset field contains number of posting items this tuple contains + * + * The 12 least significant offset bits from t_tid are used to represent + * the number of posting items in posting tuples, leaving 4 status + * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for + * future use. + * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting + * tuples, which is constrained by BTMaxItemSize. + * + * If a page contains so many duplicates that they do not fit into one posting + * tuple (bounded by BTMaxItemSize), the page may contain several posting + * tuples with the same key. + * Also, a page can contain both posting and non-posting tuples with the same key. + * Currently, posting tuples always contain at least two TIDs in the posting + * list. + * + * Posting tuples always have the same number of attributes as the index has + * generally. + * * Pivot tuple format: * *  t_tid | t_info | key values | [heap TID] @@ -281,23 +344,152 @@ typedef struct BTMetaPageData * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. + * BT_IS_POSTING bit must be unset for pivot tuples, since we use it + * to distinguish posting tuples from pivot tuples. 
* * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes).  They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. + * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF +#define BT_N_POSTING_OFFSET_MASK 0x0FFF #define BT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 -/* Get/set downlink block number */ +/* + * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples + * that can fit on one btree leaf page. + * + * Btree leaf pages may contain posting tuples, which store duplicates + * in a more effective way, so MaxPostingIndexTuplesPerPage is larger than + * MaxIndexTuplesPerPage. + * + * Each leaf page must contain at least three items, so estimate it as + * if we have three posting tuples with minimal size keys. + */ +#define MaxPostingIndexTuplesPerPage \ +	((int) ((BLCKSZ - SizeOfPageHeaderData - \ +			3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \ +			(sizeof(ItemPointerData))) + +/* + * State used to represent a pending posting list during deduplication. + * + * Each entry represents a group of consecutive items from the page, starting + * from page offset number 'baseoff', which is the offset number of the "base" + * tuple on the page undergoing deduplication. 
'nitems' is the total number + * of items from the page that will be merged to make a new posting tuple. + * + * Note: 'nitems' means the number of physical index tuples/line pointers on + * the page, starting with and including the item at offset number 'baseoff' + * (so nitems should be at least 2 when interval is used).  These existing + * tuples may be posting list tuples or regular tuples. + */ +typedef struct BTDedupInterval +{ +	OffsetNumber baseoff; +	OffsetNumber nitems; +} BTDedupInterval; + +/* + * Btree-private state needed to build posting tuples.  htids is an array of + * ItemPointers for the pending posting list. + * + * Iterating over tuples during index build or applying deduplication to a + * single page, we remember a "base" tuple, then compare the next one with it. + * If tuples are equal, save their TIDs in the posting list. + */ +typedef struct BTDedupState +{ +	Relation	rel; +	/* Deduplication status info for entire page/operation */ +	Size		maxitemsize;	/* BTMaxItemSize() limit for page */ +	IndexTuple	newitem; +	bool		checkingunique; + +	/* Metadata about current pending posting list */ +	ItemPointer htids;			/* Heap TIDs in pending posting list */ +	int			nhtids;			/* # valid heap TIDs in htids array */ +	int			nitems;			/* See BTDedupInterval definition */ +	Size		alltupsize;		/* Includes line pointer overhead */ +	bool		overlap;		/* Avoid overlapping posting lists? */ + +	/* Metadata about base tuple of current pending posting list */ +	IndexTuple	base;			/* Used to form new posting list */ +	OffsetNumber baseoff;		/* page offset of base */ +	Size		basetupsize;	/* base size without posting list */ + +	/* +	 * Pending posting list.  Contains information about a group of +	 * consecutive items that will be deduplicated by creating a new posting +	 * list tuple. +	 */ +	BTDedupInterval interval; +} BTDedupState; + +/* + * N.B.: BTreeTupleIsPivot() should only be used in code that deals with + * heapkeyspace indexes specifically. 
BTreeTupleIsPosting() works with all + * nbtree indexes, though. + */ +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + Assert(BTreeTupleIsPosting(itup)); \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (off)); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointer) ((char*) (itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (BTreeTupleGetPosting(itup) + (n)) + +/* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) #define BTreeInnerTupleSetDownLink(itup, blkno) \ @@ 
-326,40 +518,73 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) + +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int n) +{ + Assert(!BTreeTupleIsPosting(itup)); + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, n & BT_N_KEYS_OFFSET_MASK); +} /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Get tiebreaker heap TID attribute, if any. Works with both pivot and + * non-pivot tuples, despite differences in how heap TID is represented. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? 
*/ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &(itup->t_tid); +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. Works with + * non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + return (ItemPointer) (BTreeTupleGetPosting(itup) + + (BTreeTupleGetNPosting(itup) - 1)); + + return &(itup->t_tid); +} + /* * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * representation */ #define BTreeTupleSetAltHeapTID(itup) \ do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(BTreeTupleIsPivot(itup)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -472,6 +697,7 @@ typedef struct BTScanInsertData bool anynullkeys; bool nextkey; bool pivotsearch; + bool dedup_is_possible; ItemPointer scantid; /* tiebreaker for scankeys */ int keysz; /* Size of scankeys array */ ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ @@ -499,6 +725,13 @@ typedef struct BTInsertStateData /* Buffer containing leaf page we're likely to insert itup on */ Buffer buf; + /* + * if _bt_binsrch_insert() found the location inside existing posting + * list, save the position inside the list. This will be -1 in rare cases + * where the overlapping posting list is LP_DEAD. + */ + int postingoff; + /* * Cache of bounds within the current buffer. Only used for insertions * where _bt_check_unique is called. 
See _bt_binsrch_insert and @@ -534,7 +767,9 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a version of the + * tuple that does not include the posting list, allowing the same key to be + * returned for each logical tuple associated with the posting list. */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -563,9 +798,13 @@ typedef struct BTScanPosData /* * If we are doing an index-only scan, nextTupleOffset is the first free - * location in the associated tuple storage workspace. + * location in the associated tuple storage workspace. Posting list + * tuples need postingTupleOffset to store the current location of the + * tuple that is returned multiple times (once per heap TID in posting + * list). 
*/ int nextTupleOffset; + int postingTupleOffset; /* * The items array is always ordered in index order (ie, increasing @@ -578,7 +817,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -730,8 +969,15 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel); +extern IndexTuple _bt_posting_split(IndexTuple newitem, IndexTuple oposting, + OffsetNumber postingoff); extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); +extern void _bt_dedup_start_pending(BTDedupState *state, IndexTuple base, + OffsetNumber base_off); +extern bool _bt_dedup_save_htid(BTDedupState *state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Buffer buffer, BTDedupState *state, + bool need_wal); /* * prototypes for functions in nbtsplitloc.c @@ -743,7 +989,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool dedup_is_possible); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); @@ -751,6 +998,7 @@ extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); extern bool _bt_heapkeyspace(Relation rel); +extern bool _bt_getdedupispossible(Relation rel); extern void _bt_checkpage(Relation rel, Buffer buf); extern 
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -762,6 +1010,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdateable, BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf); @@ -812,6 +1062,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointer htids, + int nhtids); +extern bool _bt_dedup_is_possible(Relation index); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 91b9ee00cf..71f6568234 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_DEDUP_PAGE 0x50 /* deduplicate tuples on leaf page */ +/* 0x60 is unused */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -53,6 +54,7 @@ typedef struct xl_btree_metadata uint32 fastlevel; TransactionId oldest_btpo_xact; float8 last_cleanup_num_heap_tuples; + bool btm_dedup_is_possible; } xl_btree_metadata; /* @@ -61,16 +63,21 @@ typedef struct xl_btree_metadata * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. 
* Note that INSERT_META implies it's not a leaf page. * - * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 0: original page (data contains the inserted tuple); + *				 if postingoff is set, this started out as an insertion + *				 into an existing posting tuple at the offset before + *				 offnum (i.e. it's a posting list split).  (REDO will + *				 have to update the split posting list, too.) * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META * Backup Blk 2: xl_btree_metadata, if INSERT_META */ typedef struct xl_btree_insert { 	OffsetNumber offnum; +	OffsetNumber postingoff; } xl_btree_insert; -#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) +#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, postingoff) + sizeof(OffsetNumber)) /* * On insert with split, we save all the items going into the right sibling @@ -91,9 +98,19 @@ typedef struct xl_btree_insert * * Backup Blk 0: original page / new left page * - * The left page's data portion contains the new item, if it's the _L variant. - * An IndexTuple representing the high key of the left page must follow with - * either variant. + * The left page's data portion contains the new item, if it's the _L variant + * (though _R variant page split records with a posting list split sometimes + * need to include newitem).  An IndexTuple representing the high key of the + * left page must follow in all cases. + * + * The newitem is actually an "original" newitem when a posting list split + * occurs that requires that the original posting list be updated in passing. + * Recovery recognizes this case when postingoff is set, and must use the + * posting offset to do an in-place update of the existing posting list that + * was actually split, and change the newitem to the "final" newitem.  This + * corresponds to the xl_btree_insert postingoff-is-set case. 
postingoff + * won't be set when a posting list split occurs where both original posting + * list and newitem go on the right page. * * Backup Blk 1: new right page * @@ -111,10 +128,26 @@ typedef struct xl_btree_split { 	uint32		level;			/* tree level of page being split */ 	OffsetNumber firstright;	/* first item moved to right page */ -	OffsetNumber newitemoff;	/* new item's offset (useful for _L variant) */ +	OffsetNumber newitemoff;	/* new item's offset */ +	OffsetNumber postingoff;	/* offset inside orig posting tuple */ } xl_btree_split; -#define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(OffsetNumber)) + +/* + * When a page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents the interval that describes the posting tuple + * that should be added to the page. + */ +typedef struct xl_btree_dedup +{ +	OffsetNumber baseoff; +	OffsetNumber nitems; +} xl_btree_dedup; + +#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nitems) + sizeof(OffsetNumber)) /* * This is what we need to know about delete of individual leaf index tuples. @@ -166,16 +199,27 @@ typedef struct xl_btree_reuse_page * block numbers aren't given. * * Note that the *last* WAL record in any vacuum of an index is allowed to - * have a zero length array of offsets. Earlier records must have at least one. + * have a zero length array of target offsets (i.e. no deletes or updates). + * Earlier records must have at least one. */ typedef struct xl_btree_vacuum { 	BlockNumber lastBlockVacuumed; -	/* TARGET OFFSET NUMBERS FOLLOW */ +	/* +	 * This field helps us to find the beginning of the updated versions of tuples, +	 * which follow the array of offset numbers, needed when a posting list is +	 * vacuumed without killing all of its logical tuples. 
+ */ + uint32 nupdated; + uint32 ndeleted; + + /* UPDATED TARGET OFFSET NUMBERS FOLLOW (if any) */ + /* UPDATED TUPLES TO ADD BACK FOLLOW (if any) */ + /* DELETED TARGET OFFSET NUMBERS FOLLOW (if any) */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber)) /* * This is what we need to know about marking an empty branch for deletion. @@ -256,6 +300,8 @@ typedef struct xl_btree_newroot extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTXLOG_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3c0db2ccf5..2b8c6c7fc8 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) diff --git a/src/tools/valgrind.supp 
b/src/tools/valgrind.supp index ec47a228ae..71a03e39d3 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -212,3 +212,24 @@ Memcheck:Cond fun:PyObject_Realloc } + +# Temporarily work around bug in datum_image_eq's handling of the cstring +# (typLen == -2) case. datumIsEqual() is not affected, but also doesn't handle +# TOAST'ed values correctly. +# +# FIXME: Remove both suppressions when bug is fixed on master branch +{ + temporary_workaround_1 + Memcheck:Addr1 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} + +{ + temporary_workaround_8 + Memcheck:Addr8 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} -- 2.17.1