From 1df33bd12aaf21179da6d3aedaa7a2084e577d25 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 19 Jul 2019 18:57:31 -0700 Subject: [PATCH v5 1/3] Compression/deduplication in nbtree. Version with some revisions by me. --- contrib/amcheck/verify_nbtree.c | 124 +++++-- src/backend/access/nbtree/nbtinsert.c | 430 +++++++++++++++++++++++- src/backend/access/nbtree/nbtpage.c | 53 +++ src/backend/access/nbtree/nbtree.c | 142 ++++++-- src/backend/access/nbtree/nbtsearch.c | 283 +++++++++++++--- src/backend/access/nbtree/nbtsort.c | 197 ++++++++++- src/backend/access/nbtree/nbtsplitloc.c | 30 +- src/backend/access/nbtree/nbtutils.c | 164 ++++++++- src/backend/access/nbtree/nbtxlog.c | 34 +- src/backend/access/rmgrdesc/nbtdesc.c | 6 +- src/include/access/itup.h | 4 + src/include/access/nbtree.h | 202 ++++++++++- src/include/access/nbtxlog.h | 13 +- 13 files changed, 1528 insertions(+), 154 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 55a3a4bbe0..da79dd1b62 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -889,6 +889,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -959,29 +960,73 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { char *itid, *htid; + ItemPointer tid = BTreeTupleGetHeapTID(itup); itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. 
+ */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1039,12 +1084,33 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + IndexTuple onetup; + + /* Fingerprint all elements of posting tuple one by one */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + onetup = BTreeGetNthTupleOfPosting(itup, i); + + norm = bt_normalize_tuple(state, onetup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != onetup) + pfree(norm); + pfree(onetup); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1052,7 +1118,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1092,6 +1159,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (!BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1115,6 +1185,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1129,11 +1200,13 @@ bt_target_page_check(BtreeCheckState *state) *htid, *nitid, *nhtid; + ItemPointer tid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1142,9 +1215,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1154,10 +1229,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1918,10 +1993,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have their own posting + * list, since dummy CREATE INDEX callback code generates new tuples with the + * same normalized representation. Compression is performed + * opportunistically, and in general there is no guarantee about how or when + * compression will be applied. 
*/ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -2525,14 +2601,16 @@ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); + ItemPointer result; BlockNumber targetblock = state->targetblock; - if (result == NULL && nonpivot) + if (BTreeTupleIsPivot(itup) == nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", targetblock, RelationGetRelationName(state->rel)))); + result = BTreeTupleGetHeapTID(itup); + return result; } diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 5890f393f6..f0c1174e2a 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -41,6 +41,17 @@ static OffsetNumber _bt_findinsertloc(Relation rel, BTStack stack, Relation heapRel); static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack); +static void _bt_delete_and_insert(Relation rel, + Buffer buf, + IndexTuple newitup, + OffsetNumber newitemoff); +static void _bt_insertonpg_in_posting(Relation rel, BTScanInsert itup_key, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page, int in_posting_offset); static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, @@ -56,6 +67,8 @@ static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); +static void insert_itupprev_to_page(Page page, BTCompressState *compressState); +static void _bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel); /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. @@ -297,10 +310,17 @@ top: * search bounds established within _bt_check_unique when insertion is * checkingunique. */ + insertstate.in_posting_offset = 0; newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); - _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + + if (insertstate.in_posting_offset) + _bt_insertonpg_in_posting(rel, itup_key, insertstate.buf, + InvalidBuffer, stack, itup, newitemoff, + false, insertstate.in_posting_offset); + else + _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, + stack, itup, newitemoff, false); } else { @@ -412,6 +432,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, } curitemid = PageGetItemId(page, offset); + Assert(!BTreeTupleIsPosting(curitup)); /* * We can skip items that are marked killed. @@ -759,6 +780,26 @@ _bt_findinsertloc(Relation rel, _bt_vacuum_one_page(rel, insertstate->buf, heapRel); insertstate->bounds_valid = false; } + + /* + * If the target page is full, try to compress the page + */ + if (PageGetFreeSpace(page) < insertstate->itemsz && !checkingunique) + { + _bt_compress_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; /* paranoia */ + + /* + * FIXME: _bt_vacuum_one_page() won't have cleared the + * BTP_HAS_GARBAGE flag when it didn't kill items. 
Maybe we
+			 * should clear the BTP_HAS_GARBAGE flag bit from the page when
+			 * compression avoids a page split -- _bt_vacuum_one_page() is
+			 * expecting a page split that takes care of it.
+			 *
+			 * (On the other hand, maybe it doesn't matter very much. A
+			 * comment update seems like the bare minimum we should do.)
+			 */
+		}
 	}
 	else
 	{
@@ -900,6 +941,208 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
 	insertstate->bounds_valid = false;
 }
 
+/*
+ * Delete the tuple at offset newitemoff and insert newitup at the same offset.
+ * All checks of free space must have been done before calling this function.
+ *
+ * For use when updating a posting tuple.
+ */
+static void
+_bt_delete_and_insert(Relation rel,
+					  Buffer buf,
+					  IndexTuple newitup,
+					  OffsetNumber newitemoff)
+{
+	Page		page = BufferGetPage(buf);
+	Size		newitupsz = IndexTupleSize(newitup);
+
+	newitupsz = MAXALIGN(newitupsz);
+
+	START_CRIT_SECTION();
+
+	PageIndexTupleDelete(page, newitemoff);
+
+	if (!_bt_pgaddtup(page, newitupsz, newitup, newitemoff))
+		elog(ERROR, "failed to insert compressed item in index \"%s\"",
+			 RelationGetRelationName(rel));
+
+	MarkBufferDirty(buf);
+
+	/* Xlog stuff */
+	if (RelationNeedsWAL(rel))
+	{
+		xl_btree_insert xlrec;
+		XLogRecPtr	recptr;
+
+		xlrec.offnum = newitemoff;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+		Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
+
+		/*
+		 * Force full page write to keep code simple
+		 *
+		 * TODO: think of using XLOG_BTREE_INSERT_LEAF with a new tuple's data
+		 */
+		XLogRegisterBuffer(0, buf, REGBUF_STANDARD | REGBUF_FORCE_IMAGE);
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF);
+		PageSetLSN(page, recptr);
+	}
+
+	END_CRIT_SECTION();
+}
+
+/*
+ * _bt_insertonpg_in_posting() --
+ *		Insert a tuple on a particular page in the index
+ *		(compression aware version).
+ *
+ * If the new tuple's key is equal to the key of a posting tuple that already
+ * exists on the page, and its TID falls inside the min/max range of the
+ * existing posting list, update the posting tuple.
+ *
+ * This can only happen on a leaf page.
+ *
+ * newitemoff - offset of the posting tuple we must update
+ * in_posting_offset - position of the new tuple's TID in posting list
+ *
+ * If necessary, split the page.
+ */
+static void
+_bt_insertonpg_in_posting(Relation rel,
+						  BTScanInsert itup_key,
+						  Buffer buf,
+						  Buffer cbuf,
+						  BTStack stack,
+						  IndexTuple itup,
+						  OffsetNumber newitemoff,
+						  bool split_only_page,
+						  int in_posting_offset)
+{
+	IndexTuple	origtup;
+	IndexTuple	lefttup;
+	IndexTuple	righttup;
+	ItemPointerData *ipd;
+	IndexTuple	newitup;
+	ItemId		itemid;
+	Page		page;
+	int			nipd,
+				nipd_right;
+
+	page = BufferGetPage(buf);
+	/* get old posting tuple */
+	itemid = PageGetItemId(page, newitemoff);
+	origtup = (IndexTuple) PageGetItem(page, itemid);
+	Assert(BTreeTupleIsPosting(origtup));
+	nipd = BTreeTupleGetNPosting(origtup);
+	Assert(in_posting_offset < nipd);
+	Assert(itup_key->scantid != NULL);
+	Assert(itup_key->heapkeyspace);
+
+	elog(DEBUG4, "(%u,%u) is min, (%u,%u) is max, (%u,%u) is new",
+		 ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+		 ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+		 ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+		 ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+		 ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(itup)),
+		 ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(itup)));
+
+	/*
+	 * First check if the existing item is dead.
+ * + * Then check if the new itempointer fits into the tuple's posting list. + * + * Also check if new itempointer fits into the page. + * + * If not, posting tuple's split is required in both cases. + * + * XXX: Think some more about alignment - pg + */ + if (ItemIdIsDead(itemid)) + { + /* FIXME: We need to call index_compute_xid_horizon_for_tuples() */ + elog(DEBUG4, "replacing LP_DEAD posting list item, new off %d", + newitemoff); + _bt_delete_and_insert(rel, buf, itup, newitemoff); + _bt_relbuf(rel, buf); + } + else if (BTMaxItemSize(page) < MAXALIGN(IndexTupleSize(origtup)) + MAXALIGN(sizeof(ItemPointerData)) || + PageGetFreeSpace(page) < MAXALIGN(IndexTupleSize(origtup)) + MAXALIGN(sizeof(ItemPointerData))) + { + /* + * Split posting tuple into two halves. + * + * Left tuple contains all item pointes less than the new one and + * right tuple contains new item pointer and all to the right. + * + * TODO Probably we can come up with more clever algorithm. + */ + lefttup = BTreeFormPostingTuple(origtup, BTreeTupleGetPosting(origtup), + in_posting_offset); + + nipd_right = nipd - in_posting_offset + 1; + ipd = palloc0(sizeof(ItemPointerData) * nipd_right); + /* insert new item pointer */ + memcpy(ipd, itup, sizeof(ItemPointerData)); + /* copy item pointers from original tuple that belong on right */ + memcpy(ipd + 1, + BTreeTupleGetPostingN(origtup, in_posting_offset), + sizeof(ItemPointerData) * (nipd - in_posting_offset)); + + righttup = BTreeFormPostingTuple(origtup, ipd, nipd_right); + elog(DEBUG4, "inserting inside posting list with split due to no space orig elements %d new off %d", + nipd, in_posting_offset); + + Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lefttup), + BTreeTupleGetHeapTID(righttup)) < 0); + + /* + * Replace old tuple with a left tuple on a page. + * + * And insert righttuple using ordinary _bt_insertonpg() function If + * split is required, _bt_insertonpg will handle it. + * + * FIXME: This doesn't seem very crash safe -- what if we fail after + * _bt_delete_and_insert() but before _bt_insertonpg()? We could + * crash and then lose some of the logical tuples that used to be + * contained within original posting list, but will now go into new + * righttup posting list. + */ + _bt_delete_and_insert(rel, buf, lefttup, newitemoff); + _bt_insertonpg(rel, itup_key, buf, InvalidBuffer, + stack, righttup, newitemoff + 1, false); + + pfree(ipd); + pfree(lefttup); + pfree(righttup); + } + else + { + ipd = palloc0(sizeof(ItemPointerData) * (nipd + 1)); + elog(DEBUG4, "inserting inside posting list due to apparent overlap"); + + /* copy item pointers from original tuple into ipd */ + memcpy(ipd, BTreeTupleGetPosting(origtup), + sizeof(ItemPointerData) * in_posting_offset); + /* add item pointer of the new tuple into ipd */ + memcpy(ipd + in_posting_offset, itup, sizeof(ItemPointerData)); + /* copy item pointers from old tuple into ipd */ + memcpy(ipd + in_posting_offset + 1, + BTreeTupleGetPostingN(origtup, in_posting_offset), + sizeof(ItemPointerData) * (nipd - in_posting_offset)); + + newitup = BTreeFormPostingTuple(itup, ipd, nipd + 1); + + _bt_delete_and_insert(rel, buf, newitup, newitemoff); + + pfree(ipd); + pfree(newitup); + _bt_relbuf(rel, buf); + } +} + /*---------- * _bt_insertonpg() -- Insert a tuple on a particular page in the index. * @@ -2290,3 +2533,186 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * the page. */ } + +/* + * Add new item (compressed or not) to the page, while compressing it. + * If insertion failed, return false. 
+ * Caller should consider this as compression failure and + * leave page uncompressed. + */ +static void +insert_itupprev_to_page(Page page, BTCompressState *compressState) +{ + IndexTuple to_insert; + OffsetNumber offnum = PageGetMaxOffsetNumber(page); + + if (compressState->ntuples == 0) + to_insert = compressState->itupprev; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(compressState->itupprev, + compressState->ipd, + compressState->ntuples); + to_insert = postingtuple; + pfree(compressState->ipd); + } + + /* Add the new item into the page */ + offnum = OffsetNumberNext(offnum); + + elog(DEBUG4, "insert_itupprev_to_page. compressState->ntuples %d IndexTupleSize %zu free %zu", + compressState->ntuples, IndexTupleSize(to_insert), PageGetFreeSpace(page)); + + if (PageAddItem(page, (Item) to_insert, IndexTupleSize(to_insert), + offnum, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add tuple to page while compresing it"); + + if (compressState->ntuples > 0) + pfree(to_insert); + compressState->ntuples = 0; +} + +/* + * Before splitting the page, try to compress items to free some space. + * If compression didn't succeed, buffer will contain old state of the page. + * This function should be called after lp_dead items + * were removed by _bt_vacuum_one_page(). + */ +static void +_bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + Page newpage; + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + bool use_compression = false; + BTCompressState *compressState = NULL; + int natts = IndexRelationGetNumberOfAttributes(rel); + + /* + * Don't use compression for indexes with INCLUDEd columns and unique + * indexes. + */ + use_compression = (IndexRelationGetNumberOfKeyAttributes(rel) == + IndexRelationGetNumberOfAttributes(rel) && + !rel->rd_index->indisunique); + if (!use_compression) + return; + + /* init compress state needed to build posting tuples */ + compressState = (BTCompressState *) palloc0(sizeof(BTCompressState)); + compressState->ipd = NULL; + compressState->ntuples = 0; + compressState->itupprev = NULL; + compressState->maxitemsize = BTMaxItemSize(page); + compressState->maxpostingsize = 0; + + /* + * Scan over all items to see which ones can be compressed + */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + newpage = PageGetTempPageCopySpecial(page); + elog(DEBUG4, "_bt_compress_one_page rel: %s,blkno: %u", + RelationGetRelationName(rel), BufferGetBlockNumber(buffer)); + + /* Copy High Key if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId itemid = PageGetItemId(page, P_HIKEY); + Size itemsz = ItemIdGetLength(itemid); + IndexTuple item = (IndexTuple) PageGetItem(page, itemid); + + if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add highkey during compression"); + } + + /* + * Iterate over tuples on the page, try to compress them into posting + * lists and insert into new page. + */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemId); + + /* + * We do not expect to meet any DEAD items, since this function is + * called right after _bt_vacuum_one_page(). 
If for some reason we
+		 * find a dead item, don't compress it; let an upcoming microvacuum
+		 * or vacuum clean it up.
+		 */
+		if (ItemIdIsDead(itemId))
+			continue;
+
+		if (compressState->itupprev != NULL)
+		{
+			int			n_equal_atts =
+			_bt_keep_natts_fast(rel, compressState->itupprev, itup);
+			int			itup_ntuples = BTreeTupleIsPosting(itup) ?
+			BTreeTupleGetNPosting(itup) : 1;
+
+			if (n_equal_atts > natts)
+			{
+				/*
+				 * When tuples are equal, create or update the posting list.
+				 *
+				 * If the posting list would be too big, insert it on the
+				 * page and continue.
+				 */
+				if (compressState->maxitemsize >
+					MAXALIGN(((IndexTupleSize(compressState->itupprev) +
+							   (compressState->ntuples + itup_ntuples + 1) * sizeof(ItemPointerData)))))
+				{
+					_bt_add_posting_item(compressState, itup);
+				}
+				else
+				{
+					insert_itupprev_to_page(newpage, compressState);
+				}
+			}
+			else
+			{
+				insert_itupprev_to_page(newpage, compressState);
+			}
+		}
+
+		/*
+		 * Copy the tuple into the temp variable itupprev to compare it with
+		 * the following tuple and maybe unite them into a posting tuple.
+		 */
+		if (compressState->itupprev)
+			pfree(compressState->itupprev);
+		compressState->itupprev = CopyIndexTuple(itup);
+
+		Assert(IndexTupleSize(compressState->itupprev) <= compressState->maxitemsize);
+	}
+
+	/* Handle the last item. */
+	insert_itupprev_to_page(newpage, compressState);
+
+	START_CRIT_SECTION();
+
+	PageRestoreTempPage(newpage, page);
+	MarkBufferDirty(buffer);
+
+	/* Log full page write */
+	if (RelationNeedsWAL(rel))
+	{
+		XLogRecPtr	recptr;
+
+		recptr = log_newpage_buffer(buffer, true);
+		PageSetLSN(page, recptr);
+	}
+
+	END_CRIT_SECTION();
+
+	elog(DEBUG4, "_bt_compress_one_page. success");
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 9c1f7de60f..86c662d4e6 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -983,14 +983,52 @@ _bt_page_recyclable(Page page)
 void
 _bt_delitems_vacuum(Relation rel, Buffer buf,
 					OffsetNumber *itemnos, int nitems,
+					OffsetNumber *remainingoffset,
+					IndexTuple *remaining, int nremaining,
 					BlockNumber lastBlockVacuumed)
 {
 	Page		page = BufferGetPage(buf);
 	BTPageOpaque opaque;
+	Size		itemsz;
+	Size		remaining_sz = 0;
+	char	   *remaining_buf = NULL;
+
+	/* XLOG stuff: buffer for the remaining tuples */
+	if (nremaining && RelationNeedsWAL(rel))
+	{
+		Size		offset = 0;
+
+		for (int i = 0; i < nremaining; i++)
+			remaining_sz += MAXALIGN(IndexTupleSize(remaining[i]));
+
+		remaining_buf = palloc0(remaining_sz);
+		for (int i = 0; i < nremaining; i++)
+		{
+			itemsz = IndexTupleSize(remaining[i]);
+			memcpy(remaining_buf + offset, (char *) remaining[i], itemsz);
+			offset += MAXALIGN(itemsz);
+		}
+		Assert(offset == remaining_sz);
+	}
 
 	/* No ereport(ERROR) until changes are logged */
 	START_CRIT_SECTION();
 
+	/* Handle posting tuples here */
+	for (int i = 0; i < nremaining; i++)
+	{
+		/* First, delete the old tuple. */
+		PageIndexTupleDelete(page, remainingoffset[i]);
+
+		itemsz = IndexTupleSize(remaining[i]);
+		itemsz = MAXALIGN(itemsz);
+
+		/* Add tuple with remaining ItemPointers to the page.
*/ + if (PageAddItem(page, (Item) remaining[i], itemsz, remainingoffset[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite compressed item in index while doing vacuum"); + } + /* Fix the page */ if (nitems > 0) PageIndexMultiDelete(page, itemnos, nitems); @@ -1020,6 +1058,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.nremaining = nremaining; + xlrec_vacuum.ndeleted = nitems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1033,6 +1073,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, if (nitems > 0) XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + /* + * Here we should save offnums and remaining tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle remaining tuples and only after that other deleted items. + */ + if (nremaining > 0) + { + Assert(remaining_buf != NULL); + XLogRegisterBufData(0, (char *) remainingoffset, + nremaining * sizeof(OffsetNumber)); + XLogRegisterBufData(0, remaining_buf, remaining_sz); + } + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); PageSetLSN(page, recptr); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4cfd5289ad..22fb228b81 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumPosting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -1069,7 +1071,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0, + vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } @@ -1193,6 +1196,9 @@ restart: OffsetNumber offnum, minoff, maxoff; + IndexTuple remaining[MaxOffsetNumber]; + OffsetNumber remainingoffset[MaxOffsetNumber]; + int nremaining; /* * Trade in the initial read lock for a super-exclusive write lock on @@ -1229,6 +1235,7 @@ restart: * callback function. */ ndeletable = 0; + nremaining = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1242,31 +1249,78 @@ restart: itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); - /* - * During Hot Standby we currently assume that - * XLOG_BTREE_VACUUM records do not produce conflicts. That is - * only true as long as the callback function depends only - * upon whether the index tuple refers to heap tuples removed - * in the initial heap scan. When vacuum starts it derives a - * value of OldestXmin. Backends taking later snapshots could - * have a RecentGlobalXmin with a later xid than the vacuum's - * OldestXmin, so it is possible that row versions deleted - * after OldestXmin could be marked as killed by other - * backends. The callback function *could* look at the index - * tuple state in isolation and decide to delete the index - * tuple, though currently it does not. If it ever did, we - * would need to reconsider whether XLOG_BTREE_VACUUM records - * should cause conflicts. 
If they did cause conflicts they - * would be fairly harsh conflicts, since we haven't yet - * worked out a way to pass a useful value for - * latestRemovedXid on the XLOG_BTREE_VACUUM records. This - * applies to *any* type of index that marks index tuples as - * killed. - */ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (BTreeTupleIsPosting(itup)) + { + int nnewipd = 0; + ItemPointer newipd = NULL; + + newipd = btreevacuumPosting(vstate, itup, &nnewipd); + + if (nnewipd == 0) + { + /* + * All TIDs from posting list must be deleted, we can + * delete whole tuple in a regular way. + */ + deletable[ndeletable++] = offnum; + } + else if (nnewipd == BTreeTupleGetNPosting(itup)) + { + /* + * All TIDs from posting tuple must remain. Do + * nothing, just cleanup. + */ + pfree(newipd); + } + else if (nnewipd < BTreeTupleGetNPosting(itup)) + { + /* Some TIDs from posting tuple must remain. */ + Assert(nnewipd > 0); + Assert(newipd != NULL); + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * to update it in place. + */ + remainingoffset[nremaining] = offnum; + remaining[nremaining] = BTreeFormPostingTuple(itup, newipd, nnewipd); + nremaining++; + pfree(newipd); + + Assert(IndexTupleSize(itup) <= BTMaxItemSize(page)); + } + } + else + { + htup = &(itup->t_tid); + + /* + * During Hot Standby we currently assume that + * XLOG_BTREE_VACUUM records do not produce conflicts. + * That is only true as long as the callback function + * depends only upon whether the index tuple refers to + * heap tuples removed in the initial heap scan. When + * vacuum starts it derives a value of OldestXmin. + * Backends taking later snapshots could have a + * RecentGlobalXmin with a later xid than the vacuum's + * OldestXmin, so it is possible that row versions deleted + * after OldestXmin could be marked as killed by other + * backends. The callback function *could* look at the + * index tuple state in isolation and decide to delete the + * index tuple, though currently it does not. If it ever + * did, we would need to reconsider whether + * XLOG_BTREE_VACUUM records should cause conflicts. If + * they did cause conflicts they would be fairly harsh + * conflicts, since we haven't yet worked out a way to + * pass a useful value for latestRemovedXid on the + * XLOG_BTREE_VACUUM records. This applies to *any* type + * of index that marks index tuples as killed. + */ + if (callback(htup, callback_state)) + deletable[ndeletable++] = offnum; + } } } @@ -1274,7 +1328,7 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nremaining > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1291,6 +1345,7 @@ restart: * that. */ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + remainingoffset, remaining, nremaining, vstate->lastBlockVacuumed); /* @@ -1375,6 +1430,41 @@ restart: } } +/* + * btreevacuumPosting() -- vacuums a posting tuple. + * + * Returns new palloc'd posting list with remaining items. + * Posting list size is returned via nremaining. + * + * If all items are dead, + * nremaining is 0 and resulting posting list is NULL. 
+ */ +static ItemPointer +btreevacuumPosting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int remaining = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + /* + * Check each tuple in the posting list, save alive tuples into tmpitems + */ + for (int i = 0; i < nitem; i++) + { + if (vstate->callback(items + i, vstate->callback_state)) + continue; + + if (tmpitems == NULL) + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + tmpitems[remaining++] = items[i]; + } + + *nremaining = remaining; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 19735bf733..20975970d6 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -30,6 +30,9 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer iptr, + IndexTuple itup, int i); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -504,7 +507,8 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, key, page, mid); + result = _bt_compare_posting(rel, key, page, mid, + &(insertstate->in_posting_offset)); if (result >= cmpval) low = mid + 1; @@ -533,6 +537,55 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/* + * Compare insertion-type scankey to tuple on a page, + * taking into account posting tuples. + * If the key of the posting tuple is equal to scankey, + * find exact position inside the posting list, + * using TID as extra attribute. + */ +int32 +_bt_compare_posting(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum, + int *in_posting_offset) +{ + IndexTuple itup; + int result; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + result = _bt_compare(rel, key, page, offnum); + + if (BTreeTupleIsPosting(itup) && result == 0) + { + int low, + high, + mid, + res; + + low = 0; + /* "high" is past end of posting list for loop invariant */ + high = BTreeTupleGetNPosting(itup); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res >= 1) + low = mid + 1; + else + high = mid; + } + + *in_posting_offset = high; + } + + return result; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -665,61 +718,120 @@ _bt_compare(Relation rel, * Use the heap TID attribute and scantid to try to break the tie. The * rules are the same as any other key attribute -- only the * representation differs. + * + * When itup is a posting tuple, the check becomes more complex. It is + * possible that the scankey belongs to the tuple's posting list TID + * range. + * + * _bt_compare() is multipurpose, so it just returns 0 for a fact that key + * matches tuple at this offset. 
+ * + * Use special _bt_compare_posting() wrapper function to handle this case + * and perform recheck for posting tuple, finding exact position of the + * scankey. */ - heapTid = BTreeTupleGetHeapTID(itup); - if (key->scantid == NULL) + if (!BTreeTupleIsPosting(itup)) { + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid == NULL) + { + /* + * Most searches have a scankey that is considered greater than a + * truncated pivot tuple if and when the scankey has equal values + * for attributes up to and including the least significant + * untruncated attribute in tuple. + * + * For example, if an index has the minimum two attributes (single + * user key attribute, plus heap TID attribute), and a page's high + * key is ('foo', -inf), and scankey is ('foo', ), the + * search will not descend to the page to the left. The search + * will descend right instead. The truncated attribute in pivot + * tuple means that all non-pivot tuples on the page to the left + * are strictly < 'foo', so it isn't necessary to descend left. In + * other words, search doesn't have to descend left because it + * isn't interested in a match that has a heap TID value of -inf. + * + * However, some searches (pivotsearch searches) actually require + * that we descend left when this happens. -inf is treated as a + * possible match for omitted scankey attribute(s). This is + * needed by page deletion, which must re-find leaf pages that are + * targets for deletion using their high keys. + * + * Note: the heap TID part of the test ensures that scankey is + * being compared to a pivot tuple with one or more truncated key + * attributes. + * + * Note: pg_upgrade'd !heapkeyspace indexes must always descend to + * the left here, since they have no heap TID attribute (and + * cannot have any -inf key values in any case, since truncation + * can only remove non-key attributes). !heapkeyspace searches + * must always be prepared to deal with matches on both sides of + * the pivot once the leaf level is reached. + */ + if (key->heapkeyspace && !key->pivotsearch && + key->keysz == ntupatts && heapTid == NULL) + return 1; + + /* All provided scankey arguments found to be equal */ + return 0; + } + /* - * Most searches have a scankey that is considered greater than a - * truncated pivot tuple if and when the scankey has equal values for - * attributes up to and including the least significant untruncated - * attribute in tuple. - * - * For example, if an index has the minimum two attributes (single - * user key attribute, plus heap TID attribute), and a page's high key - * is ('foo', -inf), and scankey is ('foo', ), the search - * will not descend to the page to the left. The search will descend - * right instead. The truncated attribute in pivot tuple means that - * all non-pivot tuples on the page to the left are strictly < 'foo', - * so it isn't necessary to descend left. In other words, search - * doesn't have to descend left because it isn't interested in a match - * that has a heap TID value of -inf. - * - * However, some searches (pivotsearch searches) actually require that - * we descend left when this happens. -inf is treated as a possible - * match for omitted scankey attribute(s). This is needed by page - * deletion, which must re-find leaf pages that are targets for - * deletion using their high keys. - * - * Note: the heap TID part of the test ensures that scankey is being - * compared to a pivot tuple with one or more truncated key - * attributes. 
- * - * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the - * left here, since they have no heap TID attribute (and cannot have - * any -inf key values in any case, since truncation can only remove - * non-key attributes). !heapkeyspace searches must always be - * prepared to deal with matches on both sides of the pivot once the - * leaf level is reached. + * Treat truncated heap TID as minus infinity, since scankey has a key + * attribute value (scantid) that would otherwise be compared directly */ - if (key->heapkeyspace && !key->pivotsearch && - key->keysz == ntupatts && heapTid == NULL) + Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); + if (heapTid == NULL) return 1; - /* All provided scankey arguments found to be equal */ - return 0; + Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); + return ItemPointerCompare(key->scantid, heapTid); + } + else + { + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid != NULL && heapTid != NULL) + { + int cmp = ItemPointerCompare(key->scantid, heapTid); + + if (cmp == -1 || cmp == 0) + { + elog(DEBUG4, "offnum %d Scankey (%u,%u) is less than or equal to posting tuple (%u,%u)", + offnum, ItemPointerGetBlockNumberNoCheck(key->scantid), + ItemPointerGetOffsetNumberNoCheck(key->scantid), + ItemPointerGetBlockNumberNoCheck(heapTid), + ItemPointerGetOffsetNumberNoCheck(heapTid)); + return cmp; + } + + heapTid = BTreeTupleGetMaxTID(itup); + cmp = ItemPointerCompare(key->scantid, heapTid); + if (cmp == 1) + { + elog(DEBUG4, "offnum %d Scankey (%u,%u) is greater than posting tuple (%u,%u)", + offnum, ItemPointerGetBlockNumberNoCheck(key->scantid), + ItemPointerGetOffsetNumberNoCheck(key->scantid), + ItemPointerGetBlockNumberNoCheck(heapTid), + ItemPointerGetOffsetNumberNoCheck(heapTid)); + return cmp; + } + + /* + * if we got here, scantid is inbetween of posting items of the + * tuple + */ + elog(DEBUG4, "offnum %d Scankey (%u,%u) is between posting items (%u,%u) and (%u,%u)", + offnum, ItemPointerGetBlockNumberNoCheck(key->scantid), + ItemPointerGetOffsetNumberNoCheck(key->scantid), + ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(itup)), + ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(itup)), + ItemPointerGetBlockNumberNoCheck(heapTid), + ItemPointerGetOffsetNumberNoCheck(heapTid)); + return 0; + } } - /* - * Treat truncated heap TID as minus infinity, since scankey has a key - * attribute value (scantid) that would otherwise be compared directly - */ - Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); - if (heapTid == NULL) - return 1; - - Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + return 0; } /* @@ -1456,6 +1568,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.prevTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1490,8 +1603,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + if (!BTreeTupleIsPosting(itup)) + { + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* Return posting list "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, 
itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + itup, i); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1524,7 +1651,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxPostingIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1532,7 +1659,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxPostingIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1574,8 +1701,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + if (!BTreeTupleIsPosting(itup)) + { + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + /* Return posting list "logical" tuples */ + /* XXX: Maybe this loop should be backwards? */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + itup, i); + } + } } if (!continuescan) { @@ -1589,8 +1731,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1603,6 +1745,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1615,6 +1759,33 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* Save an index item into so->currPos.items[itemIndex] for posting tuples. */ +static void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer iptr, IndexTuple itup, int i) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *iptr; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + if (i == 0) + { + /* save key. 
the same for all tuples in the posting */ + Size itupsz = BTreeTupleGetPostingOffset(itup); + + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + so->currPos.prevTupleOffset = currItem->tupleOffset; + } + else + currItem->tupleOffset = so->currPos.prevTupleOffset; + } +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index b30cf9e989..b058599aa4 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -288,6 +288,8 @@ static void _bt_sortaddtup(Page page, Size itemsize, static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); +static void _bt_buildadd_posting(BTWriteState *wstate, BTPageState *state, + BTCompressState *compressState); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, @@ -972,6 +974,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * only shift the line pointer array back and forth, and overwrite * the tuple space previously occupied by oitup. This is fairly * cheap. + * + * If lastleft tuple was a posting tuple, we'll truncate its + * posting list in _bt_truncate as well. Note that it is also + * applicable only to leaf pages, since internal pages never + * contain posting tuples. */ ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); lastleft = (IndexTuple) PageGetItem(opage, ii); @@ -1011,6 +1018,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the minimum key for the new page. */ state->btps_minkey = CopyIndexTuple(oitup); + Assert(BTreeTupleIsPivot(state->btps_minkey)); /* * Set the sibling links for both pages. @@ -1052,6 +1060,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); /* _bt_sortaddtup() will perform full truncation later */ + BTreeTupleClearBtIsPosting(state->btps_minkey); BTreeTupleSetNAtts(state->btps_minkey, 0); } @@ -1136,6 +1145,91 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } +/* + * Add new tuple (posting or non-posting) to the page while building index. + */ +static void +_bt_buildadd_posting(BTWriteState *wstate, BTPageState *state, + BTCompressState *compressState) +{ + IndexTuple to_insert; + + /* Return, if there is no tuple to insert */ + if (state == NULL) + return; + + if (compressState->ntuples == 0) + to_insert = compressState->itupprev; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(compressState->itupprev, + compressState->ipd, + compressState->ntuples); + to_insert = postingtuple; + pfree(compressState->ipd); + } + + _bt_buildadd(wstate, state, to_insert); + + if (compressState->ntuples > 0) + pfree(to_insert); + compressState->ntuples = 0; +} + +/* + * Save item pointer(s) of itup to the posting list in compressState. + * + * Helper function for _bt_load() and _bt_compress_one_page(). + * + * Note: caller is responsible for size check to ensure that resulting tuple + * won't exceed BTMaxItemSize. 
+ */ +void +_bt_add_posting_item(BTCompressState *compressState, IndexTuple itup) +{ + int nposting = 0; + + if (compressState->ntuples == 0) + { + compressState->ipd = palloc0(compressState->maxitemsize); + + if (BTreeTupleIsPosting(compressState->itupprev)) + { + /* if itupprev is posting, add all its TIDs to the posting list */ + nposting = BTreeTupleGetNPosting(compressState->itupprev); + memcpy(compressState->ipd, + BTreeTupleGetPosting(compressState->itupprev), + sizeof(ItemPointerData) * nposting); + compressState->ntuples += nposting; + } + else + { + memcpy(compressState->ipd, compressState->itupprev, + sizeof(ItemPointerData)); + compressState->ntuples++; + } + } + + if (BTreeTupleIsPosting(itup)) + { + /* if tuple is posting, add all its TIDs to the posting list */ + nposting = BTreeTupleGetNPosting(itup); + memcpy(compressState->ipd + compressState->ntuples, + BTreeTupleGetPosting(itup), + sizeof(ItemPointerData) * nposting); + compressState->ntuples += nposting; + } + else + { + memcpy(compressState->ipd + compressState->ntuples, itup, + sizeof(ItemPointerData)); + compressState->ntuples++; + } +} + /* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. @@ -1150,9 +1244,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, - keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); + keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index), + natts = IndexRelationGetNumberOfAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool use_compression = false; + BTCompressState *compressState = NULL; + + /* + * Don't use compression for indexes with INCLUDEd columns and unique + * indexes. 
+ */ + use_compression = (IndexRelationGetNumberOfKeyAttributes(wstate->index) == + IndexRelationGetNumberOfAttributes(wstate->index) && + !wstate->index->rd_index->indisunique); if (merge) { @@ -1266,19 +1371,89 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } else { - /* merge is unnecessary */ - while ((itup = tuplesort_getindextuple(btspool->sortstate, - true)) != NULL) + if (!use_compression) { - /* When we see first tuple, create first index page */ - if (state == NULL) - state = _bt_pagestate(wstate, 0); + /* merge is unnecessary */ + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup); - /* Report progress */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, - ++tuples_done); + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + } + else + { + /* init compress state needed to build posting tuples */ + compressState = (BTCompressState *) palloc0(sizeof(BTCompressState)); + compressState->ipd = NULL; + compressState->ntuples = 0; + compressState->itupprev = NULL; + compressState->maxitemsize = 0; + compressState->maxpostingsize = 0; + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + compressState->maxitemsize = BTMaxItemSize(state->btps_page); + } + + if (compressState->itupprev != NULL) + { + int n_equal_atts = _bt_keep_natts_fast(wstate->index, + compressState->itupprev, itup); + + if (n_equal_atts > natts) + { + /* + * Tuples are equal. Create or update posting. + * + * Else If posting is too big, insert it on page and + * continue. + */ + if ((compressState->ntuples + 1) * sizeof(ItemPointerData) < + compressState->maxpostingsize) + _bt_add_posting_item(compressState, itup); + else + _bt_buildadd_posting(wstate, state, + compressState); + } + else + { + /* + * Tuples are not equal. Insert itupprev into index. + * Save current tuple for the next iteration. + */ + _bt_buildadd_posting(wstate, state, compressState); + } + } + + /* + * Save the tuple to compare it with the next one and maybe + * unite them into a posting tuple. + */ + if (compressState->itupprev) + pfree(compressState->itupprev); + compressState->itupprev = CopyIndexTuple(itup); + + /* compute max size of posting list */ + compressState->maxpostingsize = compressState->maxitemsize - + IndexInfoFindDataOffset(compressState->itupprev->t_info) - + MAXALIGN(IndexTupleSize(compressState->itupprev)); + } + + /* Handle the last item */ + _bt_buildadd_posting(wstate, state, compressState); } } diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index a7882fd874..77e1d46672 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -459,6 +459,7 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsubhikey = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? 
*/ @@ -466,10 +467,33 @@ _bt_recsplitloc(FindSplitData *state, && !newitemonleft); if (newitemisfirstonright) + { firstrightitemsz = state->newitemsz; + + /* Calculate posting list overhead, if any */ + if (state->is_leaf && BTreeTupleIsPosting(state->newitem)) + postingsubhikey = IndexTupleSize(state->newitem) - + BTreeTupleGetPostingOffset(state->newitem); + } else + { firstrightitemsz = firstoldonrightsz; + /* Calculate posting list overhead, if any */ + if (state->is_leaf) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsubhikey = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +516,13 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. + * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead + * (while still conservatively assuming that truncation might have to add + * back a single heap TID using the pivot tuple heap TID representation). */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsubhikey) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 93fab264ae..75ba61c0c9 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -111,8 +111,12 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key->nextkey = false; key->pivotsearch = false; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; + + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + else + key->scantid = NULL; + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1787,7 +1791,9 @@ _bt_killitems(IndexScanDesc scan) ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + /* No microvacuum for posting tuples */ + if (!BTreeTupleIsPosting(ituple) && + (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))) { /* found the item */ ItemIdMarkDead(iid); @@ -2145,6 +2151,16 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= BTreeTupleGetPostingOffset(firstright); + } + + Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2161,6 +2177,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. 
*/ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft)); + Assert(!BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2168,6 +2186,27 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. But + * the tuple is a compressed tuple with a posting list, so we still + * must truncate it. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = BTreeTupleGetPostingOffset(firstright) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + + Assert(!BTreeTupleIsPosting(pivot)); + } else { /* @@ -2205,7 +2244,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2216,9 +2255,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2231,7 +2273,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. Note that @@ -2240,7 +2282,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2330,6 +2373,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * leaving excessive amounts of free space on either side of page split. * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts. + * + * To build a posting tuple we need to ensure that all attributes + * of both tuples are equal. Use this function to compare them. + * TODO: maybe it's worth to rename the function. + * + * XXX: Obviously we need infrastructure for making sure it is okay to use + * this for posting list stuff. 
For example, non-deterministic collations + * cannot use compression, and will not work with what we have now. + * + * XXX: Even then, we probably also need to worry about TOAST as a special + * case. Don't repeat bugs like the amcheck bug that was fixed in commit + * eba775345d23d2c999bbb412ae658b6dab36e3e8. As the test case added in that + * commit shows, we need to worry about pg_attribute.attstorage changing in + * the underlying table due to an ALTER TABLE (and maybe a few other things + * like that). In general, the "TOAST input state" of a TOASTable datum isn't + * something that we make many guarantees about today, so even with C + * collation text we could in theory get different answers from + * _bt_keep_natts_fast() and _bt_keep_natts(). This needs to be nailed down + * in some way. */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2415,7 +2477,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * Non-pivot tuples currently never use alternative heap TID * representation -- even those within heapkeyspace indexes */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* @@ -2470,7 +2532,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * that to decide if the tuple is a pre-v11 tuple. */ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && + (!BTreeTupleIsPivot(itup) && ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); } else @@ -2497,7 +2559,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. */ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) return false; /* @@ -2549,6 +2611,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page)) return; + /* TODO correct error messages for posting tuples */ + /* * Internal page insertions cannot fail here, because that would mean that * an earlier leaf level insertion that should have failed didn't @@ -2575,3 +2639,79 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, "or use full text indexing."), errtableconstraint(heap, RelationGetRelationName(rel)))); } + +/* + * Given a basic tuple that contains key datum and posting list, + * build a posting tuple. + * + * Basic tuple can be a posting tuple, but we only use key part of it, + * all ItemPointers must be passed via ipd. + * + * If nipd == 1 fallback to building a non-posting tuple. + * It is necessary to avoid storage overhead after posting tuple was vacuumed. 
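The _bt_keep_natts_fast() comments above say that function is what decides whether two tuples may share one posting list; a hedged sketch of that test follows (the wrapper name is invented for illustration):

#include "postgres.h"
#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Illustrative sketch only: two leaf tuples can be merged into a posting
 * tuple when every key attribute is equal, i.e. when _bt_keep_natts_fast()
 * reports that all nkeyatts attributes could be kept.
 */
static bool
keys_allow_posting_merge(Relation rel, IndexTuple left, IndexTuple right)
{
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

	return _bt_keep_natts_fast(rel, left, right) > nkeyatts;
}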
+ */ +IndexTuple +BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, int nipd) +{ + uint32 keysize, + newsize = 0; + IndexTuple itup; + + /* We only need key part of the tuple */ + if (BTreeTupleIsPosting(tuple)) + keysize = BTreeTupleGetPostingOffset(tuple); + else + keysize = IndexTupleSize(tuple); + + Assert(nipd > 0); + + /* Add space needed for posting list */ + if (nipd > 1) + newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nipd; + else + newsize = keysize; + + newsize = MAXALIGN(newsize); + itup = palloc0(newsize); + memcpy(itup, tuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nipd > 1) + { + /* Form posting tuple, fill posting fields */ + + /* Set meta info about the posting list */ + itup->t_info |= INDEX_ALT_TID_MASK; + BTreeSetPostingMeta(itup, nipd, SHORTALIGN(keysize)); + + /* sort the list to preserve TID order invariant */ + qsort((void *) ipd, nipd, sizeof(ItemPointerData), + (int (*) (const void *, const void *)) ItemPointerCompare); + + /* Copy posting list into the posting tuple */ + memcpy(BTreeTupleGetPosting(itup), ipd, + sizeof(ItemPointerData) * nipd); + } + else + { + /* To finish building of a non-posting tuple, copy TID from ipd */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(ipd, &itup->t_tid); + } + + return itup; +} + +/* + * Opposite of BTreeFormPostingTuple. + * returns regular tuple that contains the key, + * the tid of the new tuple is the nth tid of original tuple's posting list + * result tuple palloc'd in a caller's context. + */ +IndexTuple +BTreeGetNthTupleOfPosting(IndexTuple tuple, int n) +{ + Assert(BTreeTupleIsPosting(tuple)); + return BTreeFormPostingTuple(tuple, BTreeTupleGetPostingN(tuple, n), 1); +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dd5315c1aa..538a6bc8a7 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -386,8 +386,8 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); +#ifdef UNUSED /* * This section of code is thought to be no longer needed, after analysis @@ -478,14 +478,34 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + if (xlrec->nremaining) + { + OffsetNumber *remainingoffset; + IndexTuple remaining; + Size itemsz; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + remainingoffset = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + remaining = (IndexTuple) ((char *) remainingoffset + + xlrec->nremaining * sizeof(OffsetNumber)); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* Handle posting tuples */ + for (int i = 0; i < xlrec->nremaining; i++) + { + PageIndexTupleDelete(page, remainingoffset[i]); + + itemsz = MAXALIGN(IndexTupleSize(remaining)); + + if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i], + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_vacuum: failed to add remaining item"); + + remaining = (IndexTuple) ((char *) remaining + itemsz); + } + } + + if (xlrec->ndeleted) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index a14eb792ec..e4fa99ad27 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ 
b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -46,8 +46,10 @@ btree_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
 
-				appendStringInfo(buf, "lastBlockVacuumed %u",
-								 xlrec->lastBlockVacuumed);
+				appendStringInfo(buf, "lastBlockVacuumed %u; nremaining %u; ndeleted %u",
+								 xlrec->lastBlockVacuumed,
+								 xlrec->nremaining,
+								 xlrec->ndeleted);
 				break;
 			}
 		case XLOG_BTREE_DELETE:
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 744ffb6c61..b10c0d5255 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -141,6 +141,10 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
  * On such a page, N tuples could take one MAXALIGN quantum less space than
  * estimated here, seemingly allowing one more tuple than estimated here.
  * But such a page always has at least MAXALIGN special space, so we're safe.
+ *
+ * Note: btree leaf pages may contain posting tuples, which store duplicates
+ * more efficiently, so such pages may contain more tuples than estimated
+ * here.  Use MaxPostingIndexTuplesPerPage instead.
  */
 #define MaxIndexTuplesPerPage	\
 	((int) ((BLCKSZ - SizeOfPageHeaderData) / \
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 83e0e6c28e..bacc77b258 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -234,8 +234,7 @@ typedef struct BTMetaPageData
  *  t_tid | t_info | key values | INCLUDE columns, if any
  *
  * t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
  *
  * All other types of index tuples ("pivot" tuples) only have key columns,
  * since pivot tuples only exist to represent how the key space is
@@ -252,6 +251,39 @@ typedef struct BTMetaPageData
  * omitted rather than truncated, since its representation is different to
  * the non-pivot representation.)
  *
+ * Non-pivot posting tuple format:
+ *  t_tid | t_info | key values | INCLUDE columns, if any | posting_list[]
+ *
+ * In order to store duplicate keys more efficiently,
+ * we use a special tuple format: posting tuples.
+ * posting_list is an array of ItemPointerData.
+ *
+ * This type of compression never applies to system indexes, unique indexes,
+ * or indexes with INCLUDEd columns.
+ *
+ * To distinguish posting tuples, we use the INDEX_ALT_TID_MASK flag in t_info
+ * and the BT_IS_POSTING flag in t_tid.
+ * These flags redefine the content of the posting tuple's tid:
+ * - t_tid.ip_blkid contains the offset of the posting list.
+ * - the t_tid offset field contains the number of posting items this tuple contains.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items in posting tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.
+ * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting
+ * items, which is constrained by BTMaxItemSize.
+ *
+ * If a page contains so many duplicates that they do not fit into one posting
+ * tuple (bounded by BTMaxItemSize), the page may contain several posting
+ * tuples with the same key.
+ * A page can also contain both posting and non-posting tuples with the same key.
+ * Currently, posting tuples always contain at least two TIDs in the posting
+ * list.
+ *
+ * Posting tuples always have the same number of attributes as the index has
+ * generally.
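As a rough usage sketch of the posting tuple format just described, assuming the BTreeFormPostingTuple() helper added in nbtutils.c (the caller shown here is invented for illustration):

#include "postgres.h"
#include "access/nbtree.h"

/*
 * Illustrative sketch only: merge the heap TIDs of a group of equal-keyed
 * leaf tuples into one posting tuple.  The key part of "base" is reused;
 * its original t_tid is replaced by posting metadata, and the (sorted)
 * posting list is appended after the key.
 */
static IndexTuple
form_example_posting_tuple(IndexTuple base, ItemPointerData *htids, int nhtids)
{
	/* BTreeFormPostingTuple() sorts htids[] and sets BT_IS_POSTING et al. */
	return BTreeFormPostingTuple(base, htids, nhtids);
}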
+ * * Pivot tuple format: * * t_tid | t_info | key values | [heap TID] @@ -281,23 +313,144 @@ typedef struct BTMetaPageData * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. + * BT_IS_POSTING bit must be unset for pivot tuples, since we use it + * to distinct posting tuples from pivot tuples. * * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes). They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. + * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF +#define BT_N_POSTING_OFFSET_MASK 0x0FFF #define BT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 -/* Get/set downlink block number */ +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) + +/* + * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples + * that can fit on one btree leaf page. + * + * Btree leaf pages may contain posting tuples, which store duplicates + * in a more effective way, so MaxPostingIndexTuplesPerPage is larger then + * MaxIndexTuplesPerPage. + * + * Each leaf page must contain at least three items, so estimate it as + * if we have three posting tuples with minimal size keys. + */ +#define MaxPostingIndexTuplesPerPage \ + ((int) ((BLCKSZ - SizeOfPageHeaderData - \ + 3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \ + (sizeof(ItemPointerData))) + +/* + * Btree-private state needed to build posting tuples. + * ipd is a posting list - an array of ItemPointerData. + * + * Iterating over tuples during index build or applying compression to a + * single page, we remember a tuple in itupprev, then compare the next one + * with it. If tuples are equal, save their TIDs in the posting list. + * ntuples contains the size of the posting list. + * + * Use maxitemsize and maxpostingsize to ensure that resulting posting tuple + * will satisfy BTMaxItemSize. 
+ */ +typedef struct BTCompressState +{ + Size maxitemsize; + Size maxpostingsize; + IndexTuple itupprev; + int ntuples; + ItemPointerData *ipd; +} BTCompressState; + +/* macros to work with posting tuples *BEGIN* */ +#define BTreeTupleSetBtIsPosting(itup) \ + do { \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) + +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + BTreeTupleSetBtIsPosting(itup); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list. + * Caller is responsible for checking BTreeTupleIsPosting to ensure that it + * will get what is expected. + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeTupleSetPostingOffset(itup, offset) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (offset)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + BTreeTupleSetPostingOffset(itup, off); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointerData*) ((char*)(itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (ItemPointerData*) (BTreeTupleGetPosting(itup) + (n)) + +/* + * Posting tuples always contain more than one TID. The minimum TID can be + * accessed using BTreeTupleGetHeapTID(). The maximum is accessed using + * BTreeTupleGetMaxTID(). + */ +#define BTreeTupleGetMaxTID(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING))) ? \ + ( \ + (ItemPointer) (BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup)-1)) \ + ) \ + : \ + (ItemPointer) &((itup)->t_tid) \ + ) +/* macros to work with posting tuples *END* */ + +/* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) #define BTreeInnerTupleSetDownLink(itup, blkno) \ @@ -326,7 +479,8 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ @@ -335,6 +489,7 @@ typedef struct BTMetaPageData ) #define BTreeTupleSetNAtts(itup, n) \ do { \ + Assert(!BTreeTupleIsPosting(itup)); \ (itup)->t_info |= INDEX_ALT_TID_MASK; \ ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ } while(0) @@ -342,6 +497,8 @@ typedef struct BTMetaPageData /* * Get tiebreaker heap TID attribute, if any. Macro works with both pivot * and non-pivot tuples, despite differences in how heap TID is represented. 
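A hedged sketch of how the macros above might be used to walk every heap TID represented by a leaf tuple (the visitor callback is invented for illustration):

#include "postgres.h"
#include "access/nbtree.h"

/*
 * Illustrative sketch only: visit each heap TID of a leaf tuple, whether
 * it is an ordinary tuple (single TID in t_tid) or a posting tuple (TIDs
 * stored in ascending order in the posting list).
 */
static void
visit_leaf_tuple_tids(IndexTuple itup,
					  void (*visit) (ItemPointer htid, void *arg),
					  void *arg)
{
	if (BTreeTupleIsPosting(itup))
	{
		for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
			visit(BTreeTupleGetPostingN(itup, i), arg);
	}
	else
		visit(&itup->t_tid, arg);
}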
+ * + * For non-pivot posting tuples this returns the first tid from posting list. */ #define BTreeTupleGetHeapTID(itup) \ ( \ @@ -351,7 +508,10 @@ typedef struct BTMetaPageData (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ sizeof(ItemPointerData)) \ ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ + : (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0) ? \ + (ItemPointer) BTreeTupleGetPosting(itup) : NULL) \ + : (ItemPointer) &((itup)->t_tid) \ ) /* * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK @@ -360,6 +520,7 @@ typedef struct BTMetaPageData #define BTreeTupleSetAltHeapTID(itup) \ do { \ Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -500,6 +661,12 @@ typedef struct BTInsertStateData /* Buffer containing leaf page we're likely to insert itup on */ Buffer buf; + /* + * if _bt_binsrch_insert() found the location inside existing posting + * list, save the position inside the list. + */ + int in_posting_offset; + /* * Cache of bounds within the current buffer. Only used for insertions * where _bt_check_unique is called. See _bt_binsrch_insert and @@ -567,6 +734,8 @@ typedef struct BTScanPosData * location in the associated tuple storage workspace. */ int nextTupleOffset; + /* prevTupleOffset is for posting list handling */ + int prevTupleOffset; /* * The items array is always ordered in index order (ie, increasing @@ -579,7 +748,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -763,6 +932,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *remainingoffset, + IndexTuple *remaining, int nremaining, BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf); @@ -775,6 +946,8 @@ extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot); extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); +extern int32 _bt_compare_posting(Relation rel, BTScanInsert key, Page page, + OffsetNumber offnum, int *in_posting_offset); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, @@ -813,6 +986,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, + int nipd); +extern IndexTuple BTreeGetNthTupleOfPosting(IndexTuple tuple, int n); /* * prototypes for functions in nbtvalidate.c @@ -825,5 +1001,7 @@ extern bool btvalidate(Oid 
opclassoid);
 extern IndexBuildResult *btbuild(Relation heap, Relation index,
 								 struct IndexInfo *indexInfo);
 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+extern void _bt_add_posting_item(BTCompressState *compressState,
+								 IndexTuple itup);
 
 #endif							/* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index afa614da25..4b615e0d36 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -173,10 +173,19 @@ typedef struct xl_btree_vacuum
 {
 	BlockNumber lastBlockVacuumed;
 
-	/* TARGET OFFSET NUMBERS FOLLOW */
+	/*
+	 * These counts let us find the beginning of the remaining (updated
+	 * posting) tuples, which follow the arrays of offset numbers.
+	 */
+	uint32		nremaining;
+	uint32		ndeleted;
+
+	/* REMAINING OFFSET NUMBERS FOLLOW (nremaining values) */
+	/* REMAINING TUPLES TO INSERT FOLLOW (if nremaining > 0) */
+	/* TARGET OFFSET NUMBERS FOLLOW (if any) */
 } xl_btree_vacuum;
 
-#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber))
 
 /*
  * This is what we need to know about marking an empty branch for deletion.
-- 
2.17.1
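To make the record layout above easier to follow, here is a hedged sketch of how the variable-length parts are located during replay, mirroring the pointer arithmetic in btree_xlog_vacuum(); the function name is invented for illustration:

#include "postgres.h"
#include "access/itup.h"
#include "access/nbtxlog.h"

/*
 * Illustrative sketch only: "ptr" is the registered block data of an
 * XLOG_BTREE_VACUUM record.  It holds the offsets of tuples to delete,
 * then the offsets of posting tuples being replaced, then the updated
 * ("remaining") tuples themselves, each MAXALIGN'd.
 */
static void
locate_vacuum_payload(xl_btree_vacuum *xlrec, char *ptr,
					  OffsetNumber **deleted,
					  OffsetNumber **remainingoffset,
					  IndexTuple *remaining)
{
	*deleted = (OffsetNumber *) ptr;
	*remainingoffset = (OffsetNumber *)
		(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
	*remaining = (IndexTuple)
		((char *) *remainingoffset + xlrec->nremaining * sizeof(OffsetNumber));
}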