diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index f2eda67..b356e2b 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -142,6 +142,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index 04abd0f..ff50361 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -88,7 +88,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, while (itup < itupEnd) { /* Do we have to delete this tuple? */ - if (callback(&itup->heapPtr, callback_state)) + if (callback(&itup->heapPtr, false, callback_state) == IBDCR_DELETE) { /* Yes; adjust count of tuples that will be left on page */ BloomPageGetOpaque(page)->maxoff--; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index b22563b..b4a1465 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -116,6 +116,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index c9ccfee..8ed71c5 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -56,7 +56,8 @@ ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, */ for (i = 0; i < nitem; i++) { - if (gvs->callback(items + i, gvs->callback_state)) + if (gvs->callback(items + i, false, gvs->callback_state) == + IBDCR_DELETE) { gvs->result->tuples_removed += 1; if (!tmpitems) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 6593771..843389b 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -94,6 +94,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 77d9d12..0955db6 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -202,7 +202,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); - if (callback(&(idxtuple->t_tid), callback_state)) + if (callback(&(idxtuple->t_tid), false, callback_state) == + IBDCR_DELETE) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index cfcec34..2274237 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -75,6 +75,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = hashbuildempty; amroutine->aminsert = hashinsert; + amroutine->amwarminsert = hashwarminsert; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; @@ -92,6 +93,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = hashrecheck; PG_RETURN_POINTER(amroutine); } @@ -233,11 +235,11 @@ 
hashbuildCallback(Relation index, * Hash on the heap tuple's key, form an index tuple with hash code. * Find the appropriate location for the new tuple, and put it there. */ -bool -hashinsert(Relation rel, Datum *values, bool *isnull, +static bool +hashinsert_internal(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, bool warm_update) { Datum index_values[1]; bool index_isnull[1]; @@ -253,6 +255,11 @@ hashinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); itup->t_tid = *ht_ctid; + if (warm_update) + ItemPointerSetFlags(&itup->t_tid, HASH_INDEX_WARM_POINTER); + else + ItemPointerClearFlags(&itup->t_tid); + _hash_doinsert(rel, itup, heapRel); pfree(itup); @@ -260,6 +267,26 @@ hashinsert(Relation rel, Datum *values, bool *isnull, return false; } +bool +hashinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return hashinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, false); +} + +bool +hashwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return hashinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, true); + +} /* * hashgettuple() -- Get the next tuple in the scan. @@ -274,6 +301,8 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; ItemPointer current; bool res; + IndexTuple itup; + /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; @@ -316,8 +345,6 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { - IndexTuple itup; - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; @@ -789,6 +816,8 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; + OffsetNumber clearwarm[MaxOffsetNumber]; + int nclearwarm = 0; bool retain_pin = false; vacuum_delay_point(); @@ -806,20 +835,35 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, IndexTuple itup; Bucket bucket; bool kill_tuple = false; + bool clear_tuple = false; + int flags; + bool is_warm; + IndexBulkDeleteCallbackResult result; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); + flags = ItemPointerGetFlags(&itup->t_tid); + is_warm = ((flags & HASH_INDEX_WARM_POINTER) != 0); + /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. 
*/ - if (callback && callback(htup, callback_state)) + if (callback) { - kill_tuple = true; - if (tuples_removed) - *tuples_removed += 1; + result = callback(htup, is_warm, callback_state); + if (result == IBDCR_DELETE) + { + kill_tuple = true; + if (tuples_removed) + *tuples_removed += 1; + } + else if (result == IBDCR_CLEAR_WARM) + { + clear_tuple = true; + } } else if (split_cleanup) { @@ -842,6 +886,12 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, } } + if (clear_tuple) + { + /* clear the WARM pointer */ + clearwarm[nclearwarm++] = offno; + } + if (kill_tuple) { /* mark the item for deletion */ @@ -866,12 +916,27 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, /* * Apply deletions, advance to next page and write page if needed. */ - if (ndeletable > 0) + if (ndeletable > 0 || nclearwarm > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Clear the WARM pointers. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the + * array and hence offnums recorded earlier won't make any sense + * after PageIndexMultiDelete is called. + */ + if (nclearwarm > 0) + _hash_clear_items(page, clearwarm, nclearwarm); + + /* + * And delete the deletable items + */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* @@ -892,6 +957,7 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, XLogRecPtr recptr; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; + xlrec.nclearitems = nclearwarm; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); @@ -904,6 +970,8 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) clearwarm, + nclearwarm * sizeof(OffsetNumber)); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index 8647e8c..fe89ee1 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -840,6 +840,7 @@ hash_xlog_delete(XLogReaderState *record) /* replay the record for deleting entries in bucket page */ if (action == BLK_NEEDS_REDO) { + uint16 nclearwarm = xldata->nclearitems; char *ptr; Size len; @@ -849,12 +850,17 @@ hash_xlog_delete(XLogReaderState *record) if (len > 0) { + OffsetNumber *clearwarm; OffsetNumber *unused; OffsetNumber *unend; - unused = (OffsetNumber *) ptr; + clearwarm = (OffsetNumber *) ptr; + unused = clearwarm + nclearwarm; unend = (OffsetNumber *) ((char *) ptr + len); + if (nclearwarm) + _hash_clear_items(page, clearwarm, nclearwarm); + if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 622cc4b..e689f90 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1576,3 +1576,17 @@ _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, return buf; } + +void _hash_clear_items(Page page, OffsetNumber *clearitemnos, + uint16 nclearitems) +{ + int i; + IndexTuple itup; + + for (i = 0; i < nclearitems; i++) + { + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, clearitemnos[i])); + 
ItemPointerClearFlags(&itup->t_tid);
+	}
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2d92049..330ccc5 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -59,6 +59,8 @@ _hash_next(IndexScanDesc scan, ScanDirection dir)
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 	so->hashso_heappos = itup->t_tid;
+	if (scan->xs_want_itup)
+		scan->xs_itup = itup;
 	return true;
 }
@@ -367,6 +369,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 	so->hashso_heappos = itup->t_tid;
+	if (scan->xs_want_itup)
+		scan->xs_itup = itup;
+
 	return true;
 }
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 2e99719..48464b8 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -17,9 +17,11 @@
 #include "access/hash.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
+#include "catalog/index.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "storage/buf_internals.h"
+#include "utils/datum.h"
 #define CALC_NEW_BUCKET(old_bucket, lowmask) \ old_bucket | (lowmask + 1)
@@ -514,3 +516,70 @@ _hash_kill_items(IndexScanDesc scan) MarkBufferDirtyHint(so->hashso_curbuf, true); } }
+
+/*
+ * Recheck if the heap tuple satisfies the key stored in the index tuple
+ */
+bool
+hashrecheck(Relation indexRel, IndexInfo *indexInfo, IndexTuple indexTuple,
+			Relation heapRel, HeapTuple heapTuple)
+{
+	Datum		values[INDEX_MAX_KEYS];
+	bool		isnull[INDEX_MAX_KEYS];
+	bool		isavail[INDEX_MAX_KEYS];
+	Datum		values2[INDEX_MAX_KEYS];
+	bool		isnull2[INDEX_MAX_KEYS];
+	int			i;
+	bool		equal;
+	int			natts = indexRel->rd_rel->relnatts;
+	Form_pg_attribute att;
+
+	FormIndexPlainDatum(indexInfo, heapRel, heapTuple, values, isnull, isavail);
+
+	/*
+	 * HASH indexes compute a hash value of the key and store that in the
+	 * index. So we must first obtain the hash of the value obtained from the
+	 * heap and then do the comparison.
+	 */
+	_hash_convert_tuple(indexRel, values, isnull, values2, isnull2);
+
+	equal = true;
+	for (i = 1; i <= natts; i++)
+	{
+		Datum		indxvalue;
+		bool		indxisnull;
+
+		if (!isavail[i - 1])
+			continue;
+
+		indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull);
+
+		/*
+		 * If both are NULL then they are equal
+		 */
+		if (isnull2[i - 1] && indxisnull)
+			continue;
+
+		/*
+		 * If only one is NULL then they are not equal
+		 */
+		if (isnull2[i - 1] || indxisnull)
+		{
+			equal = false;
+			break;
+		}
+
+		/*
+		 * Now do a raw memory comparison
+		 */
+		att = indexRel->rd_att->attrs[i - 1];
+		if (!datumIsEqual(values2[i - 1], indxvalue, att->attbyval,
+						  att->attlen))
+		{
+			equal = false;
+			break;
+		}
+	}
+
+	return equal;
+}
diff --git a/src/backend/access/heap/README.WARM b/src/backend/access/heap/README.WARM
new file mode 100644
index 0000000..7569227
--- /dev/null
+++ b/src/backend/access/heap/README.WARM
@@ -0,0 +1,305 @@
+src/backend/access/heap/README.WARM
+
+Write Amplification Reduction Method (WARM)
+===========================================
+
+The Heap Only Tuple (HOT) feature eliminated many redundant index
+entries and allowed re-use of the dead space occupied by previously
+updated or deleted tuples (see src/backend/access/heap/README.HOT).
+
+One of the necessary conditions for a HOT update is that the update
+must not change a column used in any of the indexes on the table.
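+
+For reference, heap_update() evaluates this condition with a simple
+bitmap overlap test (a simplified sketch of this patch's check, not
+the verbatim code):
+
+    /*
+     * modified_attrs: columns changed by this UPDATE;
+     * hot_attrs: columns used by any index on the table.
+     */
+    if (!bms_overlap(modified_attrs, hot_attrs))
+        use_hot_update = true;
+
+(WARM, described below, relaxes exactly this test.)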
+The condition is sometimes hard to meet, especially for complex
+workloads with several indexes on large yet frequently updated tables.
+Worse, sometimes only one or two index columns may be updated, but the
+regular non-HOT update will still insert a new index entry in every
+index on the table, irrespective of whether the key pertaining to the
+index changed or not.
+
+WARM is a technique devised to address these problems.
+
+
+Update Chains With Multiple Index Entries Pointing to the Root
+--------------------------------------------------------------
+
+When a non-HOT update is caused by an index key change, a new index
+entry must be inserted for the changed index. But if the index key
+hasn't changed for the other indexes, we don't really need to insert a
+new entry. Even though the existing index entry points to the old
+tuple, the new tuple is reachable via the t_ctid chain. To keep things
+simple, a WARM update requires that the heap block have enough space to
+store the new version of the tuple. This is the same requirement as for
+HOT updates.
+
+In WARM, we ensure that every index entry always points to the root of
+the WARM chain. In fact, a WARM chain looks exactly like a HOT chain
+except for the fact that there can be multiple index entries pointing
+to the root of the chain. So when a new entry is inserted in an index
+for the updated tuple, and we are doing a WARM update, the new entry is
+made to point to the root of the WARM chain.
+
+For example, say we have a table with two columns and an index on each
+of them. When a tuple is first inserted into the table, we have exactly
+one index entry pointing to the tuple from each index.
+
+    lp [1]
+    [1111, aaaa]
+
+    Index1's entry (1111) points to 1
+    Index2's entry (aaaa) also points to 1
+
+Now if the tuple's second column is updated and there is room on the
+page, we perform a WARM update: Index1 does not get any new entry, and
+Index2's new entry will still point to the root tuple of the chain.
+
+    lp [1]          [2]
+    [1111, aaaa]->[1111, bbbb]
+
+    Index1's entry (1111) points to 1
+    Index2's old entry (aaaa) points to 1
+    Index2's new entry (bbbb) also points to 1
+
+"An update chain that has more than one index entry pointing to its
+root line pointer is called a WARM chain, and the action that creates a
+WARM chain is called a WARM update."
+
+Since all indexes always point to the root of the WARM chain, even when
+there is more than one index entry, WARM chains can be pruned and dead
+tuples can be removed without any need for corresponding index cleanup.
+
+While this solves the problem of pruning dead tuples from a HOT/WARM
+chain, it also opens up a new technical challenge, because now a heap
+tuple is reachable from multiple index entries, each having a different
+index key. While MVCC still ensures that only valid tuples are
+returned, a tuple with a non-matching index key may be returned via
+such an index entry. In the above example, tuple [1111, bbbb] is
+reachable from both keys (aaaa) as well as (bbbb). For this reason,
+tuples returned from a WARM chain must always be rechecked for an index
+key match.
+
+Recheck Index Key Against Heap Tuple
+------------------------------------
+
+Since every index AM has its own notion of index tuples, each index AM
+must implement its own method to recheck heap tuples.
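+
+In this patch the recheck method is exposed as an optional amrecheck
+callback in IndexAmRoutine; its signature, as wired up for the hash AM
+below, is:
+
+    bool (*amrecheck) (Relation indexRel, IndexInfo *indexInfo,
+                       IndexTuple indexTuple,
+                       Relation heapRel, HeapTuple heapTuple);
+
+It returns true if the heap tuple still satisfies the key stored in the
+index tuple.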
+For example, a hash index stores the hash value of the column, and
+hence the recheck routine for the hash AM must first compute the hash
+value of the heap attribute and then compare it against the value
+stored in the index tuple.
+
+The patch currently implements recheck routines for hash and btree
+indexes. If the table has an index that doesn't support a recheck
+routine, WARM updates are disabled on that table.
+
+Problem With Duplicate (key, ctid) Index Entries
+------------------------------------------------
+
+The index-key recheck logic works only as long as there are no
+duplicate index keys pointing to the same WARM chain. Otherwise, the
+same valid tuple would be reachable via multiple index keys, each
+satisfying the index key check. In the above example, if the tuple
+[1111, bbbb] is again updated to [1111, aaaa] and we insert a new index
+entry (aaaa) pointing to the root line pointer, we will end up with the
+following structure:
+
+    lp [1]          [2]           [3]
+    [1111, aaaa]->[1111, bbbb]->[1111, aaaa]
+
+    Index1's entry (1111) points to 1
+    Index2's oldest entry (aaaa) points to 1
+    Index2's old entry (bbbb) also points to 1
+    Index2's new entry (aaaa) also points to 1
+
+We must solve this problem to ensure that the same tuple is not
+reachable via multiple index pointers. There are a couple of ways to
+address this issue:
+
+1. Do not allow a WARM update to a tuple from a WARM chain. This
+guarantees that there can never be duplicate index entries to the same
+root line pointer, because we must have checked the old and new index
+keys while doing the first WARM update.
+
+2. Do not allow duplicate (key, ctid) index pointers. In the above
+example, since (aaaa, 1) already exists in the index, we must not
+insert a duplicate index entry.
+
+The patch currently implements 1, i.e. it does not do WARM updates to a
+tuple from a WARM chain. HOT updates are fine because they do not add a
+new index entry.
+
+Even with this restriction, WARM is a significant improvement because
+the number of regular updates is cut down to half.
+
+Expression and Partial Indexes
+------------------------------
+
+Expressions may evaluate to the same value even if the underlying
+column values have changed. A simple example is an index on
+"lower(col)", which will return the same value if the new heap value
+differs only in case. So we cannot rely solely on the heap column check
+to decide whether or not to insert a new index entry for expression
+indexes. Similarly, for partial indexes, the predicate expression must
+be evaluated to decide whether or not to create a new index entry when
+columns referred to in the predicate expressions change.
+
+(Neither of these is currently implemented: we simply disallow a WARM
+update if a column used in an expression index or an index predicate
+has changed.)
+
+
+Efficiently Finding the Root Line Pointer
+-----------------------------------------
+
+During a WARM update, we must be able to find the root line pointer of
+the tuple being updated. It must be noted that the t_ctid field in the
+heap tuple header is usually used to find the next tuple in the update
+chain. But the tuple that we are updating must be the last tuple in the
+update chain, and in that case the t_ctid field usually points to the
+tuple itself. So, in theory, we can use t_ctid to store additional
+information in the last tuple of the update chain, provided the fact
+that it is the last tuple is stored elsewhere.
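+
+A sketch of how the lookup then works (the macros and the fallback
+function are the ones added by this patch; the surrounding variable
+names are illustrative):
+
+    /* find the root line pointer of the chain containing offnum */
+    if (HeapTupleHeaderHasRootOffset(htup))
+        root_offnum = HeapTupleHeaderGetRootOffset(htup);
+    else
+        /* rare fallback, e.g. after an aborted update: scan the page */
+        root_offnum = heap_get_root_tuple(page, offnum);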
+
+We now utilize another bit from t_infomask2 to explicitly identify that
+a tuple is the last tuple in its update chain.
+
+HEAP_LATEST_TUPLE - When this bit is set, the tuple is the last tuple
+in the update chain, and the OffsetNumber part of t_ctid points to the
+root line pointer of the chain.
+
+If the UPDATE operation is aborted, the last tuple in the update chain
+becomes dead, and the root line pointer information stored in it is
+lost to the tuple that remains the last valid tuple in the chain. In
+such rare cases, the root line pointer must be found the hard way, by
+scanning the entire heap page.
+
+Tracking WARM Chains
+--------------------
+
+When a tuple is WARM updated, the old tuple, the new tuple and every
+subsequent tuple in the chain are marked with a special
+HEAP_WARM_UPDATED flag. We use the last remaining bit in t_infomask2 to
+store this information.
+
+When a tuple is returned from a WARM chain, the caller must do
+additional checks to ensure that the tuple matches the index key. Even
+if the tuple precedes the WARM update in the chain, it must still be
+rechecked for an index key match (the case when an old tuple is
+returned via the new index key). So we must always follow the update
+chain to the end to check whether this is a WARM chain.
+
+Converting WARM chains back to HOT chains (VACUUM ?)
+----------------------------------------------------
+
+The current implementation of WARM allows only one WARM update per
+chain. This simplifies the design and addresses certain issues around
+duplicate scans. But this also implies that the benefit of WARM will be
+no more than 50%, which is still significant; if we could return WARM
+chains back to normal status, we could do far more WARM updates.
+
+A distinct property of a WARM chain is that at least one index has more
+than one live index entry pointing to the root of the chain. In other
+words, if we can remove the duplicate entry from every index, or
+conclusively prove that there are no duplicate index entries for the
+root line pointer, the chain can again be marked as HOT.
+
+Here is one idea:
+
+A WARM chain has two parts, separated by the tuple that caused the WARM
+update. All tuples in each part have matching index keys, but certain
+index keys may not match between these two parts. Let's say we mark the
+heap tuples in the second part with a special HEAP_WARM_TUPLE flag.
+Similarly, the new index entries caused by the first WARM update are
+marked with an INDEX_WARM_POINTER flag.
+
+So there are two distinct parts of the WARM chain: the first part,
+where none of the tuples have the HEAP_WARM_TUPLE flag set, and the
+second part, where every tuple has the flag set. Each of these parts
+satisfies the HOT property on its own, i.e. all its tuples have the
+same values for the indexed columns. But the two parts are separated by
+the WARM update, which breaks the HOT property for one or more indexes.
+
+Heap chain: [1]             [2]             [3]              [4]
+            [aaaa, 1111] -> [aaaa, 1111] -> [bbbb, 1111]W -> [bbbb, 1111]W
+
+Index1:     (aaaa) points to 1 (satisfies only tuples without W)
+            (bbbb)W points to 1 (satisfies only tuples marked with W)
+
+Index2:     (1111) points to 1 (satisfies tuples with and without W)
+
+
+It's clear that for indexes with both pointers, a heap tuple without
+the HEAP_WARM_TUPLE flag will be reachable from the index pointer that
+lacks the INDEX_WARM_POINTER flag, and one with the HEAP_WARM_TUPLE
+flag will be reachable from the pointer with INDEX_WARM_POINTER.
+But for indexes that did not create a new entry, tuples with and
+without the HEAP_WARM_TUPLE flag will be reachable from the original
+index pointer, which doesn't have the INDEX_WARM_POINTER flag (there is
+no pointer with INDEX_WARM_POINTER in such indexes).
+
+During the first heap scan of VACUUM, we look for tuples with
+HEAP_WARM_UPDATED set. If all or none of the live tuples in the chain
+are marked with the HEAP_WARM_TUPLE flag, then the chain is a candidate
+for HOT conversion. We remember the root line pointer and whether the
+tuples in the chain had the HEAP_WARM_TUPLE flag set or not.
+
+If we have a WARM chain with HEAP_WARM_TUPLE set, then our goal is to
+remove the index pointers without the INDEX_WARM_POINTER flag, and vice
+versa. But there is a catch. For Index2 above, there is only one
+pointer and it does not have the INDEX_WARM_POINTER flag set. Since all
+heap tuples are reachable only via this pointer, it must not be
+removed. IOW, we should remove an index pointer without
+INDEX_WARM_POINTER iff another index pointer with INDEX_WARM_POINTER
+exists. Since index vacuum may visit these pointers in any order, we
+will need another index pass to remove dead index pointers. So in the
+first index pass we check which WARM candidates have two index
+pointers. In the second pass, we remove the dead pointer, and clear the
+INDEX_WARM_POINTER flag if that's the surviving index pointer.
+
+During the second heap scan, we fix the WARM chain by clearing the
+HEAP_WARM_UPDATED flag and also clearing the HEAP_WARM_TUPLE flags on
+its tuples.
+
+There are some more problems around aborted vacuums. For example, if
+vacuum aborts after clearing an INDEX_WARM_POINTER flag but before
+removing the other index pointer, we will end up with two index
+pointers, neither of which has INDEX_WARM_POINTER set. But since the
+HEAP_WARM_UPDATED flag on the heap tuple is still set, further WARM
+updates to the chain will be blocked. I guess we will need some special
+handling for the case of multiple index pointers where none of them has
+the INDEX_WARM_POINTER flag set. We can either leave these WARM chains
+alone and let them die with a subsequent non-WARM update, or we must
+apply the heap-recheck logic during index vacuum to find the dead
+pointer. Given that vacuum aborts are not common, I am inclined to
+leave this case unhandled. We must still check for the presence of
+multiple index pointers without INDEX_WARM_POINTER flags, and ensure
+that we don't accidentally remove either of these pointers and that we
+don't clear such WARM chains.
+
+CREATE INDEX CONCURRENTLY
+-------------------------
+
+Currently CREATE INDEX CONCURRENTLY (CIC) is implemented as a 3-phase
+process. In the first phase, we create the catalog entry for the new
+index so that the index is visible to all other backends, but we still
+don't use it for either reads or writes; we do ensure that no new
+broken HOT chains are created by new transactions. In the second phase,
+we build the new index using an MVCC snapshot and then make the index
+available for inserts. We then do another pass over the index and
+insert any missing tuples, each time indexing only the tuple's root
+line pointer. See README.HOT for details about how HOT impacts CIC and
+how the various challenges are tackled.
+
+WARM poses another challenge because it allows the creation of HOT
+chains even when an index key is changed.
+But since the index is not ready for insertion until the second phase
+is over, we might end up with a situation where the HOT chain has
+tuples with different values for the indexed columns, yet only one of
+those values is indexed by the new index. Note that during the third
+phase, we only index tuples whose root line pointer is missing from the
+index. But we can't easily check whether the existing index tuple
+actually indexes the heap tuple visible to the new MVCC snapshot.
+Finding that out would require us to query the index again for every
+tuple in the chain, especially if it's a WARM tuple, which would mean
+repeated access to the index. Another option would be to return the
+index keys along with the heap TIDs when the index is scanned to
+collect all indexed TIDs during the third phase. We could then compare
+the heap tuple against the already-indexed key and decide whether or
+not to index the new tuple.
+
+We solve this problem more simply by disallowing WARM updates until the
+index is ready for insertion. We don't need to disallow WARM on a
+wholesale basis; only those updates that change the columns of the new
+index are disallowed from being WARM updates.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 26a7af4..c86fbc6 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1974,6 +1974,206 @@ heap_fetch(Relation relation, } /*
+ * Check the status of a (possibly) WARM chain.
+ *
+ * This function looks at a HOT/WARM chain starting at tid and returns a
+ * bitmask of information. We only follow the chain as long as it's known to
+ * be a valid HOT chain. The information returned by the function consists of:
+ *
+ * HCWC_WARM_UPDATED_TUPLE - a tuple with HEAP_WARM_UPDATED is found somewhere
+ *							 in the chain. Note that when a tuple is WARM
+ *							 updated, both old and new versions are marked
+ *							 with this flag.
+ *
+ * HCWC_WARM_TUPLE	- a tuple with HEAP_WARM_TUPLE is found somewhere in
+ *					  the chain.
+ *
+ * HCWC_CLEAR_TUPLE - a tuple without HEAP_WARM_TUPLE is found somewhere in
+ *					  the chain.
+ *
+ * If stop_at_warm is true, we stop when the first HEAP_WARM_UPDATED tuple is
+ * found and return the information collected so far.
+ */
+HeapCheckWarmChainStatus
+heap_check_warm_chain(Page dp, ItemPointer tid, bool stop_at_warm)
+{
+	TransactionId prev_xmax = InvalidTransactionId;
+	OffsetNumber offnum;
+	HeapTupleData heapTuple;
+	HeapCheckWarmChainStatus status = 0;
+
+	offnum = ItemPointerGetOffsetNumber(tid);
+	heapTuple.t_self = *tid;
+	/* Scan through possible multiple members of HOT-chain */
+	for (;;)
+	{
+		ItemId	lp;
+
+		/* check for bogus TID */
+		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+			break;
+
+		lp = PageGetItemId(dp, offnum);
+
+		/* check for unused, dead, or redirected items */
+		if (!ItemIdIsNormal(lp))
+		{
+			if (ItemIdIsRedirected(lp))
+			{
+				/* Follow the redirect */
+				offnum = ItemIdGetRedirect(lp);
+				continue;
+			}
+			/* else must be end of chain */
+			break;
+		}
+
+		heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+		ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum);
+
+		/*
+		 * The xmin should match the previous xmax value, else the chain is
+		 * broken.
+		 */
+		if (TransactionIdIsValid(prev_xmax) &&
+			!TransactionIdEquals(prev_xmax,
+								 HeapTupleHeaderGetXmin(heapTuple.t_data)))
+			break;
+
+
+		if (HeapTupleHeaderIsWarmUpdated(heapTuple.t_data))
+		{
+			/* We found a WARM_UPDATED tuple */
+			status |= HCWC_WARM_UPDATED_TUPLE;
+
+			/*
+			 * If we've been told to stop at the first WARM_UPDATED tuple, just
+			 * return whatever information we have collected so far.
+			 */
+			if (stop_at_warm)
+				return status;
+
+			/*
+			 * Remember whether it's a CLEAR or a WARM tuple.
+			 */
+			if (HeapTupleHeaderIsWarm(heapTuple.t_data))
+				status |= HCWC_WARM_TUPLE;
+			else
+				status |= HCWC_CLEAR_TUPLE;
+		}
+		else
+			/* Must be a regular, non-WARM tuple */
+			status |= HCWC_CLEAR_TUPLE;
+
+		/*
+		 * Check to see if HOT chain continues past this tuple; if so fetch
+		 * the next offnum and loop around.
+		 */
+		if (!HeapTupleIsHotUpdated(&heapTuple))
+			break;
+
+		/*
+		 * It can't be a HOT chain if the tuple contains a root line pointer
+		 */
+		if (HeapTupleHeaderHasRootOffset(heapTuple.t_data))
+			break;
+
+		offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
+		prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data);
+	}
+
+	/* All OK. No need to recheck */
+	return status;
+}
+
+/*
+ * Scan through the WARM chain starting at tid and reset all WARM-related
+ * flags. At the end, the chain will have all characteristics of a regular HOT
+ * chain.
+ *
+ * Return the number of cleared offnums. Cleared offnums are returned in the
+ * passed-in cleared_offnums array. The caller must ensure that the array is
+ * large enough to hold the maximum number of offnums that can be cleared by
+ * this invocation of heap_clear_warm_chain().
+ */
+int
+heap_clear_warm_chain(Page dp, ItemPointer tid, OffsetNumber *cleared_offnums)
+{
+	TransactionId prev_xmax = InvalidTransactionId;
+	OffsetNumber offnum;
+	HeapTupleData heapTuple;
+	int num_cleared = 0;
+
+	offnum = ItemPointerGetOffsetNumber(tid);
+	heapTuple.t_self = *tid;
+	/* Scan through possible multiple members of HOT-chain */
+	for (;;)
+	{
+		ItemId	lp;
+
+		/* check for bogus TID */
+		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+			break;
+
+		lp = PageGetItemId(dp, offnum);
+
+		/* check for unused, dead, or redirected items */
+		if (!ItemIdIsNormal(lp))
+		{
+			if (ItemIdIsRedirected(lp))
+			{
+				/* Follow the redirect */
+				offnum = ItemIdGetRedirect(lp);
+				continue;
+			}
+			/* else must be end of chain */
+			break;
+		}
+
+		heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+		ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum);
+
+		/*
+		 * The xmin should match the previous xmax value, else the chain is
+		 * broken.
+		 */
+		if (TransactionIdIsValid(prev_xmax) &&
+			!TransactionIdEquals(prev_xmax,
+								 HeapTupleHeaderGetXmin(heapTuple.t_data)))
+			break;
+
+
+		/*
+		 * Clear the WARM_UPDATED and WARM flags.
+		 */
+		if (HeapTupleHeaderIsWarmUpdated(heapTuple.t_data))
+		{
+			HeapTupleHeaderClearWarmUpdated(heapTuple.t_data);
+			HeapTupleHeaderClearWarm(heapTuple.t_data);
+			cleared_offnums[num_cleared++] = offnum;
+		}
+
+		/*
+		 * Check to see if HOT chain continues past this tuple; if so fetch
+		 * the next offnum and loop around.
+		 */
+		if (!HeapTupleIsHotUpdated(&heapTuple))
+			break;
+
+		/*
+		 * It can't be a HOT chain if the tuple contains a root line pointer
+		 */
+		if (HeapTupleHeaderHasRootOffset(heapTuple.t_data))
+			break;
+
+		offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
+		prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data);
+	}
+
+	return num_cleared;
+}
+
+/* * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot * * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
@@ -1993,11 +2193,14 @@ heap_fetch(Relation relation, * Unlike heap_fetch, the caller must already have pin and (at least) share * lock on the buffer; it is still pinned/locked at exit. Also unlike * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
+ *
+ * recheck should be set to false on entry by the caller; it will be set to
+ * true on exit if a WARM tuple is encountered.
 */ bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   bool *all_dead, bool first_call, bool *recheck)
 { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId;
@@ -2051,9 +2254,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum); /*
-		 * Shouldn't see a HEAP_ONLY tuple at chain start.
+		 * Shouldn't see a HEAP_ONLY tuple at chain start, unless we are
+		 * dealing with a WARM-updated tuple, in which case deferred triggers
+		 * may request to fetch a WARM tuple from the middle of a chain.
 		 */
-		if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
+		if (at_chain_start && HeapTupleIsHeapOnly(heapTuple) &&
+			!HeapTupleIsWarmUpdated(heapTuple))
 			break; /*
@@ -2066,6 +2272,20 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, break; /*
+		 * Check if there exists a WARM tuple somewhere down the chain, and
+		 * set recheck to TRUE.
+		 *
+		 * XXX This is not very efficient right now, and we should look for
+		 * possible improvements here.
+		 */
+		if (recheck && *recheck == false)
+		{
+			HeapCheckWarmChainStatus status;
+			status = heap_check_warm_chain(dp, &heapTuple->t_self, true);
+			*recheck = HCWC_IS_WARM_UPDATED(status);
+		}
+
+		/*
 		 * When first_call is true (and thus, skip is initially false) we'll * return the first tuple we find. But on later passes, heapTuple * will initially be pointing to the tuple we returned last time.
@@ -2114,7 +2334,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * Check to see if HOT chain continues past this tuple; if so fetch * the next offnum and loop around.
 */
-		if (HeapTupleIsHotUpdated(heapTuple))
+		if (HeapTupleIsHotUpdated(heapTuple) &&
+			!HeapTupleHeaderHasRootOffset(heapTuple->t_data))
 		{ Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == ItemPointerGetBlockNumber(tid));
@@ -2138,18 +2359,41 @@ */ bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
-				bool *all_dead)
+				bool *all_dead, bool *recheck, Buffer *cbuffer,
+				HeapTuple heapTuple)
 { bool result; Buffer buffer;
-	HeapTupleData heapTuple;
+	ItemPointerData ret_tid = *tid;
 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE);
-	result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
-									&heapTuple, all_dead, true);
-	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-	ReleaseBuffer(buffer);
+	result = heap_hot_search_buffer(&ret_tid, relation, buffer, snapshot,
+									heapTuple, all_dead, true, recheck);
+
+	/*
+	 * If we are returning a potential candidate tuple from this chain and the
+	 * caller has requested the "recheck" hint, keep the buffer locked and
+	 * pinned. The caller must release the lock and pin on the buffer in all
+	 * such cases.
+	 */
+	if (!result || !recheck || !(*recheck))
+	{
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+	}
+
+	/*
+	 * Set the caller-supplied tid to the actual location of the tuple being
+	 * returned.
+	 */
+	if (result)
+	{
+		*tid = ret_tid;
+		if (cbuffer)
+			*cbuffer = buffer;
+	}
+
 	return result; }
@@ -2792,7 +3036,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, { XLogRecPtr recptr; xl_heap_multi_insert *xlrec;
-		uint8		info = XLOG_HEAP2_MULTI_INSERT;
+		uint8		info = XLOG_HEAP_MULTI_INSERT;
 		char *tupledata; int totaldatalen; char *scratchptr = scratch;
@@ -2889,7 +3133,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
-		recptr = XLogInsert(RM_HEAP2_ID, info);
+		recptr = XLogInsert(RM_HEAP_ID, info);
 		PageSetLSN(page, recptr); }
@@ -3313,7 +3557,9 @@ l1: } /* store transaction information of xact deleting the tuple */
-	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+	tp.t_data->t_infomask &= ~HEAP_XMAX_BITS;
+	if (HeapTupleHeaderIsMoved(tp.t_data))
+		tp.t_data->t_infomask &= ~HEAP_MOVED;
 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; tp.t_data->t_infomask |= new_infomask; tp.t_data->t_infomask2 |= new_infomask2;
@@ -3508,15 +3754,18 @@ simple_heap_delete(Relation relation, ItemPointer tid) HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait,
-			HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
+			HeapUpdateFailureData *hufd, LockTupleMode *lockmode,
+			Bitmapset **modified_attrsp, bool *warm_update)
 { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; Bitmapset *key_attrs; Bitmapset *id_attrs;
+	Bitmapset  *exprindx_attrs;
 	Bitmapset *interesting_attrs; Bitmapset *modified_attrs;
+	Bitmapset  *notready_attrs;
 	ItemId lp; HeapTupleData oldtup; HeapTuple heaptup;
@@ -3537,6 +3786,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool use_hot_update = false;
+	bool		use_warm_update = false;
 	bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false;
@@ -3561,6 +3811,10 @@ heap_update(Relation relation, ItemPointer otid,
HeapTuple newtup, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation")));
+	/* Assume a non-WARM update */
+	if (warm_update)
+		*warm_update = false;
+
 	/* * Fetch the list of attributes to be checked for various operations. *
@@ -3582,10 +3836,17 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY);
+	exprindx_attrs = RelationGetIndexAttrBitmap(relation,
+												INDEX_ATTR_BITMAP_EXPR_PREDICATE);
+	notready_attrs = RelationGetIndexAttrBitmap(relation,
+												INDEX_ATTR_BITMAP_NOTREADY);
+
+
 	interesting_attrs = bms_add_members(NULL, hot_attrs); interesting_attrs = bms_add_members(interesting_attrs, key_attrs); interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
-
+	interesting_attrs = bms_add_members(interesting_attrs, exprindx_attrs);
+	interesting_attrs = bms_add_members(interesting_attrs, notready_attrs);
 	block = ItemPointerGetBlockNumber(otid); offnum = ItemPointerGetOffsetNumber(otid);
@@ -3637,6 +3898,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs, &oldtup, newtup);
+	if (modified_attrsp)
+		*modified_attrsp = bms_copy(modified_attrs);
+
 	/* * If we're not updating any "key" column, we can grab a weaker lock type. * This allows for more concurrency when we are running simultaneously
@@ -3892,6 +4156,7 @@ l2: bms_free(hot_attrs); bms_free(key_attrs); bms_free(id_attrs);
+		bms_free(exprindx_attrs);
 		bms_free(modified_attrs); bms_free(interesting_attrs); return result;
@@ -4057,7 +4322,9 @@ l2: START_CRIT_SECTION(); /* Clear obsolete visibility flags ... */
-		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		oldtup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
+		if (HeapTupleHeaderIsMoved(oldtup.t_data))
+			oldtup.t_data->t_infomask &= ~HEAP_MOVED;
 		oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleClearHotUpdated(&oldtup); /* ... and store info about transaction updating this tuple */
@@ -4210,6 +4477,24 @@ l2: */ if (!bms_overlap(modified_attrs, hot_attrs)) use_hot_update = true;
+		else
+		{
+			/*
+			 * If no WARM updates yet on this chain, let this update be a WARM
+			 * update.
+			 *
+			 * We check for both WARM and WARM-updated tuples since, if the
+			 * previous WARM update aborted, we may still have added
+			 * another index entry for this HOT chain. In such situations, we
+			 * must not attempt a WARM update.
+			 */
+			if (relation->rd_supportswarm &&
+				!bms_overlap(modified_attrs, exprindx_attrs) &&
+				!bms_is_subset(hot_attrs, modified_attrs) &&
+				!bms_overlap(notready_attrs, modified_attrs) &&
+				!HeapTupleIsWarmUpdated(&oldtup))
+				use_warm_update = true;
+		}
 	} else {
@@ -4256,6 +4541,32 @@ l2: HeapTupleSetHeapOnly(heaptup); /* Mark the caller's copy too, in case different from heaptup */ HeapTupleSetHeapOnly(newtup);
+
+		/*
+		 * Even if we are doing a HOT update, we must carry forward the WARM
+		 * flag because we may have already inserted another index entry
+		 * pointing to our root and a third entry may create duplicates.
+		 *
+		 * Note: If we ever have a mechanism to avoid duplicates in
+		 * indexes, we could look at relaxing this restriction and allow even
+		 * more WARM updates.
+ */ + if (HeapTupleIsWarmUpdated(&oldtup)) + { + HeapTupleSetWarmUpdated(heaptup); + HeapTupleSetWarmUpdated(newtup); + } + + /* + * If the old tuple is a WARM tuple then mark the new tuple as a WARM + * tuple as well. + */ + if (HeapTupleIsWarm(&oldtup)) + { + HeapTupleSetWarm(heaptup); + HeapTupleSetWarm(newtup); + } + /* * For HOT (or WARM) updated tuples, we store the offset of the root * line pointer of this chain in the ip_posid field of the new tuple. @@ -4268,12 +4579,45 @@ l2: if (HeapTupleHeaderHasRootOffset(oldtup.t_data)) root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data); } + else if (use_warm_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + HeapTupleSetWarmUpdated(&oldtup); + + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the new tuple as WARM tuple */ + HeapTupleSetWarmUpdated(heaptup); + /* This update also starts the WARM chain */ + HeapTupleSetWarm(heaptup); + Assert(!HeapTupleIsWarm(&oldtup)); + + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + HeapTupleSetWarmUpdated(newtup); + HeapTupleSetWarm(newtup); + + if (HeapTupleHeaderHasRootOffset(oldtup.t_data)) + root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data); + else + root_offnum = heap_get_root_tuple(page, + ItemPointerGetOffsetNumber(&(oldtup.t_self))); + + /* Let the caller know we did a WARM update */ + if (warm_update) + *warm_update = true; + } else { /* Make sure tuples are correctly marked as not-HOT */ HeapTupleClearHotUpdated(&oldtup); HeapTupleClearHeapOnly(heaptup); HeapTupleClearHeapOnly(newtup); + HeapTupleClearWarmUpdated(heaptup); + HeapTupleClearWarmUpdated(newtup); + HeapTupleClearWarm(heaptup); + HeapTupleClearWarm(newtup); root_offnum = InvalidOffsetNumber; } @@ -4292,7 +4636,9 @@ l2: HeapTupleHeaderSetHeapLatest(newtup->t_data, root_offnum); /* Clear obsolete visibility flags, possibly set by ourselves above... */ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(oldtup.t_data)) + oldtup.t_data->t_infomask &= ~HEAP_MOVED; oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* ... and store info about transaction updating this tuple */ Assert(TransactionIdIsValid(xmax_old_tuple)); @@ -4383,7 +4729,10 @@ l2: if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); - pgstat_count_heap_update(relation, use_hot_update); + /* + * Count HOT and WARM updates separately + */ + pgstat_count_heap_update(relation, use_hot_update, use_warm_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -4523,7 +4872,8 @@ HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols, * via ereport(). 
*/ void -simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) +simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, + Bitmapset **modified_attrs, bool *warm_update) { HTSU_Result result; HeapUpdateFailureData hufd; @@ -4532,7 +4882,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, modified_attrs, warm_update); switch (result) { case HeapTupleSelfUpdated: @@ -6209,7 +6559,9 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) PageSetPrunable(page, RecentGlobalXmin); /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(tp.t_data)) + tp.t_data->t_infomask &= ~HEAP_MOVED; tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* @@ -6783,7 +7135,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * Old-style VACUUM FULL is gone, but we have to keep this code as long as * we support having MOVED_OFF/MOVED_IN tuples in the database. */ - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); @@ -6802,7 +7154,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * have failed; whereas a non-dead MOVED_IN tuple must mean the * xvac transaction succeeded. */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) frz->frzflags |= XLH_INVALID_XVAC; else frz->frzflags |= XLH_FREEZE_XVAC; @@ -7272,7 +7624,7 @@ heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) return true; } - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); if (TransactionIdIsNormal(xid)) @@ -7355,7 +7707,7 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, return true; } - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); if (TransactionIdIsNormal(xid) && @@ -7381,7 +7733,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { if (TransactionIdPrecedes(*latestRemovedXid, xvac)) *latestRemovedXid = xvac; @@ -7430,6 +7782,36 @@ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) } /* + * Perform XLogInsert for a heap-warm-clear operation. Caller must already + * have modified the buffer and marked it dirty. + */ +XLogRecPtr +log_heap_warmclear(Relation reln, Buffer buffer, + OffsetNumber *cleared, int ncleared) +{ + xl_heap_warmclear xlrec; + XLogRecPtr recptr; + + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); + + xlrec.ncleared = ncleared; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapWarmClear); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + if (ncleared > 0) + XLogRegisterBufData(0, (char *) cleared, + ncleared * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_WARMCLEAR); + + return recptr; +} + +/* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. 
* @@ -7584,6 +7966,7 @@ log_heap_update(Relation reln, Buffer oldbuf, bool need_tuple_data = RelationIsLogicallyLogged(reln); bool init; int bufflags; + bool warm_update = false; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); @@ -7595,6 +7978,9 @@ log_heap_update(Relation reln, Buffer oldbuf, else info = XLOG_HEAP_UPDATE; + if (HeapTupleIsWarmUpdated(newtup)) + warm_update = true; + /* * If the old and new tuple are on the same page, we only need to log the * parts of the new tuple that were changed. That saves on the amount of @@ -7668,6 +8054,8 @@ log_heap_update(Relation reln, Buffer oldbuf, xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; } } + if (warm_update) + xlrec.flags |= XLH_UPDATE_WARM_UPDATE; /* If new tuple is the single and first tuple on page... */ if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && @@ -8082,6 +8470,60 @@ heap_xlog_clean(XLogReaderState *record) XLogRecordPageWithFreeSpace(rnode, blkno, freespace); } + +/* + * Handles HEAP2_WARMCLEAR record type + */ +static void +heap_xlog_warmclear(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_warmclear *xlrec = (xl_heap_warmclear *) XLogRecGetData(record); + Buffer buffer; + RelFileNode rnode; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *cleared; + int ncleared; + Size datalen; + int i; + + cleared = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + ncleared = xlrec->ncleared; + + for (i = 0; i < ncleared; i++) + { + ItemId lp; + OffsetNumber offnum = cleared[i]; + HeapTupleData heapTuple; + + lp = PageGetItemId(page, offnum); + heapTuple.t_data = (HeapTupleHeader) PageGetItem(page, lp); + + HeapTupleHeaderClearWarmUpdated(heapTuple.t_data); + HeapTupleHeaderClearWarm(heapTuple.t_data); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + /* * Replay XLOG_HEAP2_VISIBLE record. 
 *
@@ -8328,7 +8770,9 @@ heap_xlog_delete(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp);
-		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		htup->t_infomask &= ~HEAP_XMAX_BITS;
+		if (HeapTupleHeaderIsMoved(htup))
+			htup->t_infomask &= ~HEAP_MOVED;
 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->infobits_set,
@@ -8349,7 +8793,7 @@ if (!HeapTupleHeaderHasRootOffset(htup)) { OffsetNumber root_offnum;
-			root_offnum = heap_get_root_tuple(page, xlrec->offnum);
+			root_offnum = heap_get_root_tuple(page, xlrec->offnum);
 			HeapTupleHeaderSetHeapLatest(htup, root_offnum); }
@@ -8645,16 +9089,22 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) Size freespace = 0; XLogRedoAction oldaction; XLogRedoAction newaction;
+	bool		warm_update = false;
 	/* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0;
+	if (xlrec->flags & XLH_UPDATE_WARM_UPDATE)
+		warm_update = true;
+
 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk); if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk)) { /* HOT updates are never done across pages */ Assert(!hot_update);
+		/* WARM updates are never done across pages */
+		Assert(!warm_update);
 	} else oldblk = newblk;
@@ -8714,6 +9164,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
+
+		/* Mark the old tuple as a WARM tuple */
+		if (warm_update)
+			HeapTupleHeaderSetWarmUpdated(htup);
+
 		/* Set forward chain link in t_ctid */ HeapTupleHeaderSetNextTid(htup, &newtid);
@@ -8849,6 +9304,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderSetCmin(htup, FirstCommandId); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
+		/* Mark the new tuple as a WARM tuple */
+		if (warm_update)
+			HeapTupleHeaderSetWarmUpdated(htup);
+
 		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple");
@@ -8976,7 +9435,9 @@ heap_xlog_lock(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp);
-		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		htup->t_infomask &= ~HEAP_XMAX_BITS;
+		if (HeapTupleHeaderIsMoved(htup))
+			htup->t_infomask &= ~HEAP_MOVED;
 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2);
@@ -9055,7 +9516,9 @@ heap_xlog_lock_updated(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp);
-	htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+	htup->t_infomask &= ~HEAP_XMAX_BITS;
+	if (HeapTupleHeaderIsMoved(htup))
+		htup->t_infomask &= ~HEAP_MOVED;
 	htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2);
@@ -9124,6 +9587,9 @@ heap_redo(XLogReaderState *record) case XLOG_HEAP_INSERT: heap_xlog_insert(record); break;
+		case XLOG_HEAP_MULTI_INSERT:
+			heap_xlog_multi_insert(record);
+			break;
 		case XLOG_HEAP_DELETE: heap_xlog_delete(record); break;
@@ -9152,7 +9618,7 @@ heap2_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-	switch (info & XLOG_HEAP_OPMASK)
+	switch (info & XLOG_HEAP2_OPMASK)
 	{ case XLOG_HEAP2_CLEAN: heap_xlog_clean(record); break;
@@ -9166,9 +9632,6 @@ heap2_redo(XLogReaderState *record) case XLOG_HEAP2_VISIBLE: heap_xlog_visible(record); break;
-		case
XLOG_HEAP2_MULTI_INSERT:
-			heap_xlog_multi_insert(record);
-			break;
 		case XLOG_HEAP2_LOCK_UPDATED: heap_xlog_lock_updated(record); break;
@@ -9182,6 +9645,9 @@ case XLOG_HEAP2_REWRITE: heap_xlog_logical_rewrite(record); break;
+		case XLOG_HEAP2_WARMCLEAR:
+			heap_xlog_warmclear(record);
+			break;
 		default: elog(PANIC, "heap2_redo: unknown op code %u", info); }
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index f54337c..4e8ed79 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -834,6 +834,13 @@ heap_get_root_tuples_internal(Page page, OffsetNumber target_offnum, if (!HeapTupleHeaderIsHotUpdated(htup)) continue;
+		/*
+		 * If the tuple has a root line pointer, it must be the end of the
+		 * chain
+		 */
+		if (HeapTupleHeaderHasRootOffset(htup))
+			break;
+
 		/* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup);
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2d3ae9b..bd469ee 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -404,6 +404,14 @@ rewrite_heap_tuple(RewriteState state, old_tuple->t_data->t_infomask & HEAP_XACT_MASK; /*
+	 * We must clear the HEAP_WARM_TUPLE flag if the HEAP_WARM_UPDATED flag is
+	 * cleared above.
+	 */
+	if (HeapTupleHeaderIsWarmUpdated(old_tuple->t_data))
+		HeapTupleHeaderClearWarm(new_tuple->t_data);
+
+
+	/*
 	 * While we have our hands on the tuple, we may as well freeze any * eligible xmin or xmax, so that future VACUUM effort can be saved. */
@@ -428,7 +436,7 @@ rewrite_heap_tuple(RewriteState state, memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data);
-	/*
+	/*
 	 * We've already checked that this is not the last tuple in the chain, * so fetch the next TID in the chain. */
@@ -737,7 +745,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) newitemid = PageGetItemId(page, newoff); onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid);
-	/*
+	/*
 	 * Set t_ctid just to ensure that block number is copied correctly, but * then immediately mark the tuple as the latest. */
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index 19e7048..47b01eb 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -1620,7 +1620,8 @@ toast_save_datum(Relation rel, Datum value, toastrel, toastidxs[i]->rd_index->indisunique ?
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - NULL); + NULL, + false); } /* diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index a91fda7..d523c8f 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -127,6 +127,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_cbuf = InvalidBuffer; scan->xs_continue_hot = false; + scan->indexInfo = NULL; + return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index cc5ac8b..04018fe 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -197,7 +197,8 @@ index_insert(Relation indexRelation, ItemPointer heap_t_ctid, Relation heapRelation, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, + bool warm_update) { RELATION_CHECKS; CHECK_REL_PROCEDURE(aminsert); @@ -207,6 +208,12 @@ index_insert(Relation indexRelation, (HeapTuple) NULL, InvalidBuffer); + if (warm_update) + { + Assert(indexRelation->rd_amroutine->amwarminsert != NULL); + return indexRelation->rd_amroutine->amwarminsert(indexRelation, values, + isnull, heap_t_ctid, heapRelation, checkUnique, indexInfo); + } return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull, heap_t_ctid, heapRelation, checkUnique, indexInfo); @@ -234,6 +241,25 @@ index_beginscan(Relation heapRelation, scan->heapRelation = heapRelation; scan->xs_snapshot = snapshot; + /* + * If the index supports recheck, make sure that the index tuple is saved + * during index scans. Also build and cache IndexInfo which is used by the + * amrecheck routine. + * + * XXX Ideally, we should look at all indexes on the table and check if + * WARM is at all supported on the base table. If WARM is not supported + * then we don't need to do any recheck. RelationGetIndexAttrBitmap() does + * do that and sets rd_supportswarm after looking at all indexes. But we + * don't know if the function was called earlier in the session when we're + * here. We can't call it now because of the risk of causing a deadlock. + */ + if (indexRelation->rd_amroutine->amrecheck) + { + scan->xs_want_itup = true; + scan->indexInfo = BuildIndexInfo(indexRelation); + } + return scan; } @@ -358,6 +384,10 @@ index_endscan(IndexScanDesc scan) if (scan->xs_temp_snap) UnregisterSnapshot(scan->xs_snapshot); + /* Free cached IndexInfo, if any */ + if (scan->indexInfo) + pfree(scan->indexInfo); + /* Release the scan data structure itself */ IndexScanEnd(scan); } @@ -535,7 +565,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_ctup.t_self. It should also set - * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * scan->xs_tuple_recheck and possibly scan->xs_itup/scan->xs_hitup, though we * pay no attention to those fields here. */ found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction); @@ -574,7 +604,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan * call). * - * Note: caller must check scan->xs_recheck, and perform rechecking of the + * Note: caller must check scan->xs_tuple_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have * enough information to do it efficiently in the general case.
* ---------------- @@ -585,6 +615,7 @@ index_fetch_heap(IndexScanDesc scan) ItemPointer tid = &scan->xs_ctup.t_self; bool all_dead = false; bool got_heap_tuple; + bool tuple_recheck; /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ if (!scan->xs_continue_hot) @@ -603,6 +634,8 @@ index_fetch_heap(IndexScanDesc scan) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf); } + tuple_recheck = false; + /* Obtain share-lock on the buffer so we can examine visibility */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, @@ -610,32 +643,60 @@ index_fetch_heap(IndexScanDesc scan) scan->xs_snapshot, &scan->xs_ctup, &all_dead, - !scan->xs_continue_hot); - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + !scan->xs_continue_hot, + &tuple_recheck); if (got_heap_tuple) { + bool res = true; + + /* + * OK, we got a tuple which satisfies the snapshot, but if it's part of a + * WARM chain, we must do additional checks to ensure that we are + * indeed returning a correct tuple. Note that if the index AM does not + * implement the amrecheck method, then we don't do any additional checks, + * since WARM must have been disabled on such tables. + */ + if (tuple_recheck && scan->xs_itup && + scan->indexRelation->rd_amroutine->amrecheck) + { + res = scan->indexRelation->rd_amroutine->amrecheck( + scan->indexRelation, + scan->indexInfo, + scan->xs_itup, + scan->heapRelation, + &scan->xs_ctup); + } + + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + /* * Only in a non-MVCC snapshot can more than one member of the HOT * chain be visible. */ scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); pgstat_count_heap_fetch(scan->indexRelation); - return &scan->xs_ctup; + + if (res) + return &scan->xs_ctup; } + else + { + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); - /* We've reached the end of the HOT chain. */ - scan->xs_continue_hot = false; + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + } return NULL; }
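Before the nbtree changes, it helps to restate what index_fetch_heap now does: the heap search can report that the visible tuple sits on a WARM chain, and only then is the AM's amrecheck callback consulted to confirm that this particular index entry really leads to a matching tuple. The standalone sketch below models just that decision; all types and names are illustrative stand-ins, not the real executor structures.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef struct { int indexed_value; } ToyHeapTuple;

    /* stand-in for an index AM's amrecheck callback */
    typedef bool (*recheck_fn)(int index_key, const ToyHeapTuple *tup);

    static bool
    toy_btrecheck(int index_key, const ToyHeapTuple *tup)
    {
        /* compare the key stored in the index with the fetched heap value */
        return index_key == tup->indexed_value;
    }

    /* model of index_fetch_heap: recheck only when the chain was WARM */
    static const ToyHeapTuple *
    fetch_heap(const ToyHeapTuple *visible, bool tuple_recheck,
               int index_key, recheck_fn recheck)
    {
        if (visible == NULL)
            return NULL;        /* nothing visible in the chain */
        if (tuple_recheck && recheck != NULL && !recheck(index_key, visible))
            return NULL;        /* WARM pointer leads to a non-matching tuple */
        return visible;
    }

    int
    main(void)
    {
        ToyHeapTuple t = { .indexed_value = 42 };

        /* CLEAR chain: returned without any recheck */
        printf("%p\n", (void *) fetch_heap(&t, false, 7, toy_btrecheck));
        /* WARM chain whose key no longer matches: filtered out */
        printf("%p\n", (void *) fetch_heap(&t, true, 7, toy_btrecheck));
        return 0;
    }

In the patch, the recheck is attempted only when the AM registered an amrecheck routine; AMs that leave it NULL rely on WARM being disabled for their tables, as the comment above explains.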
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 6dca810..328184b 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -20,12 +20,12 @@ #include "access/nbtxlog.h" #include "access/transam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "utils/tqual.h" - typedef struct { /* context data for _bt_checksplitloc */ @@ -250,6 +250,10 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + Buffer buffer; + HeapTupleData heapTuple; + bool recheck = false; + IndexInfo *indexInfo = BuildIndexInfo(rel); /* Assume unique until we find a duplicate */ *is_unique = true; @@ -309,6 +313,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, curitup = (IndexTuple) PageGetItem(page, curitemid); htid = curitup->t_tid; + recheck = false; + /* * If we are doing a recheck, we expect to find the tuple we * are rechecking. It's not a duplicate, but we have to keep @@ -326,112 +332,153 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * have just a single index entry for the entire chain. */ else if (heap_hot_search(&htid, heapRel, &SnapshotDirty, - &all_dead)) + &all_dead, &recheck, &buffer, + &heapTuple)) { TransactionId xwait; + bool result = true; /* - * It is a duplicate. If we are only doing a partial - * check, then don't bother checking if the tuple is being - * updated in another transaction. Just return the fact - * that it is a potential conflict and leave the full - * check till later. + * If the tuple was WARM updated, we may again see our own + * tuple. Since WARM updates don't create new index + * entries, our own tuple is only reachable via the old + * index pointer. */ - if (checkUnique == UNIQUE_CHECK_PARTIAL) + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - *is_unique = false; - return InvalidTransactionId; + found = true; + result = false; + if (recheck) + UnlockReleaseBuffer(buffer); } - - /* - * If this tuple is being updated by other transaction - * then we have to wait for its commit/abort. - */ - xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? - SnapshotDirty.xmin : SnapshotDirty.xmax; - - if (TransactionIdIsValid(xwait)) + else if (recheck) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - /* Tell _bt_doinsert to wait... */ - *speculativeToken = SnapshotDirty.speculativeToken; - return xwait; + result = btrecheck(rel, indexInfo, curitup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); } - /* - * Otherwise we have a definite conflict. But before - * complaining, look to see if the tuple we want to insert - * is itself now committed dead --- if so, don't complain. - * This is a waste of time in normal scenarios but we must - * do it to support CREATE INDEX CONCURRENTLY. - * - * We must follow HOT-chains here because during - * concurrent index build, we insert the root TID though - * the actual tuple may be somewhere in the HOT-chain. - * While following the chain we might not stop at the - * exact tuple which triggered the insert, but that's OK - * because if we find a live tuple anywhere in this chain, - * we have a unique key conflict.
The other live tuple is - * not part of this chain because it had a different index - * entry. - */ - htid = itup->t_tid; - if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) - { - /* Normal case --- it's still live */ - } - else + if (result) { /* - * It's been deleted, so no error, and no need to - * continue searching + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. */ - break; - } + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } - /* - * Check for a conflict-in as we would if we were going to - * write to this page. We aren't actually going to write, - * but we want a chance to report SSI conflicts that would - * otherwise be masked by this unique constraint - * violation. - */ - CheckForSerializableConflictIn(rel, NULL, buf); + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } - /* - * This is a definite conflict. Break the tuple down into - * datums and report the error. But first, make sure we - * release the buffer locks we're holding --- - * BuildIndexValueDescription could make catalog accesses, - * which in the worst case might touch this same index and - * cause deadlocks. - */ - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - _bt_relbuf(rel, buf); + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + recheck = false; + ItemPointerCopy(&itup->t_tid, &htid); + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL, + &recheck, &buffer, &heapTuple)) + { + bool result = true; + if (recheck) + { + /* + * Recheck if the tuple actually satisfies the + * index key. Otherwise, we might be following + * a wrong index pointer and mustn't entertain + * this tuple. + */ + result = btrecheck(rel, indexInfo, itup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); + } + if (!result) + break; + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching. 
+ */ + break; + } - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - index_deform_tuple(itup, RelationGetDescr(rel), - values, isnull); - - key_desc = BuildIndexValueDescription(rel, values, - isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("duplicate key value violates unique constraint \"%s\"", - RelationGetRelationName(rel)), - key_desc ? errdetail("Key %s already exists.", - key_desc) : 0, - errtableconstraint(heapRel, - RelationGetRelationName(rel)))); + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } } } else if (all_dead) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index f815fd4..ce1bea0 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -766,29 +766,20 @@ _bt_page_recyclable(Page page) } /* - * Delete item(s) from a btree page during VACUUM. + * Delete item(s) and clear WARM item(s) on a btree page during VACUUM. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes - * deleting the page it points to. + * deleting the page it points to. We don't ever clear pointers on a non-leaf + * page. * * This routine assumes that the caller has pinned and locked the buffer. * Also, the given itemnos *must* appear in increasing order in the array. - * - * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby - * we need to be able to pin all of the blocks in the btree in physical - * order when replaying the effects of a VACUUM, just as we do for the - * original VACUUM itself. lastBlockVacuumed allows us to tell whether an - * intermediate range of blocks has had no changes at all by VACUUM, - * and so must be scanned anyway during replay. We always write a WAL record - * for the last block in the index, whether or not it contained any items - * to be removed. This allows us to scan right up to end of index to - * ensure correct locking. 
*/ void -_bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed) +_bt_handleitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *delitemnos, int ndelitems, + OffsetNumber *clearitemnos, int nclearitems) { Page page = BufferGetPage(buf); BTPageOpaque opaque; @@ -796,9 +787,20 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* + * Clear the WARM pointers. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the array and + * hence offnums recorded earlier won't make any sense after + * PageIndexMultiDelete is called. + */ + if (nclearitems > 0) + _bt_clear_items(page, clearitemnos, nclearitems); + /* Fix the page */ - if (nitems > 0) - PageIndexMultiDelete(page, itemnos, nitems); + if (ndelitems > 0) + PageIndexMultiDelete(page, delitemnos, ndelitems); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -824,7 +826,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, XLogRecPtr recptr; xl_btree_vacuum xlrec_vacuum; - xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.ndelitems = ndelitems; + xlrec_vacuum.nclearitems = nclearitems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -835,8 +838,11 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ - if (nitems > 0) - XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + if (ndelitems > 0) + XLogRegisterBufData(0, (char *) delitemnos, ndelitems * sizeof(OffsetNumber)); + + if (nclearitems > 0) + XLogRegisterBufData(0, (char *) clearitemnos, nclearitems * sizeof(OffsetNumber)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1882,3 +1888,18 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) return true; } + +void +_bt_clear_items(Page page, OffsetNumber *clearitemnos, uint16 nclearitems) +{ + int i; + ItemId itemid; + IndexTuple itup; + + for (i = 0; i < nclearitems; i++) + { + itemid = PageGetItemId(page, clearitemnos[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + ItemPointerClearFlags(&itup->t_tid); + } +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 775f2ff..6d558af 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -146,6 +146,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; amroutine->aminsert = btinsert; + amroutine->amwarminsert = btwarminsert; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; @@ -163,6 +164,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = btestimateparallelscan; amroutine->aminitparallelscan = btinitparallelscan; amroutine->amparallelrescan = btparallelrescan; + amroutine->amrecheck = btrecheck; PG_RETURN_POINTER(amroutine); } @@ -315,11 +317,12 @@ btbuildempty(Relation index) * Descend the tree recursively, find the appropriate location for our * new tuple, and put it there. 
*/ -bool -btinsert(Relation rel, Datum *values, bool *isnull, +static bool +btinsert_internal(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, + bool warm_update) { bool result; IndexTuple itup; @@ -328,6 +331,11 @@ btinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), values, isnull); itup->t_tid = *ht_ctid; + if (warm_update) + ItemPointerSetFlags(&itup->t_tid, BTREE_INDEX_WARM_POINTER); + else + ItemPointerClearFlags(&itup->t_tid); + result = _bt_doinsert(rel, itup, checkUnique, heapRel); pfree(itup); @@ -335,6 +343,26 @@ btinsert(Relation rel, Datum *values, bool *isnull, return result; } +bool +btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return btinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, false); +} + +bool +btwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return btinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, true); +} + /* * btgettuple() -- Get the next tuple in the scan. */ @@ -1103,7 +1131,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_handleitems_vacuum(rel, buf, NULL, 0, NULL, 0); _bt_relbuf(rel, buf); } @@ -1201,6 +1229,8 @@ restart: { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + OffsetNumber clearwarm[MaxOffsetNumber]; + int nclearwarm; OffsetNumber offnum, minoff, maxoff; @@ -1239,7 +1269,7 @@ restart: * Scan over all items to see which ones need deleted according to the * callback function. */ - ndeletable = 0; + ndeletable = nclearwarm = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1250,6 +1280,9 @@ restart: { IndexTuple itup; ItemPointer htup; + int flags; + bool is_warm = false; + IndexBulkDeleteCallbackResult result; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); @@ -1276,16 +1309,36 @@ restart: * applies to *any* type of index that marks index tuples as * killed. */ - if (callback(htup, callback_state)) + flags = ItemPointerGetFlags(&itup->t_tid); + is_warm = ((flags & BTREE_INDEX_WARM_POINTER) != 0); + + if (is_warm) + stats->num_warm_pointers++; + else + stats->num_clear_pointers++; + + result = callback(htup, is_warm, callback_state); + if (result == IBDCR_DELETE) + { + if (is_warm) + stats->warm_pointers_removed++; + else + stats->clear_pointers_removed++; deletable[ndeletable++] = offnum; + } + else if (result == IBDCR_CLEAR_WARM) + { + clearwarm[nclearwarm++] = offnum; + } } } /* - * Apply any needed deletes. We issue just one _bt_delitems_vacuum() - * call per page, so as to minimize WAL traffic. + * Apply any needed deletes and clearing. We issue just one + * _bt_handleitems_vacuum() call per page, so as to minimize WAL + * traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nclearwarm > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1301,8 +1354,8 @@ restart: * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. 
*/ - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, - vstate->lastBlockVacuumed); + _bt_handleitems_vacuum(rel, buf, deletable, ndeletable, + clearwarm, nclearwarm); /* * Remember highest leaf page number we've issued a @@ -1312,6 +1365,7 @@ restart: vstate->lastBlockVacuumed = blkno; stats->tuples_removed += ndeletable; + stats->pointers_cleared += nclearwarm; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5b259a3..8dab5a8 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,11 +20,13 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" #include "miscadmin.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/datum.h" typedef struct BTSortArrayContext @@ -2069,3 +2071,64 @@ btproperty(Oid index_oid, int attno, return false; /* punt to generic code */ } } + +/* + * Check if the index tuple's key matches the one computed from the given heap + * tuple's attributes + */ +bool +btrecheck(Relation indexRel, IndexInfo *indexInfo, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + bool isavail[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + FormIndexPlainDatum(indexInfo, heapRel, heapTuple, values, isnull, isavail); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + /* No need to compare if the attribute value is not available */ + if (!isavail[i - 1]) + continue; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL, then they are equal + */ + if (isnull[i - 1] && indxisnull) + continue; + + /* + * If just one is NULL, then they are not equal + */ + if (isnull[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now just do a raw memory comparison. If the index tuple was formed + * using this heap tuple, the computed index values must match + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + return equal; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index ac60db0..92be5c8 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -390,83 +390,9 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); /* - * This section of code is thought to be no longer needed, after analysis - * of the calling paths. It is retained to allow the code to be reinstated - * if a flaw is revealed in that thinking. - * - * If we are running non-MVCC scans using this index we need to do some - * additional work to ensure correctness, which is known as a "pin scan" - * described in more detail in next paragraphs. We used to do the extra - * work in all cases, whereas we now avoid that work in most cases. If - * lastBlockVacuumed is set to InvalidBlockNumber then we skip the - * additional work required for the pin scan. - * - * Avoiding this extra work is important since it requires us to touch - * every page in the index, so is an O(N) operation.
Worse, it is an - * operation performed in the foreground during redo, so it delays - * replication directly. - * - * If queries might be active then we need to ensure every leaf page is - * unpinned between the lastBlockVacuumed and the current block, if there - * are any. This prevents replay of the VACUUM from reaching the stage of - * removing heap tuples while there could still be indexscans "in flight" - * to those particular tuples for those scans which could be confused by - * finding new tuples at the old TID locations (see nbtree/README). - * - * It might be worth checking if there are actually any backends running; - * if not, we could just skip this. - * - * Since VACUUM can visit leaf pages out-of-order, it might issue records - * with lastBlockVacuumed >= block; that's not an error, it just means - * nothing to do now. - * - * Note: since we touch all pages in the range, we will lock non-leaf - * pages, and also any empty (all-zero) pages that may be in the index. It - * doesn't seem worth the complexity to avoid that. But it's important - * that HotStandbyActiveInReplay() will not return true if the database - * isn't yet consistent; so we need not fear reading still-corrupt blocks - * here during crash recovery. - */ - if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) - { - RelFileNode thisrnode; - BlockNumber thisblkno; - BlockNumber blkno; - - XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); - - for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) - { - /* - * We use RBM_NORMAL_NO_LOG mode because it's not an error - * condition to see all-zero pages. The original btvacuumpage - * scan would have skipped over all-zero pages, noting them in FSM - * but not bothering to initialize them just yet; so we mustn't - * throw an error here. (We could skip acquiring the cleanup lock - * if PageIsNew, but it's probably not worth the cycles to test.) - * - * XXX we don't actually need to read the block, we just need to - * confirm it is unpinned. If we had a special call into the - * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. Optimizing - * this is now moot, since in most cases we avoid the scan. - */ - buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, - RBM_NORMAL_NO_LOG); - if (BufferIsValid(buffer)) - { - LockBufferForCleanup(buffer); - UnlockReleaseBuffer(buffer); - } - } - } -#endif - - /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ @@ -482,19 +408,30 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + OffsetNumber *offnums = (OffsetNumber *) ptr; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + /* + * Clear the WARM pointers. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the + * array and hence offnums recorded earlier won't make any sense + * after PageIndexMultiDelete is called. 
+ */ + if (xlrec->nclearitems > 0) + _bt_clear_items(page, offnums + xlrec->ndelitems, + xlrec->nclearitems); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* + * And handle the deleted items too + */ + if (xlrec->ndelitems > 0) + PageIndexMultiDelete(page, offnums, xlrec->ndelitems); } /* * Mark the page as not containing any LP_DEAD items --- see comments - * in _bt_delitems_vacuum(). + * in _bt_handleitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 44d2d63..d373e61 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -44,6 +44,12 @@ heap_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "off %u", xlrec->offnum); } + else if (info == XLOG_HEAP_MULTI_INSERT) + { + xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; + + appendStringInfo(buf, "%d tuples", xlrec->ntuples); + } else if (info == XLOG_HEAP_DELETE) { xl_heap_delete *xlrec = (xl_heap_delete *) rec; @@ -102,7 +108,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record) char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; + info &= XLOG_HEAP2_OPMASK; if (info == XLOG_HEAP2_CLEAN) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; @@ -129,12 +135,6 @@ heap2_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "cutoff xid %u flags %d", xlrec->cutoff_xid, xlrec->flags); } - else if (info == XLOG_HEAP2_MULTI_INSERT) - { - xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; - - appendStringInfo(buf, "%d tuples", xlrec->ntuples); - } else if (info == XLOG_HEAP2_LOCK_UPDATED) { xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec; @@ -171,6 +171,12 @@ heap_identify(uint8 info) case XLOG_HEAP_INSERT | XLOG_HEAP_INIT_PAGE: id = "INSERT+INIT"; break; + case XLOG_HEAP_MULTI_INSERT: + id = "MULTI_INSERT"; + break; + case XLOG_HEAP_MULTI_INSERT | XLOG_HEAP_INIT_PAGE: + id = "MULTI_INSERT+INIT"; + break; case XLOG_HEAP_DELETE: id = "DELETE"; break; @@ -219,12 +225,6 @@ heap2_identify(uint8 info) case XLOG_HEAP2_VISIBLE: id = "VISIBLE"; break; - case XLOG_HEAP2_MULTI_INSERT: - id = "MULTI_INSERT"; - break; - case XLOG_HEAP2_MULTI_INSERT | XLOG_HEAP_INIT_PAGE: - id = "MULTI_INSERT+INIT"; - break; case XLOG_HEAP2_LOCK_UPDATED: id = "LOCK_UPDATED"; break; diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index fbde9d6..6b2c5d6 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -48,8 +48,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "ndelitems %u, nclearitems %u", + xlrec->ndelitems, xlrec->nclearitems); break; } case XLOG_BTREE_DELETE: diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index e57ac49..59ef7f3 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -72,6 +72,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/spgist/spgvacuum.c 
b/src/backend/access/spgist/spgvacuum.c index cce9b3f..711d351 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -155,7 +155,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, { Assert(ItemPointerIsValid(&lt->heapPtr)); - if (bds->callback(&lt->heapPtr, bds->callback_state)) + if (bds->callback(&lt->heapPtr, false, bds->callback_state) == + IBDCR_DELETE) { bds->stats->tuples_removed += 1; deletable[i] = true; @@ -425,7 +426,8 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Assert(ItemPointerIsValid(&lt->heapPtr)); - if (bds->callback(&lt->heapPtr, bds->callback_state)) + if (bds->callback(&lt->heapPtr, false, bds->callback_state) == + IBDCR_DELETE) { bds->stats->tuples_removed += 1; toDelete[xlrec.nDelete] = i; @@ -902,10 +904,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } /* Dummy callback to delete no tuples during spgvacuumcleanup */ -static bool -dummy_callback(ItemPointer itemptr, void *state) +static IndexBulkDeleteCallbackResult +dummy_callback(ItemPointer itemptr, bool is_warm, void *state) { - return false; + return IBDCR_KEEP; } /*
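All the bulk-delete paths touched by this patch funnel through the same widened callback contract, which is worth stating once: the callback now takes an is_warm hint and returns a three-valued result rather than a bool, and a WARM-unaware AM (like the SP-GiST code above) always passes false and acts only on IBDCR_DELETE. A compilable model of that contract follows; the enum tags mirror the patch, everything else is an illustrative stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum
    {
        IBDCR_KEEP,         /* keep the index tuple as it is */
        IBDCR_DELETE,       /* remove the index tuple */
        IBDCR_CLEAR_WARM    /* clear the WARM flag on the pointer */
    } IndexBulkDeleteCallbackResult;

    typedef int ToyTid;

    typedef IndexBulkDeleteCallbackResult
            (*ToyBulkDeleteCallback)(ToyTid tid, bool is_warm, void *state);

    static IndexBulkDeleteCallbackResult
    example_callback(ToyTid tid, bool is_warm, void *state)
    {
        int *dead_below = (int *) state;

        if (tid < *dead_below)
            return IBDCR_DELETE;
        /* pretend every surviving WARM pointer was verified convertible */
        return is_warm ? IBDCR_CLEAR_WARM : IBDCR_KEEP;
    }

    /* a WARM-unaware AM only ever deletes; it never clears WARM bits */
    static void
    simple_am_vacuum_item(ToyTid tid, ToyBulkDeleteCallback cb, void *state)
    {
        if (cb(tid, false, state) == IBDCR_DELETE)
            printf("tid %d removed\n", tid);
    }

    int
    main(void)
    {
        int dead_below = 10;

        simple_am_vacuum_item(5, example_callback, &dead_below);   /* removed */
        simple_am_vacuum_item(15, example_callback, &dead_below);  /* kept */
        return 0;
    }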
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8d42a34..67e68d1 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -54,6 +54,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/var.h" #include "parser/parser.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -114,7 +115,7 @@ static void IndexCheckExclusion(Relation heapRelation, IndexInfo *indexInfo); static inline int64 itemptr_encode(ItemPointer itemptr); static inline void itemptr_decode(ItemPointer itemptr, int64 encoded); -static bool validate_index_callback(ItemPointer itemptr, void *opaque); +static IndexBulkDeleteCallbackResult validate_index_callback(ItemPointer itemptr, bool is_warm, void *opaque); static void validate_index_heapscan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, @@ -1691,6 +1692,20 @@ BuildIndexInfo(Relation index) ii->ii_AmCache = NULL; ii->ii_Context = CurrentMemoryContext; + /* build a bitmap of all table attributes referred to by this index */ + for (i = 0; i < ii->ii_NumIndexAttrs; i++) + { + AttrNumber attr = ii->ii_KeyAttrNumbers[i]; + ii->ii_indxattrs = bms_add_member(ii->ii_indxattrs, attr - + FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) ii->ii_Expressions, 1, &ii->ii_indxattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) ii->ii_Predicate, 1, &ii->ii_indxattrs); + return ii; } @@ -1816,6 +1831,50 @@ FormIndexDatum(IndexInfo *indexInfo, elog(ERROR, "wrong number of index expressions"); } +/* + * This is the same as FormIndexDatum, but we don't compute any expression + * attributes, and hence it can be used when executor interfaces are not available. + * If the i'th attribute is available then isavail[i] is set to true, else it is + * set to false. The caller must always check whether an attribute value is + * available before trying to do anything useful with it. + */ +void +FormIndexPlainDatum(IndexInfo *indexInfo, + Relation heapRel, + HeapTuple heapTup, + Datum *values, + bool *isnull, + bool *isavail) +{ + int i; + + for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) + { + int keycol = indexInfo->ii_KeyAttrNumbers[i]; + Datum iDatum; + bool isNull; + + if (keycol != 0) + { + /* + * Plain index column; get the value we need directly from the + * heap tuple. + */ + iDatum = heap_getattr(heapTup, keycol, RelationGetDescr(heapRel), &isNull); + values[i] = iDatum; + isnull[i] = isNull; + isavail[i] = true; + } + else + { + /* + * This is an expression attribute which can't be computed by us. + * So just inform the caller about it. + */ + isavail[i] = false; + } + } +} /* * index_update_stats --- update pg_class entry after CREATE INDEX or REINDEX @@ -2934,15 +2993,15 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) /* * validate_index_callback - bulkdelete callback to collect the index TIDs */ -static bool -validate_index_callback(ItemPointer itemptr, void *opaque) +static IndexBulkDeleteCallbackResult +validate_index_callback(ItemPointer itemptr, bool is_warm, void *opaque) { v_i_state *state = (v_i_state *) opaque; int64 encoded = itemptr_encode(itemptr); tuplesort_putdatum(state->tuplesort, Int64GetDatum(encoded), false); state->itups += 1; - return false; /* never actually delete anything */ + return IBDCR_KEEP; /* never actually delete anything */ } /* @@ -3163,7 +3222,8 @@ validate_index_heapscan(Relation heapRelation, heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - indexInfo); + indexInfo, + false); state->tups_inserted += 1; } diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index abc344a..6392f33 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -66,10 +66,15 @@ CatalogCloseIndexes(CatalogIndexState indstate) * * This should be called for each inserted or updated catalog tuple. * + * If the tuple was WARM updated, modified_attrs contains the set of + * columns modified by the update. We must not insert new index entries for + * indexes which do not refer to any of the modified columns. + * * This is effectively a cut-down version of ExecInsertIndexTuples. */ static void -CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) +CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, + Bitmapset *modified_attrs, bool warm_update) { int i; int numIndexes; @@ -79,12 +84,28 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) IndexInfo **indexInfoArray; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + ItemPointerData root_tid; - /* HOT update does not require index inserts */ - if (HeapTupleIsHeapOnly(heapTuple)) + /* + * A HOT update does not require index inserts, but a WARM update may + * require them for some indexes. + */ + if (HeapTupleIsHeapOnly(heapTuple) && !warm_update) return; /* + * If we've done a WARM update, then we must index the TID of the root line + * pointer and not the actual TID of the new tuple. + */ + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(heapTuple->t_self)), + HeapTupleHeaderGetRootOffset(heapTuple->t_data)); + else + ItemPointerCopy(&heapTuple->t_self, &root_tid); + + + /* * Get information from the state structure. Fall out if nothing to do.
*/ numIndexes = indstate->ri_NumIndices; @@ -112,6 +133,17 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) continue; /* + * If we've done WARM update, then we must not insert a new index tuple + * if none of the index keys have changed. This is not just an + * optimization, but a requirement for WARM to work correctly. + */ + if (warm_update) + { + if (!bms_overlap(modified_attrs, indexInfo->ii_indxattrs)) + continue; + } + + /* * Expressional and partial indexes on system catalogs are not * supported, nor exclusion constraints, nor deferred uniqueness */ @@ -136,11 +168,12 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) index_insert(relationDescs[i], /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - &(heapTuple->t_self), /* tid of heap tuple */ + &root_tid, heapRelation, relationDescs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - indexInfo); + indexInfo, + warm_update); } ExecDropSingleTupleTableSlot(slot); @@ -168,7 +201,7 @@ CatalogTupleInsert(Relation heapRel, HeapTuple tup) oid = simple_heap_insert(heapRel, tup); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, NULL, false); CatalogCloseIndexes(indstate); return oid; @@ -190,7 +223,7 @@ CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup, oid = simple_heap_insert(heapRel, tup); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, NULL, false); return oid; } @@ -210,12 +243,14 @@ void CatalogTupleUpdate(Relation heapRel, ItemPointer otid, HeapTuple tup) { CatalogIndexState indstate; + bool warm_update; + Bitmapset *modified_attrs; indstate = CatalogOpenIndexes(heapRel); - simple_heap_update(heapRel, otid, tup); + simple_heap_update(heapRel, otid, tup, &modified_attrs, &warm_update); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, modified_attrs, warm_update); CatalogCloseIndexes(indstate); } @@ -231,9 +266,12 @@ void CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, CatalogIndexState indstate) { - simple_heap_update(heapRel, otid, tup); + Bitmapset *modified_attrs; + bool warm_update; + + simple_heap_update(heapRel, otid, tup, &modified_attrs, &warm_update); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, modified_attrs, warm_update); } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index b6552da..15d0fe4 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -498,6 +498,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_tuples_warm_updated(C.oid) AS n_tup_warm_upd, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_mod_since_analyze(C.oid) AS n_mod_since_analyze, @@ -528,7 +529,8 @@ CREATE VIEW pg_stat_xact_all_tables AS pg_stat_get_xact_tuples_inserted(C.oid) AS n_tup_ins, pg_stat_get_xact_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_xact_tuples_deleted(C.oid) AS n_tup_del, - pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd + pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_xact_tuples_warm_updated(C.oid) AS n_tup_warm_upd FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) diff --git a/src/backend/commands/constraint.c 
b/src/backend/commands/constraint.c index e2544e5..330b661 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -40,6 +40,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) TriggerData *trigdata = castNode(TriggerData, fcinfo->context); const char *funcname = "unique_key_recheck"; HeapTuple new_row; + HeapTupleData heapTuple; ItemPointerData tmptid; Relation indexRel; IndexInfo *indexInfo; @@ -102,7 +103,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) * removed. */ tmptid = new_row->t_self; - if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL)) + if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL, + NULL, NULL, &heapTuple)) { /* * All rows in the HOT chain are dead, so skip the check. @@ -166,7 +168,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ index_insert(indexRel, values, isnull, &(new_row->t_self), trigdata->tg_relation, UNIQUE_CHECK_EXISTING, - indexInfo); + indexInfo, + false); } else { diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ba89b29..120e261 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2681,6 +2681,8 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), + NULL, estate, false, NULL, @@ -2835,6 +2837,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); recheckIndexes = ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), + &(bufferedTuples[i]->t_self), NULL, estate, false, NULL, NIL); ExecARInsertTriggers(estate, resultRelInfo, bufferedTuples[i], diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 9618032..1b2abd4 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -694,7 +694,14 @@ DefineIndex(Oid relationId, * visible to other transactions before we start to build the index. That * will prevent them from making incompatible HOT updates. The new index * will be marked not indisready and not indisvalid, so that no one else - * tries to either insert into it or use it for queries. + * tries to either insert into it or use it for queries. In addition to + * that, WARM updates will be disallowed if an update is modifying one of + * the columns used by this new index. This is necessary to ensure that we + * don't create WARM tuples which do not have a corresponding entry in this + * index. It must be noted that during the second phase, we will index only + * those heap tuples whose root line pointer is not already in the index, + * hence it's important that all tuples in a given chain have the same + * value for any indexed column (including this new index). * * We must commit our current transaction so that the index becomes * visible; then start another. Note that all the data structures we just @@ -742,7 +749,10 @@ DefineIndex(Oid relationId, * marked as "not-ready-for-inserts". The index is consulted while * deciding HOT-safety though. This arrangement ensures that no new HOT * chains can be created where the new tuple and the old tuple in the - * chain have different index keys. + * chain have different index keys. Also, the new index is consulted for + * deciding whether a WARM update is possible, and a WARM update is not done + * if a column used by this index is being updated. This ensures that we + * don't create WARM tuples which are not indexed by this index. * * We now take a new snapshot, and build the index using all tuples that * are visible in this snapshot. We can be sure that any HOT updates to @@ -777,7 +787,8 @@ DefineIndex(Oid relationId, /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must - * insert new entries into the index for insertions and non-HOT updates. + * insert new entries into the index for insertions and non-HOT updates or + * WARM updates where this index needs a new entry. */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY);
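DefineIndex's guarantees and CatalogIndexInsert's skip rule are two sides of the same invariant: after a WARM update, an index receives a new entry only if the update modified at least one attribute that the index uses (key columns, expression inputs, or predicate columns, per the ii_indxattrs bitmap built in BuildIndexInfo above). The toy model below captures the overlap test with a plain integer bitmask standing in for Bitmapset and bms_overlap; it is a sketch, not the real API.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* toy attribute set: bit n set means attribute n is in the set */
    typedef uint64_t AttrSet;

    static bool
    attrs_overlap(AttrSet a, AttrSet b)
    {
        return (a & b) != 0;
    }

    static void
    warm_index_maintenance(AttrSet modified, AttrSet index_attrs,
                           const char *index_name)
    {
        /*
         * WARM correctness rule: if none of this index's attributes
         * changed, the existing entry still reaches the chain's root and
         * no new entry may be inserted for it.
         */
        if (attrs_overlap(modified, index_attrs))
            printf("%s: insert new WARM-flagged entry\n", index_name);
        else
            printf("%s: reuse existing entry, skip insert\n", index_name);
    }

    int
    main(void)
    {
        AttrSet modified = (AttrSet) 1 << 3;    /* update touched attr 3 */

        warm_index_maintenance(modified,
                               ((AttrSet) 1 << 1) | ((AttrSet) 1 << 2),
                               "idx_a");
        warm_index_maintenance(modified, (AttrSet) 1 << 3, "idx_b");
        return 0;
    }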
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index b74e493..2b054f7 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -104,6 +104,39 @@ */ #define PREFETCH_SIZE ((BlockNumber) 32) +/* + * Structure to track WARM chains that can be converted into HOT chains during + * this run. + * + * To reduce space requirement, we're using bitfields. But the way things are + * laid out, we're still wasting one byte per candidate chain. + */ +typedef struct LVWarmChain +{ + ItemPointerData chain_tid; /* root of the chain */ + + /* + * 1 - if the chain contains only post-warm tuples + * 0 - if the chain contains only pre-warm tuples + */ + uint8 is_postwarm_chain:2; + + /* 1 - if this chain can't be cleared of WARM tuples */ + uint8 keep_warm_chain:2; + + /* + * Number of CLEAR pointers to this root TID found so far - must never be + * more than 2. + */ + uint8 num_clear_pointers:2; + + /* + * Number of WARM pointers to this root TID found so far - must never be + * more than 1. + */ + uint8 num_warm_pointers:2; +} LVWarmChain; + typedef struct LVRelStats { /* hasindex = true means two-pass strategy; false means one-pass */ @@ -122,6 +155,14 @@ typedef struct LVRelStats BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + /* List of candidate WARM chains that can be converted into HOT chains */ + /* NB: this list is ordered by TID of the root pointers */ + int num_warm_chains; /* current # of entries */ + int max_warm_chains; /* # slots allocated in array */ + LVWarmChain *warm_chains; /* array of LVWarmChain */ + double num_non_convertible_warm_chains; + /* List of TIDs of tuples we intend to delete */ /* NB: this list is ordered by TID address */ int num_dead_tuples; /* current # of entries */ @@ -150,6 +191,7 @@ static void lazy_scan_heap(Relation onerel, int options, static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); static bool lazy_check_needs_freeze(Buffer buf, bool *hastup); static void lazy_vacuum_index(Relation indrel, + bool clear_warm, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats); static void lazy_cleanup_index(Relation indrel, @@ -157,6 +199,10 @@ static void lazy_cleanup_index(Relation indrel, LVRelStats *vacrelstats); static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); +static int lazy_warmclear_page(Relation onerel, BlockNumber blkno, + Buffer buffer, int chainindex, LVRelStats *vacrelstats, + Buffer *vmbuffer, bool check_all_visible); +static void lazy_reset_warm_pointer_count(LVRelStats *vacrelstats); static bool should_attempt_truncation(LVRelStats *vacrelstats); static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); static BlockNumber count_nondeletable_pages(Relation onerel, @@ -164,8 +210,15 @@ static BlockNumber count_nondeletable_pages(Relation onerel, static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); static void lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr); -static bool lazy_tid_reaped(ItemPointer itemptr, void *state); +static void lazy_record_warm_chain(LVRelStats *vacrelstats, + ItemPointer itemptr); +static void lazy_record_clear_chain(LVRelStats *vacrelstats, + ItemPointer itemptr); +static IndexBulkDeleteCallbackResult lazy_tid_reaped(ItemPointer itemptr, bool is_warm, void *state); +static IndexBulkDeleteCallbackResult lazy_indexvac_phase1(ItemPointer itemptr, bool is_warm, void *state); +static IndexBulkDeleteCallbackResult lazy_indexvac_phase2(ItemPointer itemptr, bool is_warm, void *state); static int vac_cmp_itemptr(const void *left, const void *right); +static int vac_cmp_warm_chain(const void *left, const void *right); static bool heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen);
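A quick sanity check on the struct above: ItemPointerData is 6 bytes and the four 2-bit fields pack into a single additional byte, so the struct rounds up to 8 bytes, which is the one wasted byte the comment mentions. The standalone sketch below reproduces the layout with a 4-byte stand-in for the TID; the field names follow the struct, everything else is illustrative.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct
    {
        uint32_t chain_tid;             /* stand-in for ItemPointerData */
        uint8_t  is_postwarm_chain:2;
        uint8_t  keep_warm_chain:2;
        uint8_t  num_clear_pointers:2;  /* documented maximum: 2 */
        uint8_t  num_warm_pointers:2;   /* documented maximum: 1 */
    } ToyWarmChain;

    int
    main(void)
    {
        ToyWarmChain c = {0};

        /* the documented maxima fit: a 2-bit field holds values 0..3 */
        c.num_clear_pointers++;
        c.num_clear_pointers++;
        c.num_warm_pointers++;
        assert(c.num_clear_pointers == 2 && c.num_warm_pointers == 1);

        printf("sizeof(ToyWarmChain) = %zu\n", sizeof(ToyWarmChain));
        return 0;
    }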
@@ -690,8 +743,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ - if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && - vacrelstats->num_dead_tuples > 0) + if (((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && + vacrelstats->num_dead_tuples > 0) || + ((vacrelstats->max_warm_chains - vacrelstats->num_warm_chains) < MaxHeapTuplesPerPage && + vacrelstats->num_warm_chains > 0)) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -721,6 +776,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], + (vacrelstats->num_warm_chains > 0), &indstats[i], vacrelstats); @@ -743,6 +799,9 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * valid. */ vacrelstats->num_dead_tuples = 0; + vacrelstats->num_warm_chains = 0; + memset(vacrelstats->warm_chains, 0, + vacrelstats->max_warm_chains * sizeof (LVWarmChain)); vacrelstats->num_index_scans++; /* Report that we are once again scanning the heap */ @@ -947,15 +1006,31 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, continue; } + ItemPointerSet(&(tuple.t_self), blkno, offnum); + /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { + HeapCheckWarmChainStatus status = heap_check_warm_chain(page, + &tuple.t_self, false); + if (HCWC_IS_WARM_UPDATED(status)) + { + /* + * A chain which is either completely WARM or CLEAR is a + * candidate for chain conversion. Remember the chain and + * whether the chain has all WARM tuples or not.
+ */ + if (HCWC_IS_ALL_WARM(status)) + lazy_record_warm_chain(vacrelstats, &tuple.t_self); + else if (HCWC_IS_ALL_CLEAR(status)) + lazy_record_clear_chain(vacrelstats, &tuple.t_self); + else + vacrelstats->num_non_convertible_warm_chains++; + } hastup = true; /* this page won't be truncatable */ continue; } - ItemPointerSet(&(tuple.t_self), blkno, offnum); - /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at @@ -975,6 +1050,26 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(onerel); + if (!HeapTupleIsHeapOnly(&tuple)) + { + HeapCheckWarmChainStatus status = heap_check_warm_chain(page, + &tuple.t_self, false); + if (HCWC_IS_WARM_UPDATED(status)) + { + /* + * A chain which is either completely WARM or CLEAR is a + * candidate for chain conversion. Remember the chain and + * whether it is all-WARM or all-CLEAR. + */ + if (HCWC_IS_ALL_WARM(status)) + lazy_record_warm_chain(vacrelstats, &tuple.t_self); + else if (HCWC_IS_ALL_CLEAR(status)) + lazy_record_clear_chain(vacrelstats, &tuple.t_self); + else + vacrelstats->num_non_convertible_warm_chains++; + } + } + tupgone = false; switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) @@ -1040,6 +1135,19 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, break; } + /* + * If this tuple was ever WARM updated or is a WARM + * tuple, there could be multiple index entries + * pointing to the root of this chain. We can't do + * index-only scans for such tuples without verifying + * the index keys. So mark the page as !all_visible + */ + if (HeapTupleHeaderIsWarmUpdated(tuple.t_data)) + { + all_visible = false; + break; + } + /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, visibility_cutoff_xid)) visibility_cutoff_xid = xmin; @@ -1282,7 +1390,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ - if (vacrelstats->num_dead_tuples > 0) + if (vacrelstats->num_dead_tuples > 0 || vacrelstats->num_warm_chains > 0) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -1300,6 +1408,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], + (vacrelstats->num_warm_chains > 0), &indstats[i], vacrelstats);
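Both scan branches above classify a chain the same way, and the rule reduces to small bit-mask algebra over the status returned by heap_check_warm_chain: a WARM-updated chain is convertible only when it is uniformly WARM or uniformly CLEAR. The sketch below is a self-contained rendering of that test; the HCWC_* names follow the patch's macros, but the bit assignments here are invented.

    #include <stdio.h>

    /* invented status bits: which kinds of tuples the chain contains */
    #define HCWC_HAS_WARM_TUPLE   0x01
    #define HCWC_HAS_CLEAR_TUPLE  0x02
    #define HCWC_WARM_UPDATED     0x04  /* chain was WARM updated at some point */

    typedef unsigned HeapCheckWarmChainStatus;

    #define HCWC_IS_WARM_UPDATED(s) (((s) & HCWC_WARM_UPDATED) != 0)
    #define HCWC_IS_ALL_WARM(s)     (((s) & HCWC_HAS_CLEAR_TUPLE) == 0)
    #define HCWC_IS_ALL_CLEAR(s)    (((s) & HCWC_HAS_WARM_TUPLE) == 0)

    static const char *
    classify(HeapCheckWarmChainStatus status)
    {
        if (!HCWC_IS_WARM_UPDATED(status))
            return "not WARM updated: nothing to do";
        if (HCWC_IS_ALL_WARM(status))
            return "candidate: record as WARM chain";
        if (HCWC_IS_ALL_CLEAR(status))
            return "candidate: record as CLEAR chain";
        return "mixed chain: count as non-convertible";
    }

    int
    main(void)
    {
        printf("%s\n", classify(HCWC_WARM_UPDATED | HCWC_HAS_WARM_TUPLE));
        printf("%s\n", classify(HCWC_WARM_UPDATED | HCWC_HAS_CLEAR_TUPLE));
        printf("%s\n", classify(HCWC_WARM_UPDATED | HCWC_HAS_WARM_TUPLE |
                                HCWC_HAS_CLEAR_TUPLE));
        return 0;
    }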
@@ -1367,7 +1476,10 @@ * * This routine marks dead tuples as unused and compacts out free * space on their pages. Pages not having dead tuples recorded from - * lazy_scan_heap are not visited at all. + * lazy_scan_heap are not visited at all. This routine also converts + * candidate WARM chains to HOT chains by clearing WARM-related flags. The + * candidate chains are determined by the preceding index scans after + * looking at the data collected by the first heap scan. * * Note: the reason for doing this as a second pass is we cannot remove * the tuples until we've removed their index entries, and we want to @@ -1376,7 +1488,7 @@ static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { - int tupindex; + int tupindex, chainindex; int npages; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; @@ -1385,33 +1497,69 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) npages = 0; tupindex = 0; - while (tupindex < vacrelstats->num_dead_tuples) + chainindex = 0; + while (tupindex < vacrelstats->num_dead_tuples || + chainindex < vacrelstats->num_warm_chains) { - BlockNumber tblk; + BlockNumber tblk, chainblk, vacblk; Buffer buf; Page page; Size freespace; vacuum_delay_point(); - tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); - buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, + tblk = chainblk = InvalidBlockNumber; + if (chainindex < vacrelstats->num_warm_chains) + chainblk = + ItemPointerGetBlockNumber(&(vacrelstats->warm_chains[chainindex].chain_tid)); + + if (tupindex < vacrelstats->num_dead_tuples) + tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); + + if (tblk == InvalidBlockNumber) + vacblk = chainblk; + else if (chainblk == InvalidBlockNumber) + vacblk = tblk; + else + vacblk = Min(chainblk, tblk); + + Assert(vacblk != InvalidBlockNumber); + + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, vacblk, RBM_NORMAL, vac_strategy); - if (!ConditionalLockBufferForCleanup(buf)) + + + if (vacblk == chainblk) + LockBufferForCleanup(buf); + else if (!ConditionalLockBufferForCleanup(buf)) { ReleaseBuffer(buf); ++tupindex; continue; } - tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, - &vmbuffer); + + /* + * Convert WARM chains on this page. This should be done before + * vacuuming the page to ensure that we can correctly set visibility + * bits after clearing WARM chains. + * + * If we are going to vacuum this page then don't check for + * all-visibility just yet. + */ + if (vacblk == chainblk) + chainindex = lazy_warmclear_page(onerel, chainblk, buf, chainindex, + vacrelstats, &vmbuffer, chainblk != tblk); + + if (vacblk == tblk) + tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, + &vmbuffer); /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, tblk, freespace); + RecordPageWithFreeSpace(onerel, vacblk, freespace); npages++; } @@ -1430,6 +1578,107 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) }
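The rewritten loop in lazy_vacuum_heap above is a two-cursor merge: dead_tuples and warm_chains are both ordered by TID, so each iteration visits the lower of the two next block numbers and a page is read and locked once even when it needs both treatments. The skeleton below models that control flow with plain ints for block numbers; it is a sketch only, advancing one array entry per visit where the real code consumes every entry for the page.

    #include <stdio.h>

    #define INVALID_BLOCK (-1)

    static int
    min_valid(int a, int b)
    {
        if (a == INVALID_BLOCK) return b;
        if (b == INVALID_BLOCK) return a;
        return a < b ? a : b;
    }

    int
    main(void)
    {
        /* blocks with dead tuples / with candidate WARM chains, sorted */
        int dead_blocks[] = {2, 5, 9};
        int warm_blocks[] = {2, 7};
        int ndead = 3, nwarm = 2, di = 0, wi = 0;

        while (di < ndead || wi < nwarm)
        {
            int tblk = (di < ndead) ? dead_blocks[di] : INVALID_BLOCK;
            int chainblk = (wi < nwarm) ? warm_blocks[wi] : INVALID_BLOCK;
            int vacblk = min_valid(tblk, chainblk);

            /* clear WARM chains before vacuuming, as the comment requires */
            if (vacblk == chainblk)
            {
                printf("block %d: clear WARM chains\n", vacblk);
                wi++;
            }
            if (vacblk == tblk)
            {
                printf("block %d: vacuum dead tuples\n", vacblk);
                di++;
            }
        }
        return 0;
    }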
+ */ +static int +lazy_warmclear_page(Relation onerel, BlockNumber blkno, Buffer buffer, + int chainindex, LVRelStats *vacrelstats, Buffer *vmbuffer, + bool check_all_visible) +{ + Page page = BufferGetPage(buffer); + OffsetNumber cleared_offnums[MaxHeapTuplesPerPage]; + int num_cleared = 0; + TransactionId visibility_cutoff_xid; + bool all_frozen; + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_WARMCLEARED, blkno); + + START_CRIT_SECTION(); + + for (; chainindex < vacrelstats->num_warm_chains; chainindex++) + { + BlockNumber tblk; + LVWarmChain *chain; + + chain = &vacrelstats->warm_chains[chainindex]; + + tblk = ItemPointerGetBlockNumber(&chain->chain_tid); + if (tblk != blkno) + break; /* past end of chains for this block */ + + /* + * Since a heap page can have no more than MaxHeapTuplesPerPage + * offnums and we process each offnum only once, a + * MaxHeapTuplesPerPage-sized array is enough to hold all cleared + * tuples on this page. + */ + if (!chain->keep_warm_chain) + num_cleared += heap_clear_warm_chain(page, &chain->chain_tid, + cleared_offnums + num_cleared); + } + + /* + * Mark buffer dirty before we write WAL. + */ + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; + + recptr = log_heap_warmclear(onerel, buffer, + cleared_offnums, num_cleared); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* If not checking for all-visibility then we're done */ + if (!check_all_visible) + return chainindex; + + /* + * The following code should match the corresponding code in + * lazy_vacuum_page. + */ + if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid, + &all_frozen)) + PageSetAllVisible(page); + + /* + * All the changes to the heap page have been done. If the all-visible + * flag is now set, also set the VM all-visible bit (and, if possible, the + * all-frozen bit) unless this has already been done previously. + */ + if (PageIsAllVisible(page)) + { + uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer); + uint8 flags = 0; + + /* Set the VM all-frozen bit to flag, if needed */ + if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0) + flags |= VISIBILITYMAP_ALL_VISIBLE; + if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + Assert(BufferIsValid(*vmbuffer)); + if (flags != 0) + visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, + *vmbuffer, visibility_cutoff_xid, flags); + } + return chainindex; +} + +/* * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. * @@ -1582,6 +1831,24 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup) return false; } +/* + * Reset counters tracking number of WARM and CLEAR pointers per candidate TID. + * These counters are maintained per index and cleared when the next index is + * picked up for cleanup. + * + * We don't touch the keep_warm_chain since once a chain is known to be + * non-convertible, we must remember that across all indexes. + */ +static void +lazy_reset_warm_pointer_count(LVRelStats *vacrelstats) +{ + int i; + for (i = 0; i < vacrelstats->num_warm_chains; i++) + { + LVWarmChain *chain = &vacrelstats->warm_chains[i]; + chain->num_clear_pointers = chain->num_warm_pointers = 0; + } +} /* * lazy_vacuum_index() -- vacuum one index relation.
@@ -1591,6 +1858,7 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup) */ static void lazy_vacuum_index(Relation indrel, + bool clear_warm, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats) { @@ -1606,15 +1874,87 @@ lazy_vacuum_index(Relation indrel, ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples; ivinfo.strategy = vac_strategy; - /* Do bulk deletion */ - *stats = index_bulk_delete(&ivinfo, *stats, - lazy_tid_reaped, (void *) vacrelstats); + /* + * If told to, convert WARM chains into HOT chains. + * + * We must have already collected candidate WARM chains, i.e. chains that + * have either all tuples with HEAP_WARM_TUPLE flag set or none. + * + * This works in two phases. In the first phase, we do a complete index + * scan and collect information about index pointers to the candidate + * chains, but we don't do conversion. To be precise, we count the number + * of WARM and CLEAR index pointers to each candidate chain and use that + * knowledge to arrive at a decision and do the actual conversion during + * the second phase (we do kill known-dead pointers in this phase, though). + * + * In the second phase, for each candidate chain we check if we have seen a + * WARM index pointer. For such chains, we kill the CLEAR pointer and + * convert the WARM pointer into a CLEAR pointer. The heap tuples are + * cleared of WARM flags in the second heap scan. If we did not find any + * WARM pointer to a WARM chain, that means that the chain is reachable + * from the CLEAR pointer (because, say, the WARM update did not add a new + * entry for this index). In that case, we do nothing. There is a third case + * where we find two CLEAR pointers to a candidate chain. This can happen + * because of aborted vacuums. We don't handle that case yet, but it should + * be possible to apply the same recheck logic and find which of the clear + * pointers is redundant and should be removed. + * + * For CLEAR chains, we just kill the WARM pointer, if it exists, and keep + * the CLEAR pointer. + */ + if (clear_warm) + { + /* + * Before starting the index scan, reset the counters of WARM and CLEAR + * pointers, probably carried forward from the previous index.
+ */ + lazy_reset_warm_pointer_count(vacrelstats); + + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_indexvac_phase1, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row versions, found " + "%.0f warm pointers, %.0f clear pointers, removed " + "%.0f warm pointers, removed %.0f clear pointers", + RelationGetRelationName(indrel), + vacrelstats->num_dead_tuples, + (*stats)->num_warm_pointers, + (*stats)->num_clear_pointers, + (*stats)->warm_pointers_removed, + (*stats)->clear_pointers_removed))); + + (*stats)->num_warm_pointers = 0; + (*stats)->num_clear_pointers = 0; + (*stats)->warm_pointers_removed = 0; + (*stats)->clear_pointers_removed = 0; + (*stats)->pointers_cleared = 0; + + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_indexvac_phase2, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to convert WARM pointers, found " + "%.0f WARM pointers, %.0f CLEAR pointers, removed " + "%.0f WARM pointers, removed %.0f CLEAR pointers, " + "cleared %.0f WARM pointers", + RelationGetRelationName(indrel), + (*stats)->num_warm_pointers, + (*stats)->num_clear_pointers, + (*stats)->warm_pointers_removed, + (*stats)->clear_pointers_removed, + (*stats)->pointers_cleared))); + } + else + { + /* Do bulk deletion */ + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_tid_reaped, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + RelationGetRelationName(indrel), + vacrelstats->num_dead_tuples), + errdetail("%s.", pg_rusage_show(&ru0)))); + } - ereport(elevel, - (errmsg("scanned index \"%s\" to remove %d row versions", - RelationGetRelationName(indrel), - vacrelstats->num_dead_tuples), - errdetail("%s.", pg_rusage_show(&ru0)))); } /* @@ -1988,9 +2328,11 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) if (vacrelstats->hasindex) { - maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); + maxtuples = (vac_work_mem * 1024L) / (sizeof(ItemPointerData) + + sizeof(LVWarmChain)); maxtuples = Min(maxtuples, INT_MAX); - maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); + maxtuples = Min(maxtuples, MaxAllocSize / (sizeof(ItemPointerData) + + sizeof(LVWarmChain))); /* curious coding here to ensure the multiplication can't overflow */ if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) @@ -2008,6 +2350,57 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) vacrelstats->max_dead_tuples = (int) maxtuples; vacrelstats->dead_tuples = (ItemPointer) palloc(maxtuples * sizeof(ItemPointerData)); + + /* + * XXX Cheat for now and allocate the same size array for tracking warm + * chains. maxtuples must already have been adjusted above to ensure we + * don't cross vac_work_mem. + */ + vacrelstats->num_warm_chains = 0; + vacrelstats->max_warm_chains = (int) maxtuples; + vacrelstats->warm_chains = (LVWarmChain *) + palloc0(maxtuples * sizeof(LVWarmChain)); + +} + +/* + * lazy_record_clear_chain - remember one CLEAR chain + */ +static void +lazy_record_clear_chain(LVRelStats *vacrelstats, + ItemPointer itemptr) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time).
+ */ + if (vacrelstats->num_warm_chains < vacrelstats->max_warm_chains) + { + vacrelstats->warm_chains[vacrelstats->num_warm_chains].chain_tid = *itemptr; + vacrelstats->warm_chains[vacrelstats->num_warm_chains].is_postwarm_chain = 0; + vacrelstats->num_warm_chains++; + } +} + +/* + * lazy_record_warm_chain - remember one WARM chain + */ +static void +lazy_record_warm_chain(LVRelStats *vacrelstats, + ItemPointer itemptr) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). + */ + if (vacrelstats->num_warm_chains < vacrelstats->max_warm_chains) + { + vacrelstats->warm_chains[vacrelstats->num_warm_chains].chain_tid = *itemptr; + vacrelstats->warm_chains[vacrelstats->num_warm_chains].is_postwarm_chain = 1; + vacrelstats->num_warm_chains++; + } } /* @@ -2038,8 +2431,8 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, * * Assumes dead_tuples array is in sorted order. */ -static bool -lazy_tid_reaped(ItemPointer itemptr, void *state) +static IndexBulkDeleteCallbackResult +lazy_tid_reaped(ItemPointer itemptr, bool is_warm, void *state) { LVRelStats *vacrelstats = (LVRelStats *) state; ItemPointer res; @@ -2050,7 +2443,193 @@ sizeof(ItemPointerData), vac_cmp_itemptr); - return (res != NULL); + return (res != NULL) ? IBDCR_DELETE : IBDCR_KEEP; +} + +/* + * lazy_indexvac_phase1() -- run first pass of index vacuum + * + * This has the right signature to be an IndexBulkDeleteCallback. + */ +static IndexBulkDeleteCallbackResult +lazy_indexvac_phase1(ItemPointer itemptr, bool is_warm, void *state) +{ + LVRelStats *vacrelstats = (LVRelStats *) state; + ItemPointer res; + LVWarmChain *chain; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) vacrelstats->dead_tuples, + vacrelstats->num_dead_tuples, + sizeof(ItemPointerData), + vac_cmp_itemptr); + + if (res != NULL) + return IBDCR_DELETE; + + chain = (LVWarmChain *) bsearch((void *) itemptr, + (void *) vacrelstats->warm_chains, + vacrelstats->num_warm_chains, + sizeof(LVWarmChain), + vac_cmp_warm_chain); + if (chain != NULL) + { + if (is_warm) + chain->num_warm_pointers++; + else + chain->num_clear_pointers++; + } + return IBDCR_KEEP; +} + +/* + * lazy_indexvac_phase2() -- run second pass of index vacuum + * + * This has the right signature to be an IndexBulkDeleteCallback. + */ +static IndexBulkDeleteCallbackResult +lazy_indexvac_phase2(ItemPointer itemptr, bool is_warm, void *state) +{ + LVRelStats *vacrelstats = (LVRelStats *) state; + LVWarmChain *chain; + + chain = (LVWarmChain *) bsearch((void *) itemptr, + (void *) vacrelstats->warm_chains, + vacrelstats->num_warm_chains, + sizeof(LVWarmChain), + vac_cmp_warm_chain); + + if (chain != NULL && (chain->keep_warm_chain != 1)) + { + /* + * At no point can we have more than one WARM pointer to any chain, + * nor more than two CLEAR pointers. + */ + Assert(chain->num_warm_pointers <= 1); + Assert(chain->num_clear_pointers <= 2); + + if (chain->is_postwarm_chain == 1) + { + if (is_warm) + { + /* + * A WARM pointer, pointing to a WARM chain. + * + * Clear the warm pointer (and delete the CLEAR pointer). We + * may have already seen the CLEAR pointer in the scan and + * deleted that or we may see it later in the scan. It doesn't + * matter if we fail at any point because we won't clear up + * WARM bits on the heap tuples until we have dealt with the + * index pointers cleanly.
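+ * (On IBDCR_CLEAR_WARM the index AM clears the WARM flag on the pointer instead of deleting it; the count of such conversions is reported back via pointers_cleared in IndexBulkDeleteResult.)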
+ */ + return IBDCR_CLEAR_WARM; + } + else + { + /* + * CLEAR pointer to a WARM chain. + */ + if (chain->num_warm_pointers > 0) + { + /* + * If there exists a WARM pointer to the chain, we can + * delete the CLEAR pointer and clear the WARM bits on the + * heap tuples. + */ + return IBDCR_DELETE; + } + else if (chain->num_clear_pointers == 1) + { + /* + * If this is the only pointer to a WARM chain, we must + * keep the CLEAR pointer. + * + * The presence of the WARM chain indicates that the WARM + * update must have committed. But during the update + * this index was probably not updated and hence it + * contains just the one original CLEAR pointer to the chain. + * We should be able to clear the WARM bits on heap tuples + * unless we later find another index which prevents the + * cleanup. + */ + return IBDCR_KEEP; + } + } + } + else + { + /* + * This is a CLEAR chain. + */ + if (is_warm) + { + /* + * A WARM pointer to a CLEAR chain. + * + * This can happen when a WARM update is aborted. Later the HOT + * chain is pruned leaving behind only CLEAR tuples in the + * chain. But the WARM index pointer inserted in the index + * remains and it must now be deleted before we clear WARM bits + * from the heap tuple. + */ + return IBDCR_DELETE; + } + + /* + * CLEAR pointer to a CLEAR chain. + * + * If this is the only surviving CLEAR pointer, keep it and clear + * the WARM bits from the heap tuples. + */ + if (chain->num_clear_pointers == 1) + return IBDCR_KEEP; + + /* + * If there is more than one CLEAR pointer to this chain, we could + * apply the recheck logic, kill the redundant CLEAR pointer and + * convert the chain. But that's not yet done. + */ + } + + /* + * For everything else, we must keep the WARM bits and also keep the + * index pointers. + */ + chain->keep_warm_chain = 1; + return IBDCR_KEEP; + } + return IBDCR_KEEP; +} + +/* + * Comparator routines for use with qsort() and bsearch(). Similar to + * vac_cmp_itemptr, but the right-hand argument is an LVWarmChain struct + * pointer. + */ +static int +vac_cmp_warm_chain(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber(&((LVWarmChain *) right)->chain_tid); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber(&((LVWarmChain *) right)->chain_tid); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; } /* @@ -2166,6 +2745,18 @@ heap_page_is_all_visible(Relation rel, Buffer buf, break; } + /* + * If this or any other tuple in the chain was ever WARM + * updated, there could be multiple index entries pointing + * to the root of this chain. We can't do index-only scans + * for such tuples without rechecking the index keys. So + * mark the page as !all_visible. + */ + if (HeapTupleHeaderIsWarmUpdated(tuple.t_data)) + { + all_visible = false; + } + /* Track newest xmin on page.
*/ if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) *visibility_cutoff_xid = xmin; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 2142273..3e49a8f 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -270,6 +270,8 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) List * ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, + Bitmapset *modified_attrs, EState *estate, bool noDupErr, bool *specConflict, @@ -324,6 +326,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, if (!indexInfo->ii_ReadyForInserts) continue; + /* + * If modified_attrs is set, we only insert index entries for those + * indexes whose columns have changed. All other indexes can use their + * existing index pointers to look up the new tuple. + */ + if (modified_attrs) + { + if (!bms_overlap(modified_attrs, indexInfo->ii_indxattrs)) + continue; + } + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { @@ -389,10 +402,11 @@ ExecInsertIndexTuples(TupleTableSlot *slot, index_insert(indexRelation, /* index relation */ values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + root_tid, /* tid of heap or root tuple */ heapRelation, /* heap relation */ checkUnique, /* type of uniqueness check to do */ - indexInfo); /* index AM may need this */ + indexInfo, /* index AM may need this */ + (modified_attrs != NULL)); /* is this a WARM update? */ /* * If the index has an associated exclusion constraint, check that. @@ -791,6 +805,9 @@ retry: { if (!HeapTupleHeaderIsHeapLatest(tup->t_data, &tup->t_self)) HeapTupleHeaderGetNextTid(tup->t_data, &ctid_wait); + else + ItemPointerCopy(&tup->t_self, &ctid_wait); + reason_wait = indexInfo->ii_ExclusionOps ? XLTW_RecheckExclusionConstr : XLTW_InsertIndex; index_endscan(index_scan); diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index f20d728..747e4ce 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -399,6 +399,8 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), + NULL, estate, false, NULL, NIL); @@ -445,6 +447,8 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, if (!skip_tuple) { List *recheckIndexes = NIL; + bool warm_update; + Bitmapset *modified_attrs; /* Check the constraints of the tuple */ if (rel->rd_att->constr) @@ -455,13 +459,35 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* OK, update the tuple and index entries for it */ simple_heap_update(rel, &searchslot->tts_tuple->t_self, - slot->tts_tuple); + slot->tts_tuple, &modified_attrs, &warm_update); if (resultRelInfo->ri_NumIndices > 0 && - !HeapTupleIsHeapOnly(slot->tts_tuple)) + (!HeapTupleIsHeapOnly(slot->tts_tuple) || warm_update)) + { + ItemPointerData root_tid; + + /* + * If we did a WARM update then we must index the tuple using its + * root line pointer and not the tuple TID itself.
+ */ + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, + &root_tid); + bms_free(modified_attrs); + modified_attrs = NULL; + } + recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + modified_attrs, estate, false, NULL, NIL); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 2e9ff7d..f7bb6ca 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -39,6 +39,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/valid.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" @@ -395,11 +396,27 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) OffsetNumber offnum = tbmres->offsets[curslot]; ItemPointerData tid; HeapTupleData heapTuple; + bool recheck = false; ItemPointerSet(&tid, page, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + &heapTuple, NULL, true, &recheck)) + { + bool valid = true; + + if (scan->rs_key) + HeapKeyTest(&heapTuple, RelationGetDescr(scan->rs_rd), + scan->rs_nkeys, scan->rs_key, valid); + if (valid) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + + /* + * If the heap tuple needs a recheck because of a WARM update, + * it's a lossy case. + */ + if (recheck) + tbmres->recheck = true; + } } } else diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index cb6aff9..dff4086 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -142,8 +142,8 @@ IndexNext(IndexScanState *node) false); /* don't pfree */ /* - * If the index was lossy, we have to recheck the index quals using - * the fetched tuple. + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. */ if (scandesc->xs_recheck) { diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 95e1589..a1f3440 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -512,6 +512,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, true, &specConflict, arbiterIndexes); @@ -558,6 +559,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, false, NULL, arbiterIndexes); } @@ -891,6 +893,9 @@ ExecUpdate(ItemPointer tupleid, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + Bitmapset *modified_attrs = NULL; + ItemPointerData root_tid; + bool warm_update; /* * abort the operation if not running transactions @@ -1007,7 +1012,7 @@ lreplace:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, &modified_attrs, &warm_update); switch (result) { case HeapTupleSelfUpdated: @@ -1094,10 +1099,28 @@ lreplace:; * the t_self field. * * If it's a HOT update, we mustn't insert new index entries. 
+ * + * If it's a WARM update, then we must insert new entries with TID + * pointing to the root of the WARM chain. */ - if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) + if (resultRelInfo->ri_NumIndices > 0 && + (!HeapTupleIsHeapOnly(tuple) || warm_update)) + { + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, &root_tid); + bms_free(modified_attrs); + modified_attrs = NULL; + } recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + modified_attrs, estate, false, NULL, NIL); + } } if (canSetTag) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 3a50488..806d812 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1824,7 +1824,7 @@ pgstat_count_heap_insert(Relation rel, PgStat_Counter n) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel, bool hot) +pgstat_count_heap_update(Relation rel, bool hot, bool warm) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; @@ -1842,6 +1842,8 @@ pgstat_count_heap_update(Relation rel, bool hot) /* t_tuples_hot_updated is nontransactional, so just advance it */ if (hot) pgstat_info->t_counts.t_tuples_hot_updated++; + else if (warm) + pgstat_info->t_counts.t_tuples_warm_updated++; } } @@ -4324,6 +4326,7 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) result->tuples_updated = 0; result->tuples_deleted = 0; result->tuples_hot_updated = 0; + result->tuples_warm_updated = 0; result->n_live_tuples = 0; result->n_dead_tuples = 0; result->changes_since_analyze = 0; @@ -5433,6 +5436,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated = tabmsg->t_counts.t_tuples_warm_updated; tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; @@ -5460,6 +5464,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated += tabmsg->t_counts.t_tuples_warm_updated; /* If table was truncated, first reset the live/dead counters */ if (tabmsg->t_counts.t_truncated) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 5c13d26..7a9b48a 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -347,7 +347,7 @@ DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP2_OPMASK; TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; @@ -359,10 +359,6 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) switch (info) { - case XLOG_HEAP2_MULTI_INSERT: - if (SnapBuildProcessChange(builder, xid, buf->origptr)) - 
DecodeMultiInsert(ctx, buf); - break; case XLOG_HEAP2_NEW_CID: { xl_heap_new_cid *xlrec; @@ -390,6 +386,7 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_HEAP2_CLEANUP_INFO: case XLOG_HEAP2_VISIBLE: case XLOG_HEAP2_LOCK_UPDATED: + case XLOG_HEAP2_WARMCLEAR: break; default: elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); @@ -418,6 +415,10 @@ DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeInsert(ctx, buf); break; + case XLOG_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; /* * Treat HOT update as normal updates. There is no useful @@ -809,7 +810,7 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) } /* - * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * Decode XLOG_HEAP_MULTI_INSERT record into multiple tuplebufs. * * Currently MULTI_INSERT will always contain the full tuples. */ diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index a987d0d..b8677f3 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -145,6 +145,22 @@ pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS) Datum +pg_stat_get_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->tuples_warm_updated); + + PG_RETURN_INT64(result); +} + + +Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); @@ -1644,6 +1660,21 @@ pg_stat_get_xact_tuples_hot_updated(PG_FUNCTION_ARGS) } Datum +pg_stat_get_xact_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_TableStatus *tabentry; + + if ((tabentry = find_tabstat_entry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->t_counts.t_tuples_warm_updated); + + PG_RETURN_INT64(result); +} + +Datum pg_stat_get_xact_blocks_fetched(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index ce55fc5..64dbaaa 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2338,6 +2338,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) list_free_deep(relation->rd_fkeylist); list_free(relation->rd_indexlist); bms_free(relation->rd_indexattr); + bms_free(relation->rd_exprindexattr); bms_free(relation->rd_keyattr); bms_free(relation->rd_pkattr); bms_free(relation->rd_idattr); @@ -4352,6 +4353,13 @@ RelationGetIndexList(Relation relation) return list_copy(relation->rd_indexlist); /* + * If the index list was invalidated, we had better also invalidate the + * index attribute list (which should automatically invalidate other + * attribute bitmaps such as the primary key and replica identity) + */ + relation->rd_indexattr = NULL; + + /* * We build the list we intend to return (in the caller's context) while * doing the scan. After successfully completing the scan, we copy that * list into the relcache entry.
This avoids cache-context memory leakage @@ -4759,15 +4767,19 @@ Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) { Bitmapset *indexattrs; /* indexed columns */ + Bitmapset *exprindexattrs; /* indexed columns in expression/predicate + indexes */ Bitmapset *uindexattrs; /* columns in unique indexes */ Bitmapset *pkindexattrs; /* columns in the primary index */ Bitmapset *idindexattrs; /* columns in the replica identity */ + Bitmapset *indxnotreadyattrs; /* columns in not-ready indexes */ List *indexoidlist; List *newindexoidlist; Oid relpkindex; Oid relreplindex; ListCell *l; MemoryContext oldcxt; + bool supportswarm = true; /* true if the table can be WARM updated */ /* Quick exit if we already computed the result. */ if (relation->rd_indexattr != NULL) @@ -4782,6 +4794,10 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return bms_copy(relation->rd_pkattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return bms_copy(relation->rd_idattr); + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return bms_copy(relation->rd_exprindexattr); + case INDEX_ATTR_BITMAP_NOTREADY: + return bms_copy(relation->rd_indxnotreadyattr); default: elog(ERROR, "unknown attrKind %u", attrKind); } @@ -4822,9 +4838,11 @@ restart: * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + exprindexattrs = NULL; uindexattrs = NULL; pkindexattrs = NULL; idindexattrs = NULL; + indxnotreadyattrs = NULL; foreach(l, indexoidlist) { Oid indexOid = lfirst_oid(l); @@ -4861,6 +4879,10 @@ restart: indexattrs = bms_add_member(indexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + if (!indexInfo->ii_ReadyForInserts) + indxnotreadyattrs = bms_add_member(indxnotreadyattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + if (isKey) uindexattrs = bms_add_member(uindexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); @@ -4876,10 +4898,29 @@ restart: } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &exprindexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &exprindexattrs); + + /* + * indexattrs should include attributes referenced in index expressions + * and predicates too. + */ + indexattrs = bms_add_members(indexattrs, exprindexattrs); + + if (!indexInfo->ii_ReadyForInserts) + indxnotreadyattrs = bms_add_members(indxnotreadyattrs, + exprindexattrs); + + /* + * Check if the index has the amrecheck method defined. If the method is + * not defined, the index does not support WARM updates. Completely + * disable WARM updates on such tables.
+ */ + if (!indexDesc->rd_amroutine->amrecheck) + supportswarm = false; + index_close(indexDesc, AccessShareLock); } @@ -4912,15 +4953,22 @@ restart: goto restart; } + /* Remember if the table can do WARM updates */ + relation->rd_supportswarm = supportswarm; + /* Don't leak the old values of these bitmaps, if any */ bms_free(relation->rd_indexattr); relation->rd_indexattr = NULL; + bms_free(relation->rd_exprindexattr); + relation->rd_exprindexattr = NULL; bms_free(relation->rd_keyattr); relation->rd_keyattr = NULL; bms_free(relation->rd_pkattr); relation->rd_pkattr = NULL; bms_free(relation->rd_idattr); relation->rd_idattr = NULL; + bms_free(relation->rd_indxnotreadyattr); + relation->rd_indxnotreadyattr = NULL; /* * Now save copies of the bitmaps in the relcache entry. We intentionally @@ -4933,7 +4981,9 @@ restart: relation->rd_keyattr = bms_copy(uindexattrs); relation->rd_pkattr = bms_copy(pkindexattrs); relation->rd_idattr = bms_copy(idindexattrs); - relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_exprindexattr = bms_copy(exprindexattrs); + relation->rd_indexattr = bms_copy(bms_union(indexattrs, exprindexattrs)); + relation->rd_indxnotreadyattr = bms_copy(indxnotreadyattrs); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ @@ -4947,6 +4997,10 @@ restart: return bms_copy(relation->rd_pkattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return idindexattrs; + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return exprindexattrs; + case INDEX_ATTR_BITMAP_NOTREADY: + return indxnotreadyattrs; default: elog(ERROR, "unknown attrKind %u", attrKind); return NULL; @@ -5559,6 +5613,7 @@ load_relcache_init_file(bool shared) rel->rd_keyattr = NULL; rel->rd_pkattr = NULL; rel->rd_idattr = NULL; + rel->rd_indxnotreadyattr = NULL; rel->rd_pubactions = NULL; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index baff998..6a2e2f2 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -106,7 +106,7 @@ HeapTupleHeaderGetCmin(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(!(HeapTupleHeaderIsMoved(tup))); Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tup))); if (tup->t_infomask & HEAP_COMBOCID) @@ -120,7 +120,7 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(!(HeapTupleHeaderIsMoved(tup))); /* * Because GetUpdateXid() performs memory allocations if xmax is a diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 519f3b6..e54d0df 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -186,7 +186,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -205,7 +205,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -377,7 +377,7 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, return false; /* Used by 
pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -396,7 +396,7 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -471,7 +471,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return HeapTupleInvisible; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -490,7 +490,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -753,7 +753,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -772,7 +772,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -974,7 +974,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -993,7 +993,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -1180,7 +1180,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, if (HeapTupleHeaderXminInvalid(tuple)) return HEAPTUPLE_DEAD; /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_OFF) + else if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -1198,7 +1198,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, InvalidTransactionId); } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index f919cf8..8b7af1e 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -13,6 +13,7 @@ #define AMAPI_H #include "access/genam.h" +#include "access/itup.h" /* * We don't wish to include planner header files here, since most of an index @@ -74,6 +75,14 @@ typedef bool (*aminsert_function) (Relation indexRelation, Relation heapRelation, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +/* insert this WARM tuple */ +typedef bool (*amwarminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer heap_tid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); /* bulk delete */ typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info, @@ -152,6 +161,11 @@ typedef void 
(*aminitparallelscan_function) (void *target); /* (re)start parallel index scan */ typedef void (*amparallelrescan_function) (IndexScanDesc scan); +/* recheck that an index tuple and a heap tuple match */ +typedef bool (*amrecheck_function) (Relation indexRel, + struct IndexInfo *indexInfo, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); + /* * API struct for an index AM. Note this must be stored in a single palloc'd * chunk of memory. @@ -198,6 +212,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + amwarminsert_function amwarminsert; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ @@ -217,6 +232,9 @@ typedef struct IndexAmRoutine amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ aminitparallelscan_function aminitparallelscan; /* can be NULL */ amparallelrescan_function amparallelrescan; /* can be NULL */ + + /* interface function to support WARM */ + amrecheck_function amrecheck; /* can be NULL */ } IndexAmRoutine; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index f467b18..965be45 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -75,12 +75,29 @@ typedef struct IndexBulkDeleteResult bool estimated_count; /* num_index_tuples is an estimate */ double num_index_tuples; /* tuples remaining */ double tuples_removed; /* # removed during vacuum operation */ + double num_warm_pointers; /* # WARM pointers found */ + double num_clear_pointers; /* # CLEAR pointers found */ + double pointers_cleared; /* # WARM pointers cleared */ + double warm_pointers_removed; /* # WARM pointers removed */ + double clear_pointers_removed; /* # CLEAR pointers removed */ BlockNumber pages_deleted; /* # unused pages in index */ BlockNumber pages_free; /* # pages available for reuse */ } IndexBulkDeleteResult; +/* + * IndexBulkDeleteCallback should return one of the following + */ +typedef enum IndexBulkDeleteCallbackResult +{ + IBDCR_KEEP, /* index tuple should be preserved */ + IBDCR_DELETE, /* index tuple should be deleted */ + IBDCR_CLEAR_WARM /* index tuple should be cleared of WARM bit */ +} IndexBulkDeleteCallbackResult; + /* Typedef for callback function to determine if a tuple is bulk-deletable */ -typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); +typedef IndexBulkDeleteCallbackResult (*IndexBulkDeleteCallback) ( + ItemPointer itemptr, + bool is_warm, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; @@ -135,7 +152,8 @@ extern bool index_insert(Relation indexRelation, ItemPointer heap_t_ctid, Relation heapRelation, IndexUniqueCheck checkUnique, - struct IndexInfo *indexInfo); + struct IndexInfo *indexInfo, + bool warm_update); extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, diff --git a/src/include/access/hash.h b/src/include/access/hash.h index eb1df57..f2094e3 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -281,6 +281,11 @@ typedef HashMetaPageData *HashMetaPage; #define HASHPROC 1 #define HASHNProcs 1 +/* + * Flags overloaded on t_tid.ip_posid field. They are managed by + * ItemPointerSetFlags and corresponding routines.
+ */ +#define HASH_INDEX_WARM_POINTER 0x01 /* public routines */ @@ -291,6 +296,10 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +extern bool hashwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); @@ -360,6 +369,8 @@ extern void _hash_expandtable(Relation rel, Buffer metabuf); extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask); +extern void _hash_clear_items(Page page, OffsetNumber *clearitemnos, + uint16 nclearitems); /* hashsearch.c */ extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); @@ -404,4 +415,8 @@ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, bool bucket_has_garbage, IndexBulkDeleteCallback callback, void *callback_state); +/* hash.c */ +extern bool hashrecheck(Relation indexRel, struct IndexInfo *indexInfo, + IndexTuple indexTuple, Relation heapRel, HeapTuple heapTuple); + #endif /* HASH_H */ diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index dfd9237..0549a5a 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -199,9 +199,10 @@ typedef struct xl_hash_delete { bool is_primary_bucket_page; /* TRUE if the operation is for * primary bucket page */ + uint16 nclearitems; /* # of items to clear of WARM bits */ } xl_hash_delete; -#define SizeOfHashDelete (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool)) +#define SizeOfHashDelete (offsetof(xl_hash_delete, nclearitems) + sizeof(uint16)) /* * This is what we need for metapage update operation. 
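For illustration only (these helpers are not part of the patch): the WARM marker travels on the index pointer itself, in the flag bits overloaded on t_tid.ip_posid, so an index AM can tag and test a pointer roughly as below. The helper names are hypothetical; the ItemPointerSetFlags/ItemPointerGetFlags/ItemPointerClearFlags calls are the flag-management routines this patch series relies on, mirroring the hashinsert_internal and hashbucketcleanup changes above.

#include "access/hash.h"
#include "access/itup.h"

/* Hypothetical helper: tag an index pointer as WARM, or clear the tag. */
static inline void
warm_tag_pointer(IndexTuple itup, bool warm_update)
{
	if (warm_update)
		ItemPointerSetFlags(&itup->t_tid, HASH_INDEX_WARM_POINTER);
	else
		ItemPointerClearFlags(&itup->t_tid);
}

/* Hypothetical helper: does this index pointer carry the WARM tag? */
static inline bool
warm_pointer_is_warm(IndexTuple itup)
{
	return (ItemPointerGetFlags(&itup->t_tid) & HASH_INDEX_WARM_POINTER) != 0;
}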
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 5540e12..2217af9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -72,6 +72,20 @@ typedef struct HeapUpdateFailureData CommandId cmax; } HeapUpdateFailureData; +typedef int HeapCheckWarmChainStatus; + +#define HCWC_CLEAR_TUPLE 0x0001 +#define HCWC_WARM_TUPLE 0x0002 +#define HCWC_WARM_UPDATED_TUPLE 0x0004 + +#define HCWC_IS_MIXED(status) \ + (((status) & (HCWC_CLEAR_TUPLE | HCWC_WARM_TUPLE)) == \ + (HCWC_CLEAR_TUPLE | HCWC_WARM_TUPLE)) +#define HCWC_IS_ALL_WARM(status) \ + (((status) & HCWC_CLEAR_TUPLE) == 0) +#define HCWC_IS_ALL_CLEAR(status) \ + (((status) & HCWC_WARM_TUPLE) == 0) +#define HCWC_IS_WARM_UPDATED(status) \ + (((status) & HCWC_WARM_UPDATED_TUPLE) != 0) /* ---------------- * function prototypes for heap access method @@ -137,9 +151,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + bool *all_dead, bool first_call, bool *recheck); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead, + bool *recheck, Buffer *buffer, HeapTuple heapTuple); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -161,7 +176,8 @@ extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **modified_attrsp, bool *warm_update); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_update, @@ -176,10 +192,16 @@ extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, - HeapTuple tup); + HeapTuple tup, + Bitmapset **modified_attrs, + bool *warm_update); extern void heap_sync(Relation relation); extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); +extern HeapCheckWarmChainStatus heap_check_warm_chain(Page dp, + ItemPointer tid, bool stop_at_warm); +extern int heap_clear_warm_chain(Page dp, ItemPointer tid, + OffsetNumber *cleared_offnums); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index e6019d5..66fd0ea 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -32,7 +32,7 @@ #define XLOG_HEAP_INSERT 0x00 #define XLOG_HEAP_DELETE 0x10 #define XLOG_HEAP_UPDATE 0x20 -/* 0x030 is free, was XLOG_HEAP_MOVE */ +#define XLOG_HEAP_MULTI_INSERT 0x30 #define XLOG_HEAP_HOT_UPDATE 0x40 #define XLOG_HEAP_CONFIRM 0x50 #define XLOG_HEAP_LOCK 0x60 @@ -47,18 +47,23 @@ /* * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes * are associated with RM_HEAP2_ID, but are not logically different from - * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to - * these, too.
+ * + * In PG 10, we moved XLOG_HEAP2_MULTI_INSERT to RM_HEAP_ID. That allows us to + * use the 0x80 bit in RM_HEAP2_ID, thus potentially creating another 8 possible + * opcodes in RM_HEAP2_ID. */ #define XLOG_HEAP2_REWRITE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_FREEZE_PAGE 0x20 #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 -#define XLOG_HEAP2_MULTI_INSERT 0x50 +#define XLOG_HEAP2_WARMCLEAR 0x50 #define XLOG_HEAP2_LOCK_UPDATED 0x60 #define XLOG_HEAP2_NEW_CID 0x70 +#define XLOG_HEAP2_OPMASK 0x70 + /* * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available. */ @@ -80,6 +85,7 @@ #define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) #define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) #define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) +#define XLH_UPDATE_WARM_UPDATE (1<<7) /* convenience macro for checking whether any form of old tuple was logged */ #define XLH_UPDATE_CONTAINS_OLD \ @@ -225,6 +231,14 @@ typedef struct xl_heap_clean #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) +typedef struct xl_heap_warmclear +{ + uint16 ncleared; + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_heap_warmclear; + +#define SizeOfHeapWarmClear (offsetof(xl_heap_warmclear, ncleared) + sizeof(uint16)) + /* * Cleanup_info is required in some cases during a lazy VACUUM. * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid() @@ -388,6 +402,8 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); +extern XLogRecPtr log_heap_warmclear(Relation reln, Buffer buffer, + OffsetNumber *cleared, int ncleared); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples, int ntuples); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 4d614b7..bcefba6 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -201,6 +201,21 @@ struct HeapTupleHeaderData * upgrade support */ #define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) +/* + * A WARM chain usually consists of two parts. Each of these parts is a HOT + * chain in itself, i.e. all indexed columns have the same values, but a WARM + * update separates these parts. We need a mechanism to identify which part a + * tuple belongs to. We can't just check + * HeapTupleHeaderIsWarmUpdated() because during a WARM update, both old and new + * tuples are marked as WARM-updated. + * + * We need another infomask bit for this. But we use the same infomask bit that + * was earlier used by old-style VACUUM FULL. This is safe because the + * HEAP_WARM_TUPLE flag will always be set along with HEAP_WARM_UPDATED. So if + * HEAP_WARM_TUPLE and HEAP_WARM_UPDATED are both set then we know that the + * tuple belongs to the second part of the WARM chain.
+ */ +#define HEAP_WARM_TUPLE 0x4000 #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ /* @@ -260,7 +275,11 @@ struct HeapTupleHeaderData * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x0800 are available */ +#define HEAP_WARM_UPDATED 0x0800 /* + * This or a prior version of this + * tuple in the current HOT chain was + * once WARM updated + */ #define HEAP_LATEST_TUPLE 0x1000 /* * This is the last tuple in chain and * ip_posid points to the root line @@ -271,7 +290,7 @@ struct HeapTupleHeaderData #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xF000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xF800 /* visibility-related bits */ /* @@ -396,7 +415,7 @@ struct HeapTupleHeaderData /* SetCmin is reasonably simple since we never need a combo CID */ #define HeapTupleHeaderSetCmin(tup, cid) \ do { \ - Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + Assert(!HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ (tup)->t_infomask &= ~HEAP_COMBOCID; \ } while (0) /* SetCmax must be used after HeapTupleHeaderAdjustCmax; see combocid.c */ #define HeapTupleHeaderSetCmax(tup, cid, iscombo) \ do { \ - Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + Assert(!HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ if (iscombo) \ (tup)->t_infomask |= HEAP_COMBOCID; \ @@ -414,7 +433,7 @@ do { \ #define HeapTupleHeaderGetXvac(tup) \ ( \ - ((tup)->t_infomask & HEAP_MOVED) ? \ + HeapTupleHeaderIsMoved(tup) ? \ (tup)->t_choice.t_heap.t_field3.t_xvac \ : \ InvalidTransactionId \ @@ -422,7 +441,7 @@ do { \ #define HeapTupleHeaderSetXvac(tup, xid) \ do { \ - Assert((tup)->t_infomask & HEAP_MOVED); \ + Assert(HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) @@ -510,6 +529,21 @@ do { \ ((tup)->t_infomask2 & HEAP_ONLY_TUPLE) != 0 \ ) +#define HeapTupleHeaderSetWarmUpdated(tup) \ +do { \ + (tup)->t_infomask2 |= HEAP_WARM_UPDATED; \ +} while (0) + +#define HeapTupleHeaderClearWarmUpdated(tup) \ +do { \ + (tup)->t_infomask2 &= ~HEAP_WARM_UPDATED; \ +} while (0) + +#define HeapTupleHeaderIsWarmUpdated(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_WARM_UPDATED) != 0 \ +) + /* * Mark this as the last tuple in the HOT chain. Before PG v10 we used to store * the TID of the tuple itself in t_ctid field to mark the end of the chain. @@ -635,6 +669,58 @@ do { \ ) /* + * Macros to check if a tuple is a moved-off/in tuple from pre-9.0-era + * VACUUM FULL. Such tuples must not have the HEAP_WARM_TUPLE flag set. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderIsMovedOff(tuple) \ +( \ + !HeapTupleHeaderIsWarmUpdated((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED_OFF) \ +) + +#define HeapTupleHeaderIsMovedIn(tuple) \ +( \ + !HeapTupleHeaderIsWarmUpdated((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED_IN) \ +) + +#define HeapTupleHeaderIsMoved(tuple) \ +( \ + !HeapTupleHeaderIsWarmUpdated((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED) \ +) + +/* + * Check if tuple belongs to the second part of the WARM chain. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderIsWarm(tuple) \ +( \ + HeapTupleHeaderIsWarmUpdated(tuple) && \ + (((tuple)->t_infomask & HEAP_WARM_TUPLE) != 0) \ +) + +/* + * Mark tuple as a member of the second part of the chain.
Must only be done on + * a tuple which is already marked as WARM-updated. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderSetWarm(tuple) \ +( \ + AssertMacro(HeapTupleHeaderIsWarmUpdated(tuple)), \ + (tuple)->t_infomask |= HEAP_WARM_TUPLE \ +) + +#define HeapTupleHeaderClearWarm(tuple) \ +( \ + (tuple)->t_infomask &= ~HEAP_WARM_TUPLE \ +) + +/* * BITMAPLEN(NATTS) - * Computes size of null bitmap given number of data columns. */ @@ -785,6 +871,24 @@ struct MinimalTupleData #define HeapTupleClearHeapOnly(tuple) \ HeapTupleHeaderClearHeapOnly((tuple)->t_data) +#define HeapTupleIsWarmUpdated(tuple) \ + HeapTupleHeaderIsWarmUpdated((tuple)->t_data) + +#define HeapTupleSetWarmUpdated(tuple) \ + HeapTupleHeaderSetWarmUpdated((tuple)->t_data) + +#define HeapTupleClearWarmUpdated(tuple) \ + HeapTupleHeaderClearWarmUpdated((tuple)->t_data) + +#define HeapTupleIsWarm(tuple) \ + HeapTupleHeaderIsWarm((tuple)->t_data) + +#define HeapTupleSetWarm(tuple) \ + HeapTupleHeaderSetWarm((tuple)->t_data) + +#define HeapTupleClearWarm(tuple) \ + HeapTupleHeaderClearWarm((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index f9304db..163180d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -427,6 +427,12 @@ typedef BTScanOpaqueData *BTScanOpaque; #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) /* + * Flags overloaded on t_tid.ip_posid field. They are managed by + * ItemPointerSetFlags and corresponding routines. + */ +#define BTREE_INDEX_WARM_POINTER 0x01 + +/* * external entry points for btree, in nbtree.c */ extern IndexBuildResult *btbuild(Relation heap, Relation index, @@ -436,6 +442,10 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +extern bool btwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(void); extern void btinitparallelscan(void *target); @@ -487,10 +497,12 @@ extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); -extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed); +extern void _bt_handleitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *delitemnos, int ndelitems, + OffsetNumber *clearitemnos, int nclearitems); extern int _bt_pagedel(Relation rel, Buffer buf); +extern void _bt_clear_items(Page page, OffsetNumber *clearitemnos, + uint16 nclearitems); /* * prototypes for functions in nbtsearch.c @@ -537,6 +549,9 @@ extern bytea *btoptions(Datum reloptions, bool validate); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); +extern bool btrecheck(Relation indexRel, struct IndexInfo *indexInfo, + IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index d6a3085..7efd0d7 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@
@@ -142,34 +142,20 @@ typedef struct xl_btree_reuse_page
 /*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
- *
- * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
- * For a non-MVCC index scans there is an additional correctness requirement
- * for applying these changes during recovery, which is that we must do one
- * of these two things for every block in the index:
- *		* lock the block for cleanup and apply any required changes
- *		* EnsureBlockUnpinned()
- * The purpose of this is to ensure that no index scans started before we
- * finish scanning the index are still running by the time we begin to remove
- * heap tuples.
- *
- * Any changes to any one block are registered on just one WAL record. All
- * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
- * starting from the last block vacuumed through until this one. Individual
- * block numbers aren't given.
+ * single index page when executed by VACUUM. It also identifies the tuples
+ * whose WARM bits are to be cleared by VACUUM.
  *
  * Note that the *last* WAL record in any vacuum of an index is allowed to
  * have a zero length array of offsets. Earlier records must have at least one.
  */
 typedef struct xl_btree_vacuum
 {
-	BlockNumber lastBlockVacuumed;
-
-	/* TARGET OFFSET NUMBERS FOLLOW */
+	uint16		ndelitems;
+	uint16		nclearitems;
+	/* ndelitems + nclearitems TARGET OFFSET NUMBERS FOLLOW */
 } xl_btree_vacuum;
 
-#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nclearitems) + sizeof(uint16))
 
 /*
  * This is what we need to know about marking an empty branch for deletion.
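
To make the revised WAL record layout concrete, here is a minimal sketch, assuming only the xl_btree_vacuum definition above, of how redo code could locate the two offset arrays that follow the fixed-size header. The function name and the way the record pointer is obtained are illustrative assumptions, not part of the patch:

/*
 * Sketch only: a reworked xl_btree_vacuum record is followed by
 * ndelitems offsets identifying index tuples to delete, then
 * nclearitems offsets identifying tuples whose WARM flags must be
 * cleared.  Redo would hand the first array to PageIndexMultiDelete()
 * and the second to _bt_clear_items(), mirroring what
 * _bt_handleitems_vacuum() logs on the primary.
 */
static void
decode_btree_vacuum_payload(xl_btree_vacuum *xlrec,
							OffsetNumber **delitems,
							OffsetNumber **clearitems)
{
	char	   *ptr = (char *) xlrec + SizeOfBtreeVacuum;

	*delitems = (OffsetNumber *) ptr;
	*clearitems = *delitems + xlrec->ndelitems;
}
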
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 3fc726d..fa178d3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -104,6 +104,9 @@ typedef struct IndexScanDescData
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
 
+	/* IndexInfo structure for this index */
+	struct IndexInfo *indexInfo;
+
 	/*
 	 * In an index-only scan, a successful amgettuple call must fill either
 	 * xs_itup (and xs_itupdesc) or xs_hitup (and xs_hitupdesc) to provide the
@@ -119,7 +122,7 @@ typedef struct IndexScanDescData
 	HeapTupleData xs_ctup;		/* current heap tuple, if any */
 	Buffer		xs_cbuf;		/* current heap buffer in scan, if any */
 	/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
-	bool		xs_recheck;		/* T means scan keys must be rechecked */
+	bool		xs_recheck;		/* T means scan keys must be rechecked for each tuple */
 
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 20bec90..f92ec29 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -89,6 +89,13 @@ extern void FormIndexDatum(IndexInfo *indexInfo,
 			   Datum *values,
 			   bool *isnull);
 
+extern void FormIndexPlainDatum(IndexInfo *indexInfo,
+				   Relation heapRel,
+				   HeapTuple heapTup,
+				   Datum *values,
+				   bool *isnull,
+				   bool *isavail);
+
 extern void index_build(Relation heapRelation,
 			Relation indexRelation,
 			IndexInfo *indexInfo,
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 836d6ff..0ca6e22 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2769,6 +2769,8 @@ DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 0 0 f f
 DESCR("statistics: number of tuples deleted");
 DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_hot_updated _null_ _null_ _null_ ));
 DESCR("statistics: number of tuples hot updated");
+DATA(insert OID = 3355 ( pg_stat_get_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_warm_updated _null_ _null_ _null_ ));
+DESCR("statistics: number of tuples warm updated");
 DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_live_tuples _null_ _null_ _null_ ));
 DESCR("statistics: number of live tuples");
 DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_dead_tuples _null_ _null_ _null_ ));
@@ -2921,6 +2923,8 @@ DATA(insert OID = 3042 ( pg_stat_get_xact_tuples_deleted PGNSP PGUID 12 1 0 0
 DESCR("statistics: number of tuples deleted in current transaction");
 DATA(insert OID = 3043 ( pg_stat_get_xact_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_hot_updated _null_ _null_ _null_ ));
 DESCR("statistics: number of tuples hot updated in current transaction");
+DATA(insert OID = 3356 ( pg_stat_get_xact_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_warm_updated _null_ _null_ _null_ ));
+DESCR("statistics: number of tuples warm updated in current transaction");
 DATA(insert OID = 3044 ( pg_stat_get_xact_blocks_fetched PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_fetched _null_ _null_ _null_ ));
 DESCR("statistics: number of blocks fetched in current transaction");
 DATA(insert OID = 3045 ( pg_stat_get_xact_blocks_hit PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_hit _null_ _null_ _null_ ));
diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h
index 9472ecc..b355b61 100644
--- a/src/include/commands/progress.h
+++ b/src/include/commands/progress.h
@@ -25,6 +25,7 @@
 #define PROGRESS_VACUUM_NUM_INDEX_VACUUMS		4
 #define PROGRESS_VACUUM_MAX_DEAD_TUPLES			5
 #define PROGRESS_VACUUM_NUM_DEAD_TUPLES			6
+#define PROGRESS_VACUUM_HEAP_BLKS_WARMCLEARED	7
 
 /* Phases of vacuum (as advertised via PROGRESS_VACUUM_PHASE) */
 #define PROGRESS_VACUUM_PHASE_SCAN_HEAP			1
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 02dbe7b..c4495a3 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -382,6 +382,7 @@ extern void UnregisterExprContextCallback(ExprContext *econtext,
 extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative);
 extern void ExecCloseIndices(ResultRelInfo *resultRelInfo);
 extern List *ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid,
+					  ItemPointer root_tid, Bitmapset *modified_attrs,
 					  EState *estate, bool noDupErr, bool *specConflict,
 					  List *arbiterIndexes);
 extern bool ExecCheckIndexConstraints(TupleTableSlot *slot, EState *estate,
diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h
index ea3f3a5..ebeec74 100644
--- a/src/include/executor/nodeIndexscan.h
+++ b/src/include/executor/nodeIndexscan.h
@@ -41,5 +41,4 @@ extern void ExecIndexEvalRuntimeKeys(ExprContext *econtext,
 extern bool ExecIndexEvalArrayKeys(ExprContext *econtext,
 					   IndexArrayKeyInfo *arrayKeys, int numArrayKeys);
 extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys);
-
 #endif   /* NODEINDEXSCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index f856f60..cd09553 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -66,6 +66,7 @@ typedef struct IndexInfo
 	NodeTag		type;
 	int			ii_NumIndexAttrs;
 	AttrNumber	ii_KeyAttrNumbers[INDEX_MAX_KEYS];
+	Bitmapset  *ii_indxattrs;	/* bitmap of all columns used in this index */
 	List	   *ii_Expressions; /* list of Expr */
 	List	   *ii_ExpressionsState;	/* list of ExprState */
 	List	   *ii_Predicate;	/* list of Expr */
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f2daf32..af8a3ba 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -105,6 +105,7 @@ typedef struct PgStat_TableCounts
 	PgStat_Counter t_tuples_updated;
 	PgStat_Counter t_tuples_deleted;
 	PgStat_Counter t_tuples_hot_updated;
+	PgStat_Counter t_tuples_warm_updated;
 	bool		t_truncated;
 
 	PgStat_Counter t_delta_live_tuples;
@@ -625,6 +626,7 @@ typedef struct PgStat_StatTabEntry
 	PgStat_Counter tuples_updated;
 	PgStat_Counter tuples_deleted;
 	PgStat_Counter tuples_hot_updated;
+	PgStat_Counter tuples_warm_updated;
 
 	PgStat_Counter n_live_tuples;
 	PgStat_Counter n_dead_tuples;
@@ -1257,7 +1259,7 @@ pgstat_report_wait_end(void)
 	(pgStatBlockWriteTime += (n))
 
 extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n);
-extern void pgstat_count_heap_update(Relation rel, bool hot);
+extern void pgstat_count_heap_update(Relation rel, bool hot, bool warm);
 extern void pgstat_count_heap_delete(Relation rel);
 extern void pgstat_count_truncate(Relation rel);
 extern void pgstat_update_heap_dead_tuples(Relation rel, int delta);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index a617a7c..fbac7c0 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -138,9 +138,14 @@ typedef struct RelationData
 
 	/* data managed by RelationGetIndexAttrBitmap: */
 	Bitmapset  *rd_indexattr;	/* identifies columns used in indexes */
+	Bitmapset  *rd_exprindexattr;	/* identifies columns used in expression
+									 * or predicate indexes */
+	Bitmapset  *rd_indxnotreadyattr;	/* columns used by indexes not yet
+										 * ready */
 	Bitmapset  *rd_keyattr;		/* cols that can be ref'd by foreign keys */
 	Bitmapset  *rd_pkattr;		/* cols included in primary key */
 	Bitmapset  *rd_idattr;		/* included in replica identity index */
+	bool		rd_supportswarm;	/* true if the table can be WARM updated */
 
 	PublicationActions *rd_pubactions;	/* publication actions */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index da36b67..d18bd09 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -50,7 +50,9 @@ typedef enum IndexAttrBitmapKind
 	INDEX_ATTR_BITMAP_ALL,
 	INDEX_ATTR_BITMAP_KEY,
 	INDEX_ATTR_BITMAP_PRIMARY_KEY,
-	INDEX_ATTR_BITMAP_IDENTITY_KEY
+	INDEX_ATTR_BITMAP_IDENTITY_KEY,
+	INDEX_ATTR_BITMAP_EXPR_PREDICATE,
+	INDEX_ATTR_BITMAP_NOTREADY
 } IndexAttrBitmapKind;
 
 extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation,
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 3ce9904..347c4ce 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -15,6 +15,7 @@ SUBDIRS = \
 		  test_pg_dump \
 		  test_rls_hooks \
 		  test_shm_mq \
+		  warm \
 		  worker_spi
 
 all: submake-generated-headers
diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out
index b01be59..37719c9 100644
--- a/src/test/regress/expected/alter_generic.out
+++ b/src/test/regress/expected/alter_generic.out
@@ -161,15 +161,15 @@ ALTER SERVER alt_fserv1 RENAME TO alt_fserv3;  -- OK
 SELECT fdwname FROM pg_foreign_data_wrapper WHERE fdwname like 'alt_fdw%';
  fdwname  
 ----------
- alt_fdw2
  alt_fdw3
+ alt_fdw2
 (2 rows)
 
 SELECT srvname FROM pg_foreign_server WHERE srvname like 'alt_fserv%';
   srvname   
 ------------
- alt_fserv2
  alt_fserv3
+ alt_fserv2
 (2 rows)
 
 --
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index bd13ae6..44c59ae 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1732,6 +1732,7 @@ pg_stat_all_tables| SELECT c.oid AS relid,
     pg_stat_get_tuples_updated(c.oid) AS n_tup_upd,
     pg_stat_get_tuples_deleted(c.oid) AS n_tup_del,
     pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd,
+    pg_stat_get_tuples_warm_updated(c.oid) AS n_tup_warm_upd,
     pg_stat_get_live_tuples(c.oid) AS n_live_tup,
     pg_stat_get_dead_tuples(c.oid) AS n_dead_tup,
     pg_stat_get_mod_since_analyze(c.oid) AS n_mod_since_analyze,
@@ -1875,6 +1876,7 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.n_tup_upd,
     pg_stat_all_tables.n_tup_del,
     pg_stat_all_tables.n_tup_hot_upd,
+    pg_stat_all_tables.n_tup_warm_upd,
     pg_stat_all_tables.n_live_tup,
     pg_stat_all_tables.n_dead_tup,
     pg_stat_all_tables.n_mod_since_analyze,
@@ -1918,6 +1920,7 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.n_tup_upd,
     pg_stat_all_tables.n_tup_del,
     pg_stat_all_tables.n_tup_hot_upd,
+    pg_stat_all_tables.n_tup_warm_upd,
     pg_stat_all_tables.n_live_tup,
     pg_stat_all_tables.n_dead_tup,
     pg_stat_all_tables.n_mod_since_analyze,
@@ -1955,7 +1958,8 @@ pg_stat_xact_all_tables| SELECT c.oid AS relid,
     pg_stat_get_xact_tuples_inserted(c.oid) AS n_tup_ins,
     pg_stat_get_xact_tuples_updated(c.oid) AS n_tup_upd,
     pg_stat_get_xact_tuples_deleted(c.oid) AS n_tup_del,
-    pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd
+    pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd,
+    pg_stat_get_xact_tuples_warm_updated(c.oid) AS n_tup_warm_upd
    FROM ((pg_class c
      LEFT JOIN pg_index i ON ((c.oid = i.indrelid)))
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
@@ -1971,7 +1975,8 @@ pg_stat_xact_sys_tables| SELECT pg_stat_xact_all_tables.relid,
     pg_stat_xact_all_tables.n_tup_ins,
     pg_stat_xact_all_tables.n_tup_upd,
     pg_stat_xact_all_tables.n_tup_del,
-    pg_stat_xact_all_tables.n_tup_hot_upd
+    pg_stat_xact_all_tables.n_tup_hot_upd,
+    pg_stat_xact_all_tables.n_tup_warm_upd
    FROM pg_stat_xact_all_tables
   WHERE ((pg_stat_xact_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_xact_all_tables.schemaname ~ '^pg_toast'::text));
 pg_stat_xact_user_functions| SELECT p.oid AS funcid,
@@ -1993,7 +1998,8 @@ pg_stat_xact_user_tables| SELECT pg_stat_xact_all_tables.relid,
     pg_stat_xact_all_tables.n_tup_ins,
     pg_stat_xact_all_tables.n_tup_upd,
     pg_stat_xact_all_tables.n_tup_del,
-    pg_stat_xact_all_tables.n_tup_hot_upd
+    pg_stat_xact_all_tables.n_tup_hot_upd,
+    pg_stat_xact_all_tables.n_tup_warm_upd
    FROM pg_stat_xact_all_tables
   WHERE ((pg_stat_xact_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_xact_all_tables.schemaname !~ '^pg_toast'::text));
 pg_statio_all_indexes| SELECT c.oid AS relid,
diff --git a/src/test/regress/expected/warm.out b/src/test/regress/expected/warm.out
new file mode 100644
index 0000000..6391891
--- /dev/null
+++ b/src/test/regress/expected/warm.out
@@ -0,0 +1,367 @@
+CREATE TABLE updtst_tab1 (a integer unique, b int, c text, d text);
+CREATE INDEX updtst_indx1 ON updtst_tab1 (b);
+INSERT INTO updtst_tab1
+   SELECT generate_series(1,10000), generate_series(70001, 80000), 'foo', 'bar';
+-- This update does not change any index key, so it could be HOT, but
+-- the page won't have any free space, so expect a non-HOT update
+UPDATE updtst_tab1 SET c = 'foo1' WHERE a = 1;
+-- Next update should be a HOT update as dead space is recycled
+UPDATE updtst_tab1 SET c = 'foo2' WHERE a = 1;
+-- And next too
+UPDATE updtst_tab1 SET c = 'foo3' WHERE a = 1;
+-- Now update one of the index key columns
+UPDATE updtst_tab1 SET b = b + 70000 WHERE a = 1;
+-- Ensure that the correct row is fetched
+SELECT * FROM updtst_tab1 WHERE a = 1;
+ a |   b    |  c   |  d  
+---+--------+------+-----
+ 1 | 140001 | foo3 | bar
+(1 row)
+
+SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+ a |   b    |  c   |  d  
+---+--------+------+-----
+ 1 | 140001 | foo3 | bar
+(1 row)
+
+-- Even when seqscan is disabled and indexscan is forced
+SET enable_seqscan = false;
+EXPLAIN (costs off) SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+                QUERY PLAN                 
+-----------------------------------------
+ Bitmap Heap Scan on updtst_tab1
+   Recheck Cond: (b = 140001)
+   ->  Bitmap Index Scan on updtst_indx1
+         Index Cond: (b = 140001)
+(4 rows)
+
+SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+ a |   b    |  c   |  d  
+---+--------+------+-----
+ 1 | 140001 | foo3 | bar
+(1 row)
+
+-- Check if index only scan works correctly
+EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+                QUERY PLAN                 
+-----------------------------------------
+ Bitmap Heap Scan on updtst_tab1
+   Recheck Cond: (b = 140001)
+   ->  Bitmap Index Scan on updtst_indx1
+         Index Cond: (b = 140001)
+(4 rows)
+
+SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+   b    
+--------
+ 140001
+(1 row)
+
+-- Table must be vacuumed to force index-only scan
+VACUUM updtst_tab1;
+EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Only Scan using updtst_indx1 on updtst_tab1
+   Index Cond: (b = 140001)
+(2 rows)
+
+SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+   b    
+--------
+ 140001
+(1 row)
+
+SET enable_seqscan = true;
+DROP TABLE updtst_tab1;
+------------------
+CREATE TABLE updtst_tab2 (a integer unique, b int, c text, d text) WITH (fillfactor = 80);
+CREATE INDEX updtst_indx2 ON updtst_tab2 (b);
+INSERT INTO updtst_tab2
+   SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar';
+UPDATE updtst_tab2 SET b = b + 700 WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo1' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo2' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo3' WHERE a = 1;
+UPDATE updtst_tab2 SET b = b - 700 WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo4' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo5' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo6' WHERE a = 1;
+SELECT count(*) FROM updtst_tab2 WHERE c = 'foo';
+ count 
+-------
+    99
+(1 row)
+
+SELECT * FROM updtst_tab2 WHERE c = 'foo6';
+ a |  b  |  c   |  d  
+---+-----+------+-----
+ 1 | 701 | foo6 | bar
+(1 row)
+
+EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701;
+                QUERY PLAN                 
+-----------------------------------------
+ Bitmap Heap Scan on updtst_tab2
+   Recheck Cond: (b = 701)
+   ->  Bitmap Index Scan on updtst_indx2
+         Index Cond: (b = 701)
+(4 rows)
+
+SELECT * FROM updtst_tab2 WHERE a = 1;
+ a |  b  |  c   |  d  
+---+-----+------+-----
+ 1 | 701 | foo6 | bar
+(1 row)
+
+SET enable_seqscan = false;
+EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701;
+                QUERY PLAN                 
+-----------------------------------------
+ Bitmap Heap Scan on updtst_tab2
+   Recheck Cond: (b = 701)
+   ->  Bitmap Index Scan on updtst_indx2
+         Index Cond: (b = 701)
+(4 rows)
+
+SELECT * FROM updtst_tab2 WHERE b = 701;
+ a |  b  |  c   |  d  
+---+-----+------+-----
+ 1 | 701 | foo6 | bar
+(1 row)
+
+VACUUM updtst_tab2;
+EXPLAIN (costs off) SELECT b FROM updtst_tab2 WHERE b = 701;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Only Scan using updtst_indx2 on updtst_tab2
+   Index Cond: (b = 701)
+(2 rows)
+
+SELECT b FROM updtst_tab2 WHERE b = 701;
+  b  
+-----
+ 701
+(1 row)
+
+SET enable_seqscan = true;
+DROP TABLE updtst_tab2;
+------------------
+CREATE TABLE updtst_tab3 (a integer unique, b int, c text, d text) WITH (fillfactor = 80);
+CREATE INDEX updtst_indx3 ON updtst_tab3 (b);
+INSERT INTO updtst_tab3
+   SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar';
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo1', b = b + 700 WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo2' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo3' WHERE a = 1;
+UPDATE updtst_tab3 SET b = b - 700 WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo4' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo5' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo6' WHERE a = 1;
+-- Abort the transaction and ensure the original tuple is visible correctly
+ROLLBACK;
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo11', b = b + 750 WHERE b = 701;
+UPDATE updtst_tab3 SET c = 'foo12' WHERE a = 1;
+UPDATE updtst_tab3 SET b = b - 30 WHERE a = 1;
+COMMIT;
+SELECT count(*) FROM updtst_tab3 WHERE c = 'foo';
+ count 
+-------
+    99
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE c = 'foo6';
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE c = 'foo12';
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 1 | 1421 | foo12 | bar
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE b = 701;
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE b = 1421;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 1 | 1421 | foo12 | bar
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE a = 1;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 1 | 1421 | foo12 | bar
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE b = 701;
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE b = 1421;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 1 | 1421 | foo12 | bar
+(1 row)
+
+VACUUM updtst_tab3;
+EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 701;
+       QUERY PLAN        
+-------------------------
+ Seq Scan on updtst_tab3
+   Filter: (b = 701)
+(2 rows)
+
+SELECT b FROM updtst_tab3 WHERE b = 701;
+ b 
+---
+(0 rows)
+
+SELECT b FROM updtst_tab3 WHERE b = 1421;
+  b   
+------
+ 1421
+(1 row)
+
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo21', b = b + 700 WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo23' WHERE a = 2;
+UPDATE updtst_tab3 SET b = b - 700 WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo24' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo25' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo26' WHERE a = 2;
+-- Abort the transaction and ensure the original tuple is visible correctly
+ROLLBACK;
+SET enable_seqscan = false;
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo21', b = b + 750 WHERE b = 702;
+UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2;
+UPDATE updtst_tab3 SET b = b - 30 WHERE a = 2;
+COMMIT;
+SELECT count(*) FROM updtst_tab3 WHERE c = 'foo';
+ count 
+-------
+    98
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE c = 'foo26';
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE c = 'foo22';
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 2 | 1422 | foo22 | bar
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE b = 702;
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE b = 1422;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 2 | 1422 | foo22 | bar
+(1 row)
+
+SELECT * FROM updtst_tab3 WHERE a = 2;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 2 | 1422 | foo22 | bar
+(1 row)
+
+-- Try fetching both the old and new values using updtst_indx3
+SELECT * FROM updtst_tab3 WHERE b = 702;
+ a | b | c | d 
+---+---+---+---
+(0 rows)
+
+SELECT * FROM updtst_tab3 WHERE b = 1422;
+ a |  b   |   c   |  d  
+---+------+-------+-----
+ 2 | 1422 | foo22 | bar
+(1 row)
+
+VACUUM updtst_tab3;
+EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 702;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Only Scan using updtst_indx3 on updtst_tab3
+   Index Cond: (b = 702)
+(2 rows)
+
+SELECT b FROM updtst_tab3 WHERE b = 702;
+ b 
+---
+(0 rows)
+
+SELECT b FROM updtst_tab3 WHERE b = 1422;
+  b   
+------
+ 1422
+(1 row)
+
+SET enable_seqscan = true;
+DROP TABLE updtst_tab3;
+------------------
+CREATE TABLE test_warm (a text unique, b text);
+CREATE INDEX test_warmindx ON test_warm (lower(a));
+INSERT INTO test_warm values ('test', 'foo');
+UPDATE test_warm SET a = 'TEST';
+select *, ctid from test_warm where lower(a) = 'test';
+  a   |  b  | ctid  
+------+-----+-------
+ TEST | foo | (0,2)
+(1 row)
+
+explain select * from test_warm where lower(a) = 'test';
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
+ Bitmap Heap Scan on test_warm  (cost=4.18..12.65 rows=4 width=64)
+   Recheck Cond: (lower(a) = 'test'::text)
+   ->  Bitmap Index Scan on test_warmindx  (cost=0.00..4.18 rows=4 width=0)
+         Index Cond: (lower(a) = 'test'::text)
+(4 rows)
+
+select *, ctid from test_warm where lower(a) = 'test';
+  a   |  b  | ctid  
+------+-----+-------
+ TEST | foo | (0,2)
+(1 row)
+
+select *, ctid from test_warm where a = 'test';
+ a | b | ctid 
+---+---+------
+(0 rows)
+
+select *, ctid from test_warm where a = 'TEST';
+  a   |  b  | ctid  
+------+-----+-------
+ TEST | foo | (0,2)
+(1 row)
+
+set enable_bitmapscan TO false;
+explain select * from test_warm where lower(a) = 'test';
+                                   QUERY PLAN                                    
+---------------------------------------------------------------------------------
+ Index Scan using test_warmindx on test_warm  (cost=0.15..20.22 rows=4 width=64)
+   Index Cond: (lower(a) = 'test'::text)
+(2 rows)
+
+select *, ctid from test_warm where lower(a) = 'test';
+  a   |  b  | ctid  
+------+-----+-------
+ TEST | foo | (0,2)
+(1 row)
+
+DROP TABLE test_warm;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index ea7b5b4..7cc0d21 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -42,6 +42,8 @@ test: create_type
 test: create_table
 test: create_function_2
 
+test: warm
+
 # ----------
 # Load huge amounts of data
 # We should split the data files into single files and then
diff --git a/src/test/regress/sql/warm.sql b/src/test/regress/sql/warm.sql
new file mode 100644
index 0000000..3a078dd
--- /dev/null
+++ b/src/test/regress/sql/warm.sql
@@ -0,0 +1,170 @@
+
+CREATE TABLE updtst_tab1 (a integer unique, b int, c text, d text);
+CREATE INDEX updtst_indx1 ON updtst_tab1 (b);
+INSERT INTO updtst_tab1
+   SELECT generate_series(1,10000), generate_series(70001, 80000), 'foo', 'bar';
+
+-- This update does not change any index key, so it could be HOT, but
+-- the page won't have any free space, so expect a non-HOT update
+UPDATE updtst_tab1 SET c = 'foo1' WHERE a = 1;
+
+-- Next update should be a HOT update as dead space is recycled
+UPDATE updtst_tab1 SET c = 'foo2' WHERE a = 1;
+
+-- And next too
+UPDATE updtst_tab1 SET c = 'foo3' WHERE a = 1;
+
+-- Now update one of the index key columns
+UPDATE updtst_tab1 SET b = b + 70000 WHERE a = 1;
+
+-- Ensure that the correct row is fetched
+SELECT * FROM updtst_tab1 WHERE a = 1;
+SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+
+-- Even when seqscan is disabled and indexscan is forced
+SET enable_seqscan = false;
+EXPLAIN (costs off) SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000;
+
+-- Check if index only scan works correctly
+EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+
+-- Table must be vacuumed to force index-only scan
+VACUUM updtst_tab1;
+EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000;
+
+SET enable_seqscan = true;
+
+DROP TABLE updtst_tab1;
+
+------------------
+
+CREATE TABLE updtst_tab2 (a integer unique, b int, c text, d text) WITH (fillfactor = 80);
+CREATE INDEX updtst_indx2 ON updtst_tab2 (b);
+INSERT INTO updtst_tab2
+   SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar';
+
+UPDATE updtst_tab2 SET b = b + 700 WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo1' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo2' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo3' WHERE a = 1;
+UPDATE updtst_tab2 SET b = b - 700 WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo4' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo5' WHERE a = 1;
+UPDATE updtst_tab2 SET c = 'foo6' WHERE a = 1;
+
+SELECT count(*) FROM updtst_tab2 WHERE c = 'foo';
+SELECT * FROM updtst_tab2 WHERE c = 'foo6';
+
+EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701;
+SELECT * FROM updtst_tab2 WHERE a = 1;
+
+SET enable_seqscan = false;
+EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701;
+SELECT * FROM updtst_tab2 WHERE b = 701;
+
+VACUUM updtst_tab2;
+EXPLAIN (costs off) SELECT b FROM updtst_tab2 WHERE b = 701;
+SELECT b FROM updtst_tab2 WHERE b = 701;
+
+SET enable_seqscan = true;
+
+DROP TABLE updtst_tab2;
+------------------
+
+CREATE TABLE updtst_tab3 (a integer unique, b int, c text, d text) WITH (fillfactor = 80);
+CREATE INDEX updtst_indx3 ON updtst_tab3 (b);
+INSERT INTO updtst_tab3
+   SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar';
+
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo1', b = b + 700 WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo2' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo3' WHERE a = 1;
+UPDATE updtst_tab3 SET b = b - 700 WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo4' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo5' WHERE a = 1;
+UPDATE updtst_tab3 SET c = 'foo6' WHERE a = 1;
+
+-- Abort the transaction and ensure the original tuple is visible correctly
+ROLLBACK;
+
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo11', b = b + 750 WHERE b = 701;
+UPDATE updtst_tab3 SET c = 'foo12' WHERE a = 1;
+UPDATE updtst_tab3 SET b = b - 30 WHERE a = 1;
+COMMIT;
+
+SELECT count(*) FROM updtst_tab3 WHERE c = 'foo';
+SELECT * FROM updtst_tab3 WHERE c = 'foo6';
+SELECT * FROM updtst_tab3 WHERE c = 'foo12';
+
+SELECT * FROM updtst_tab3 WHERE b = 701;
+SELECT * FROM updtst_tab3 WHERE b = 1421;
+SELECT * FROM updtst_tab3 WHERE a = 1;
+
+SELECT * FROM updtst_tab3 WHERE b = 701;
+SELECT * FROM updtst_tab3 WHERE b = 1421;
+
+VACUUM updtst_tab3;
+EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 701;
+SELECT b FROM updtst_tab3 WHERE b = 701;
+SELECT b FROM updtst_tab3 WHERE b = 1421;
+
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo21', b = b + 700 WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo23' WHERE a = 2;
+UPDATE updtst_tab3 SET b = b - 700 WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo24' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo25' WHERE a = 2;
+UPDATE updtst_tab3 SET c = 'foo26' WHERE a = 2;
+
+-- Abort the transaction and ensure the original tuple is visible correctly
+ROLLBACK;
+
+SET enable_seqscan = false;
+
+BEGIN;
+UPDATE updtst_tab3 SET c = 'foo21', b = b + 750 WHERE b = 702;
+UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2;
+UPDATE updtst_tab3 SET b = b - 30 WHERE a = 2;
+COMMIT;
+
+SELECT count(*) FROM updtst_tab3 WHERE c = 'foo';
+SELECT * FROM updtst_tab3 WHERE c = 'foo26';
+SELECT * FROM updtst_tab3 WHERE c = 'foo22';
+
+SELECT * FROM updtst_tab3 WHERE b = 702;
+SELECT * FROM updtst_tab3 WHERE b = 1422;
+SELECT * FROM updtst_tab3 WHERE a = 2;
+
+-- Try fetching both the old and new values using updtst_indx3
+SELECT * FROM updtst_tab3 WHERE b = 702;
+SELECT * FROM updtst_tab3 WHERE b = 1422;
+
+VACUUM updtst_tab3;
+EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 702;
+SELECT b FROM updtst_tab3 WHERE b = 702;
+SELECT b FROM updtst_tab3 WHERE b = 1422;
+
+SET enable_seqscan = true;
+
+DROP TABLE updtst_tab3;
+------------------
+
+CREATE TABLE test_warm (a text unique, b text);
+CREATE INDEX test_warmindx ON test_warm (lower(a));
+INSERT INTO test_warm values ('test', 'foo');
+UPDATE test_warm SET a = 'TEST';
+select *, ctid from test_warm where lower(a) = 'test';
+explain select * from test_warm where lower(a) = 'test';
+select *, ctid from test_warm where lower(a) = 'test';
+select *, ctid from test_warm where a = 'test';
+select *, ctid from test_warm where a = 'TEST';
+set enable_bitmapscan TO false;
+explain select * from test_warm where lower(a) = 'test';
+select *, ctid from test_warm where lower(a) = 'test';
+DROP TABLE test_warm;
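
As a closing illustration of the t_tid flag overloading that these tests exercise, here is a minimal sketch assuming the BTREE_INDEX_WARM_POINTER definition from the nbtree.h hunk and the ItemPointerSetFlags/ItemPointerGetFlags routines that its comment refers to. The helper names below are hypothetical, added only to show the convention; they are not part of the patch:

#include "postgres.h"
#include "access/itup.h"
#include "access/nbtree.h"

/*
 * Sketch only: mark a btree index tuple's heap pointer as a WARM
 * pointer, or test whether it is one.  The flag lives in otherwise
 * unused bits of t_tid.ip_posid, so the TID itself still addresses the
 * root of the HOT/WARM chain.  Helper names here are illustrative.
 */
static inline void
itup_set_warm_pointer(IndexTuple itup)
{
	ItemPointerSetFlags(&itup->t_tid, BTREE_INDEX_WARM_POINTER);
}

static inline bool
itup_is_warm_pointer(IndexTuple itup)
{
	return (ItemPointerGetFlags(&itup->t_tid) & BTREE_INDEX_WARM_POINTER) != 0;
}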