diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index c1122b4..d02539a 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -400,7 +400,6 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, Buffer buf; Page page; - _hash_getlock(rel, blkno, HASH_SHARE); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); page = BufferGetPage(buf); @@ -431,7 +430,6 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, } _hash_relbuf(rel, buf); - _hash_droplock(rel, blkno, HASH_SHARE); } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 49a6c81..861dbc8 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -407,12 +407,15 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); so->hashso_bucket_valid = false; - so->hashso_bucket_blkno = 0; so->hashso_curbuf = InvalidBuffer; + so->hashso_bucket_buf = InvalidBuffer; + so->hashso_old_bucket_buf = InvalidBuffer; /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); ItemPointerSetInvalid(&(so->hashso_heappos)); + so->hashso_skip_moved_tuples = false; + scan->opaque = so; /* register scan in case we change pages it's using */ @@ -436,10 +439,15 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, _hash_dropbuf(rel, so->hashso_curbuf); so->hashso_curbuf = InvalidBuffer; - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + /* release pin we hold on old primary bucket */ + if (BufferIsValid(so->hashso_old_bucket_buf)) + _hash_dropbuf(rel, so->hashso_old_bucket_buf); + so->hashso_old_bucket_buf = InvalidBuffer; + + /* release pin we hold on primary bucket */ + if (BufferIsValid(so->hashso_bucket_buf)) + _hash_dropbuf(rel, so->hashso_bucket_buf); + so->hashso_bucket_buf = InvalidBuffer; /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); @@ -453,6 +461,8 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scan->numberOfKeys * sizeof(ScanKeyData)); so->hashso_bucket_valid = false; } + + so->hashso_skip_moved_tuples = false; } /* @@ -472,10 +482,15 @@ hashendscan(IndexScanDesc scan) _hash_dropbuf(rel, so->hashso_curbuf); so->hashso_curbuf = InvalidBuffer; - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + /* release pin we hold on old primary bucket */ + if (BufferIsValid(so->hashso_old_bucket_buf)) + _hash_dropbuf(rel, so->hashso_old_bucket_buf); + so->hashso_old_bucket_buf = InvalidBuffer; + + /* release pin we hold on primary bucket */ + if (BufferIsValid(so->hashso_bucket_buf)) + _hash_dropbuf(rel, so->hashso_bucket_buf); + so->hashso_bucket_buf = InvalidBuffer; pfree(so); scan->opaque = NULL; @@ -486,6 +501,9 @@ hashendscan(IndexScanDesc scan) * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * + * This function also delete the tuples that are moved by split to other + * bucket. + * * Result: a palloc'd struct containing statistical info for VACUUM displays. 
*/ IndexBulkDeleteResult * @@ -530,83 +548,60 @@ loop_top: { BlockNumber bucket_blkno; BlockNumber blkno; - bool bucket_dirty = false; + Buffer bucket_buf; + Buffer buf; + HashPageOpaque bucket_opaque; + Page page; + bool bucket_has_garbage = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); - /* Exclusive-lock the bucket so we can shrink it */ - _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); - /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); - /* Scan each page in bucket */ blkno = bucket_blkno; - while (BlockNumberIsValid(blkno)) - { - Buffer buf; - Page page; - HashPageOpaque opaque; - OffsetNumber offno; - OffsetNumber maxoffno; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; - - vacuum_delay_point(); - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, - info->strategy); - page = BufferGetPage(buf); - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == cur_bucket); - - /* Scan each tuple in page */ - maxoffno = PageGetMaxOffsetNumber(page); - for (offno = FirstOffsetNumber; - offno <= maxoffno; - offno = OffsetNumberNext(offno)) - { - IndexTuple itup; - ItemPointer htup; + /* + * Maintain a cleanup lock on primary bucket till we scan all the + * pages in bucket. This is required to ensure that we don't delete + * tuples which are needed for concurrent scans on buckets where split + * is in progress. Retaining it till end of bucket scan ensures that + * concurrent split can't be started on it. In future, we might want + * to relax the requirement for vacuum to take cleanup lock only for + * buckets where split is in progress, however for squeeze phase we + * need a cleanup lock, otherwise squeeze will move the tuples to a + * different location and that can lead to change in order of results. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); + LockBufferForCleanup(buf); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); - itup = (IndexTuple) PageGetItem(page, - PageGetItemId(page, offno)); - htup = &(itup->t_tid); - if (callback(htup, callback_state)) - { - /* mark the item for deletion */ - deletable[ndeletable++] = offno; - tuples_removed += 1; - } - else - num_index_tuples += 1; - } + page = BufferGetPage(buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); - /* - * Apply deletions and write page if needed, advance to next page. - */ - blkno = opaque->hasho_nextblkno; + /* + * If the bucket contains tuples that are moved by split, then we need + * to delete such tuples on completion of split. The cleanup lock on + * bucket is not sufficient to detect whether a split is complete, as + * the previous split could have been interrupted by cancel request or + * error. 
+ */ + if (H_HAS_GARBAGE(bucket_opaque) && + !H_INCOMPLETE_SPLIT(bucket_opaque)) + bucket_has_garbage = true; - if (ndeletable > 0) - { - PageIndexMultiDelete(page, deletable, ndeletable); - _hash_wrtbuf(rel, buf); - bucket_dirty = true; - } - else - _hash_relbuf(rel, buf); - } + bucket_buf = buf; - /* If we deleted anything, try to compact free space */ - if (bucket_dirty) - _hash_squeezebucket(rel, cur_bucket, bucket_blkno, - info->strategy); + hashbucketcleanup(rel, bucket_buf, blkno, info->strategy, + local_metapage.hashm_maxbucket, + local_metapage.hashm_highmask, + local_metapage.hashm_lowmask, &tuples_removed, + &num_index_tuples, bucket_has_garbage, true, + callback, callback_state); - /* Release bucket lock */ - _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); + _hash_relbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; @@ -687,6 +682,155 @@ hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) return stats; } +/* + * Helper function to perform deletion of index entries from a bucket. + * + * This expects that the caller has acquired a cleanup lock on the target + * bucket (primary page of a bucket) and it is reponsibility of caller to + * release that lock. + */ +void +hashbucketcleanup(Relation rel, Buffer bucket_buf, + BlockNumber bucket_blkno, + BufferAccessStrategy bstrategy, + uint32 maxbucket, + uint32 highmask, uint32 lowmask, + double *tuples_removed, + double *num_index_tuples, + bool bucket_has_garbage, + bool delay, + IndexBulkDeleteCallback callback, + void *callback_state) +{ + BlockNumber blkno; + Buffer buf; + Bucket cur_bucket; + Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY; + Page page; + bool bucket_dirty = false; + + blkno = bucket_blkno; + buf = bucket_buf; + page = BufferGetPage(buf); + cur_bucket = ((HashPageOpaque) PageGetSpecialPointer(page))->hasho_bucket; + + if (bucket_has_garbage) + new_bucket = _hash_get_newbucket(rel, cur_bucket, + lowmask, maxbucket); + + /* Scan each page in bucket */ + for (;;) + { + HashPageOpaque opaque; + OffsetNumber offno; + OffsetNumber maxoffno; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + bool release_buf = false; + + if (delay) + vacuum_delay_point(); + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* Scan each tuple in page */ + maxoffno = PageGetMaxOffsetNumber(page); + for (offno = FirstOffsetNumber; + offno <= maxoffno; + offno = OffsetNumberNext(offno)) + { + IndexTuple itup; + ItemPointer htup; + Bucket bucket; + + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, offno)); + htup = &(itup->t_tid); + if (callback && callback(htup, callback_state)) + { + /* mark the item for deletion */ + deletable[ndeletable++] = offno; + tuples_removed += 1; + } + else if (bucket_has_garbage) + { + /* delete the tuples that are moved by split. */ + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, + highmask, + lowmask); + /* mark the item for deletion */ + if (bucket != cur_bucket) + { + /* + * We expect tuples to either belong to curent bucket or + * new_bucket. This is ensured because we don't allow + * further splits from bucket that contains garbage. See + * comments in _hash_expandtable. + */ + Assert(bucket == new_bucket); + deletable[ndeletable++] = offno; + } + } + else + num_index_tuples += 1; + } + + /* + * We don't release the lock on primary bucket till end of bucket + * scan. 
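Whether a leftover tuple in the old bucket counts as "moved by split" garbage is decided by re-running the key-to-bucket mapping with the metapage values passed into hashbucketcleanup (maxbucket, highmask, lowmask). A minimal standalone sketch of that mapping, mirroring _hash_hashkey2bucket(), with a made-up hash value to show how one key's bucket assignment changes across a split:

#include <stdint.h>
#include <stdio.h>

/* same computation as _hash_hashkey2bucket() in hashutil.c */
static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
			   uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket = bucket & lowmask;	/* bucket doesn't exist yet; use lower half */
	return bucket;
}

int
main(void)
{
	/* hypothetical state: buckets 0..4, i.e. bucket 4 was just split off bucket 0 */
	uint32_t	maxbucket = 4,
				highmask = 7,
				lowmask = 3;
	uint32_t	key = 0x0000002c;	/* made-up hash value; low bits are 101100 */

	/*
	 * Before the split (maxbucket = 3) this key mapped to bucket 0; with
	 * maxbucket = 4 it maps to bucket 4, so a copy left behind in bucket 0
	 * is exactly the kind of tuple the cleanup code above deletes.
	 */
	printf("old mapping: %u\n", (unsigned) hashkey2bucket(key, 3, 3, 1));
	printf("new mapping: %u\n", (unsigned) hashkey2bucket(key, maxbucket, highmask, lowmask));
	return 0;
}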
+ */ + if (blkno != bucket_blkno) + release_buf = true; + + blkno = opaque->hasho_nextblkno; + + /* + * Apply deletions and write page if needed, advance to next page. + */ + if (ndeletable > 0) + { + PageIndexMultiDelete(page, deletable, ndeletable); + if (release_buf) + _hash_wrtbuf(rel, buf); + else + MarkBufferDirty(buf); + bucket_dirty = true; + } + else if (release_buf) + _hash_relbuf(rel, buf); + + /* bail out if there are no more pages to scan. */ + if (!BlockNumberIsValid(blkno)) + break; + + buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + } + + /* + * Clear the garbage flag from bucket after deleting the tuples that are + * moved by split. We purposefully clear the flag before squeeze bucket, + * so that after restart, vacuum shouldn't again try to delete the moved + * by split tuples. + */ + if (bucket_has_garbage) + { + HashPageOpaque bucket_opaque; + + page = BufferGetPage(bucket_buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + bucket_opaque->hasho_flag &= ~LH_BUCKET_PAGE_HAS_GARBAGE; + } + + /* If we deleted anything, try to compact free space */ + if (bucket_dirty) + _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, + bstrategy); +} void hash_redo(XLogReaderState *record) diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index acd2e64..e7a7b51 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -18,6 +18,8 @@ #include "access/hash.h" #include "utils/rel.h" +static void + _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Buffer nbuf); /* * _hash_doinsert() -- Handle insertion of a single index tuple. @@ -28,7 +30,8 @@ void _hash_doinsert(Relation rel, IndexTuple itup) { - Buffer buf; + Buffer buf = InvalidBuffer; + Buffer bucket_buf; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; @@ -70,51 +73,136 @@ _hash_doinsert(Relation rel, IndexTuple itup) errhint("Values larger than a buffer page cannot be indexed."))); /* - * Loop until we get a lock on the correct target bucket. + * Conditionally get the lock on primary bucket page for insertion while + * holding lock on meta page. If we have to wait, then release the meta + * page lock and retry it in a hard way. */ - for (;;) - { - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); - blkno = BUCKET_TO_BLKNO(metap, bucket); + blkno = BUCKET_TO_BLKNO(metap, bucket); - /* Release metapage lock, but keep pin. */ + /* Fetch the primary bucket page for the bucket */ + buf = ReadBuffer(rel, blkno); + if (!ConditionalLockBuffer(buf)) + { + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(buf, HASH_WRITE); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); + oldblkno = blkno; + retry = true; + } + else + { + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + } + if (retry) + { /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. + * Loop until we get a lock on the correct target bucket. 
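The conditional-lock dance above avoids sleeping on a bucket page while the metapage lock is held: try the bucket lock without waiting, and only if that fails drop the metapage lock, wait for the bucket lock, re-take the metapage lock, and fall into the retry loop that re-derives the target bucket. A rough pthread analogue of the same ordering trick (the mutex names and the target_bucket variable are invented for illustration; they are not PostgreSQL APIs):

#include <pthread.h>

/* toy stand-ins for the meta page lock and one bucket page lock */
static pthread_mutex_t meta_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static int	target_bucket;		/* state that meta_lock protects */

void
lock_bucket_for_insert(void)
{
	pthread_mutex_lock(&meta_lock);

	int			saved_target = target_bucket;

	if (pthread_mutex_trylock(&bucket_lock) != 0)
	{
		/*
		 * Would block: drop meta_lock first, so we never sleep on a bucket
		 * lock while holding the meta lock, then wait, then re-take
		 * meta_lock.
		 */
		pthread_mutex_unlock(&meta_lock);
		pthread_mutex_lock(&bucket_lock);
		pthread_mutex_lock(&meta_lock);

		if (saved_target != target_bucket)
		{
			/*
			 * The mapping changed while we slept; the real code loops here,
			 * recomputing the target bucket until the page it holds locked
			 * is still the right one.
			 */
		}
	}

	pthread_mutex_unlock(&meta_lock);	/* keep only the bucket lock */
}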
We get the + * lock on primary bucket page and retain the pin on it during insert + * operation to prevent the concurrent splits. Retaining pin on a + * primary bucket page ensures that split can't happen as it needs to + * acquire the cleanup lock on primary bucket page. Acquiring lock on + * primary bucket and rechecking if it is a target bucket is mandatory + * as otherwise a concurrent split might cause this insertion to fall + * in wrong bucket. */ - if (retry) + for (;;) { - if (oldblkno == blkno) - break; - _hash_droplock(rel, oldblkno, HASH_SHARE); - } - _hash_getlock(rel, blkno, HASH_SHARE); + /* + * Compute the target bucket number, and convert to block number. + */ + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); - /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. - */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - retry = true; + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Release metapage lock, but keep pin. */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + /* + * If the previous iteration of this loop locked what is still the + * correct target bucket, we are done. Otherwise, drop any old + * lock and lock what now appears to be the correct bucket. + */ + if (retry) + { + if (oldblkno == blkno) + break; + _hash_relbuf(rel, buf); + } + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); + + /* + * Reacquire metapage lock and check that no bucket split has + * taken place while we were awaiting the bucket lock. + */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); + oldblkno = blkno; + retry = true; + } } - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); + /* remember the primary bucket buffer to release the pin on it at end. */ + bucket_buf = buf; + page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_bucket == bucket); + /* + * if there is any pending split, finish it before proceeding for the + * insertion as insertion can cause a new split. We don't want to allow + * split from a bucket where there is a pending split as there is no + * apparent benefit by doing so and it will make the code complicated to + * finish the split that involves multiple buckets considering the case + * where new split can also fail. + */ + if (H_NEW_INCOMPLETE_SPLIT(pageopaque)) + { + BlockNumber oblkno; + Buffer obuf; + + oblkno = _hash_get_oldblk(rel, pageopaque); + + /* Fetch the primary bucket page for the bucket */ + obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_BUCKET_PAGE); + + _hash_finish_split(rel, metabuf, obuf, buf); + + /* + * release the buffer here as the insertion will happen in new bucket. + */ + _hash_relbuf(rel, obuf); + } + else if (H_OLD_INCOMPLETE_SPLIT(pageopaque)) + { + BlockNumber nblkno; + Buffer nbuf; + + nblkno = _hash_get_newblk(rel, pageopaque); + + /* Fetch the primary bucket page for the bucket */ + nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE, LH_BUCKET_PAGE); + + _hash_finish_split(rel, metabuf, buf, nbuf); + + /* + * release the buffer here as the insertion will happen in old bucket. 
+ */ + _hash_relbuf(rel, nbuf); + } + /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { @@ -127,14 +215,23 @@ _hash_doinsert(Relation rel, IndexTuple itup) { /* * ovfl page exists; go get it. if it doesn't have room, we'll - * find out next pass through the loop test above. + * find out next pass through the loop test above. Retain the pin + * if it is a primary bucket. */ - _hash_relbuf(rel, buf); + if (pageopaque->hasho_flag & LH_BUCKET_PAGE) + _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); + else + _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } else { + bool retain_pin = false; + + /* page flags must be accessed before releasing lock on a page. */ + retain_pin = pageopaque->hasho_flag & LH_BUCKET_PAGE; + /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. @@ -144,7 +241,7 @@ _hash_doinsert(Relation rel, IndexTuple itup) _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); /* chain to a new overflow page */ - buf = _hash_addovflpage(rel, metabuf, buf); + buf = _hash_addovflpage(rel, metabuf, buf, retain_pin); page = BufferGetPage(buf); /* should fit now, given test above */ @@ -158,11 +255,13 @@ _hash_doinsert(Relation rel, IndexTuple itup) /* found page with enough space, so add the item here */ (void) _hash_pgaddtup(rel, buf, itemsz, itup); - /* write and release the modified page */ + /* + * write and release the modified page and ensure to release the pin on + * primary page. + */ _hash_wrtbuf(rel, buf); - - /* We can drop the bucket lock now */ - _hash_droplock(rel, blkno, HASH_SHARE); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); /* * Write-lock the metapage so we can increment the tuple count. After @@ -188,6 +287,127 @@ _hash_doinsert(Relation rel, IndexTuple itup) } /* + * _hash_finish_split() -- Finish the previously interrupted split operation + * + * To complete the split operation, we form the hash table of TIDs in new + * bucket which is then used by split operation to skip tuples that are + * already moved before the split operation was previously interruptted. + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. (The metapage is only + * touched if it becomes necessary to add or remove overflow pages.) + * + * 'obuf' and 'nbuf' must be locked by the caller which is also responsible + * for unlocking it. 
+ */ +static void +_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Buffer nbuf) +{ + HASHCTL hash_ctl; + HTAB *tidhtab; + Buffer bucket_nbuf; + Page opage; + Page npage; + HashPageOpaque opageopaque; + HashPageOpaque npageopaque; + HashMetaPage metap; + Bucket obucket; + Bucket nbucket; + uint32 maxbucket; + uint32 highmask; + uint32 lowmask; + bool found; + + /* Initialize hash tables used to track TIDs */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(ItemPointerData); + hash_ctl.entrysize = sizeof(ItemPointerData); + hash_ctl.hcxt = CurrentMemoryContext; + + tidhtab = + hash_create("bucket ctids", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* + * Scan the new bucket and build hash table of TIDs + */ + bucket_nbuf = nbuf; + npage = BufferGetPage(nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + for (;;) + { + BlockNumber nblkno; + OffsetNumber noffnum; + OffsetNumber nmaxoffnum; + + /* Scan each tuple in new page */ + nmaxoffnum = PageGetMaxOffsetNumber(npage); + for (noffnum = FirstOffsetNumber; + noffnum <= nmaxoffnum; + noffnum = OffsetNumberNext(noffnum)) + { + IndexTuple itup; + + /* Fetch the item's TID and insert it in hash table. */ + itup = (IndexTuple) PageGetItem(npage, + PageGetItemId(npage, noffnum)); + + (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); + + Assert(!found); + } + + nblkno = npageopaque->hasho_nextblkno; + + /* + * release our write lock without modifying buffer and ensure to + * retain the pin on primary bucket. + */ + if (nbuf == bucket_nbuf) + _hash_chgbufaccess(rel, nbuf, HASH_READ, HASH_NOLOCK); + else + _hash_relbuf(rel, nbuf); + + /* Exit loop if no more overflow pages in new bucket */ + if (!BlockNumberIsValid(nblkno)) + break; + + /* Else, advance to next page */ + nbuf = _hash_getbuf(rel, nblkno, HASH_READ, LH_OVERFLOW_PAGE); + npage = BufferGetPage(nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + } + + /* Get the metapage info */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + _hash_chgbufaccess(rel, bucket_nbuf, HASH_NOLOCK, HASH_WRITE); + + npage = BufferGetPage(bucket_nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nbucket = npageopaque->hasho_bucket; + + opage = BufferGetPage(obuf); + opageopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + obucket = opageopaque->hasho_bucket; + + _hash_splitbucket_guts(rel, metabuf, obucket, + nbucket, obuf, bucket_nbuf, tidhtab, + maxbucket, highmask, lowmask); + + hash_destroy(tidhtab); +} + +/* * _hash_pgaddtup() -- add a tuple to a particular page in the index. * * This routine adds the tuple to the page as requested; it does not write out diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index db3e268..760563a 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -82,23 +82,20 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' - * anymore). The returned overflow page will be pinned and write-locked; - * it is guaranteed to be empty. + * anymore) if not asked to retain. 
The pin will be retained only for the + * primary bucket. The returned overflow page will be pinned and + * write-locked; it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * - * The caller must hold at least share lock on the bucket, to ensure that - * no one else tries to compact the bucket meanwhile. This guarantees that - * 'buf' won't stop being part of the bucket while it's unlocked. - * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer -_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) +_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) { Buffer ovflbuf; Page page; @@ -131,7 +128,10 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) break; /* we assume we do not need to write the unmodified page */ - _hash_relbuf(rel, buf); + if ((pageopaque->hasho_flag & LH_BUCKET_PAGE) && retain_pin) + _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); + else + _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } @@ -149,7 +149,10 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) /* logically chain overflow page to previous page */ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); - _hash_wrtbuf(rel, buf); + if ((pageopaque->hasho_flag & LH_BUCKET_PAGE) && retain_pin) + _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK); + else + _hash_wrtbuf(rel, buf); return ovflbuf; } @@ -370,11 +373,11 @@ _hash_firstfreebit(uint32 map) * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on either page that's - * adjacent in the bucket chain. The caller had better hold exclusive lock - * on the bucket, too. + * adjacent in the bucket chain except from primary bucket. The caller had + * better hold cleanup lock on the primary bucket. */ BlockNumber -_hash_freeovflpage(Relation rel, Buffer ovflbuf, +_hash_freeovflpage(Relation rel, Buffer ovflbuf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy) { HashMetaPage metap; @@ -413,22 +416,41 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being - * deleted. No concurrency issues since we hold exclusive lock on the - * entire bucket. + * deleted. No concurrency issues since we hold the cleanup lock on + * primary bucket. We don't need to aqcuire buffer lock to fix the + * primary bucket, as we already have that lock. 
*/ if (BlockNumberIsValid(prevblkno)) { - Buffer prevbuf = _hash_getbuf_with_strategy(rel, - prevblkno, - HASH_WRITE, + if (prevblkno == bucket_blkno) + { + Buffer prevbuf = ReadBufferExtended(rel, MAIN_FORKNUM, + prevblkno, + RBM_NORMAL, + bstrategy); + + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + ReleaseBuffer(prevbuf); + } + else + { + Buffer prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, - bstrategy); - Page prevpage = BufferGetPage(prevbuf); - HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + bstrategy); + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); - Assert(prevopaque->hasho_bucket == bucket); - prevopaque->hasho_nextblkno = nextblkno; - _hash_wrtbuf(rel, prevbuf); + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + _hash_wrtbuf(rel, prevbuf); + } } if (BlockNumberIsValid(nextblkno)) { @@ -570,7 +592,7 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * - * Caller must hold exclusive lock on the target bucket. This allows + * Caller must hold cleanup lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. * * Since this function is invoked in VACUUM, we provide an access strategy @@ -580,6 +602,7 @@ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, + Buffer bucket_buf, BufferAccessStrategy bstrategy) { BlockNumber wblkno; @@ -591,27 +614,22 @@ _hash_squeezebucket(Relation rel, HashPageOpaque wopaque; HashPageOpaque ropaque; bool wbuf_dirty; + bool release_buf = false; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; - wbuf = _hash_getbuf_with_strategy(rel, - wblkno, - HASH_WRITE, - LH_BUCKET_PAGE, - bstrategy); + wbuf = bucket_buf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* - * if there aren't any overflow pages, there's nothing to squeeze. + * if there aren't any overflow pages, there's nothing to squeeze. caller + * is responsible to release the lock on primary bucket. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) - { - _hash_relbuf(rel, wbuf); return; - } /* * Find the last page in the bucket chain by starting at the base bucket @@ -669,12 +687,17 @@ _hash_squeezebucket(Relation rel, { Assert(!PageIsEmpty(wpage)); + if (wblkno != bucket_blkno) + release_buf = true; + wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); - if (wbuf_dirty) + if (wbuf_dirty && release_buf) _hash_wrtbuf(rel, wbuf); - else + else if (wbuf_dirty) + MarkBufferDirty(wbuf); + else if (release_buf) _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ @@ -700,6 +723,7 @@ _hash_squeezebucket(Relation rel, wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); wbuf_dirty = false; + release_buf = false; } /* @@ -733,19 +757,25 @@ _hash_squeezebucket(Relation rel, /* are we freeing the page adjacent to wbuf? 
*/ if (rblkno == wblkno) { - /* yes, so release wbuf lock first */ - if (wbuf_dirty) + if (wblkno != bucket_blkno) + release_buf = true; + + /* yes, so release wbuf lock first if needed */ + if (wbuf_dirty && release_buf) _hash_wrtbuf(rel, wbuf); - else + else if (wbuf_dirty) + MarkBufferDirty(wbuf); + else if (release_buf) _hash_relbuf(rel, wbuf); + /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf, bstrategy); + _hash_freeovflpage(rel, rbuf, bucket_blkno, bstrategy); /* done */ return; } /* free this overflow page, then get the previous one */ - _hash_freeovflpage(rel, rbuf, bstrategy); + _hash_freeovflpage(rel, rbuf, bucket_blkno, bstrategy); rbuf = _hash_getbuf_with_strategy(rel, rblkno, diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 178463f..83007ac 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -38,7 +38,7 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks); static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, - BlockNumber start_oblkno, + Buffer obuf, Buffer nbuf, uint32 maxbucket, uint32 highmask, uint32 lowmask); @@ -55,46 +55,6 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, /* - * _hash_getlock() -- Acquire an lmgr lock. - * - * 'whichlock' should the block number of a bucket's primary bucket page to - * acquire the per-bucket lock. (See README for details of the use of these - * locks.) - * - * 'access' must be HASH_SHARE or HASH_EXCLUSIVE. - */ -void -_hash_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - LockPage(rel, whichlock, access); -} - -/* - * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free. - * - * Same as above except we return FALSE without blocking if lock isn't free. - */ -bool -_hash_try_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - return ConditionalLockPage(rel, whichlock, access); - else - return true; -} - -/* - * _hash_droplock() -- Release an lmgr lock. - */ -void -_hash_droplock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - UnlockPage(rel, whichlock, access); -} - -/* * _hash_getbuf() -- Get a buffer by block number for read or write. * * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. @@ -489,9 +449,11 @@ _hash_pageinit(Page page, Size size) /* * Attempt to expand the hash table by creating one new bucket. * - * This will silently do nothing if it cannot get the needed locks. + * This will silently do nothing if there are active scans of our own + * backend or if we don't get cleanup lock on old bucket. * - * The caller should hold no locks on the hash index. + * We do remove the tuples from old bucket, if there are any left over from + * previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. @@ -506,10 +468,15 @@ _hash_expandtable(Relation rel, Buffer metabuf) BlockNumber start_oblkno; BlockNumber start_nblkno; Buffer buf_nblkno; + Buffer buf_oblkno; + Page opage; + HashPageOpaque oopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; +restart_expand: + /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. 
@@ -548,11 +515,15 @@ _hash_expandtable(Relation rel, Buffer metabuf) goto fail; /* - * Determine which bucket is to be split, and attempt to lock the old - * bucket. If we can't get the lock, give up. + * Determine which bucket is to be split, and attempt to take cleanup lock + * on the old bucket. If we can't get the lock, give up. * - * The lock protects us against other backends, but not against our own - * backend. Must check for active scans separately. + * The cleanup lock protects us against other backends, but not against + * our own backend. Must check for active scans separately. + * + * The cleanup lock is mainly to protect the split from concurrent + * inserts, however if there is any pending scan it will give up which is + * not good, but harmless. */ new_bucket = metap->hashm_maxbucket + 1; @@ -563,11 +534,50 @@ _hash_expandtable(Relation rel, Buffer metabuf) if (_hash_has_active_scan(rel, old_bucket)) goto fail; - if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) + buf_oblkno = ReadBuffer(rel, start_oblkno); + if (!ConditionalLockBufferForCleanup(buf_oblkno)) + { + ReleaseBuffer(buf_oblkno); goto fail; + } + _hash_checkpage(rel, buf_oblkno, LH_BUCKET_PAGE); + + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* we don't expect any pending split at this stage. */ + Assert(!H_INCOMPLETE_SPLIT(oopaque)); + + /* + * Clean the tuples remained from previous split. This operation requires + * cleanup lock and we already have one on old bucket, so let's do it. We + * also don't want to allow further splits from the bucket till the + * garbage of previous split is cleaned. This has two advantages, first + * it helps in avoiding the bloat due to garbage and second is, during + * cleanup of bucket, we are always sure that the garbage tuples belong to + * most recently splitted bucket. On the contrary, if we allow cleanup of + * bucket after meta page is updated to indicate the new split and before + * the actual split, the cleanup operation won't be able to decide whether + * the tuple has been moved to the newly created bucket and ended up + * deleting such tuples. + */ + if (H_HAS_GARBAGE(oopaque)) + { + /* Release the metapage lock. */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + hashbucketcleanup(rel, buf_oblkno, start_oblkno, NULL, + metap->hashm_maxbucket, metap->hashm_highmask, + metap->hashm_lowmask, NULL, + NULL, true, false, NULL, NULL); + + _hash_relbuf(rel, buf_oblkno); + + goto restart_expand; + } /* - * Likewise lock the new bucket (should never fail). + * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because @@ -579,9 +589,6 @@ _hash_expandtable(Relation rel, Buffer metabuf) if (_hash_has_active_scan(rel, new_bucket)) elog(ERROR, "scan in progress on supposedly new bucket"); - if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) - elog(ERROR, "could not get lock on supposedly new bucket"); - /* * If the split point is increasing (hashm_maxbucket's log base 2 * increases), we need to allocate a new batch of bucket pages. 
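The masks consulted throughout this code come from the metapage and advance in lock step with maxbucket: whenever the new bucket number crosses the current highmask, the whole old table becomes the lower half (lowmask takes the old highmask value) and highmask doubles. A small sketch of that progression; the two-bucket starting values are an assumption for illustration and are not taken from this patch:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* assumed starting point: a two-bucket index (buckets 0 and 1) */
	uint32_t	maxbucket = 1,
				lowmask = 1,
				highmask = 3;

	for (uint32_t new_bucket = 2; new_bucket <= 10; new_bucket++)
	{
		maxbucket = new_bucket;
		if (new_bucket > highmask)
		{
			/* starting a new doubling: the old table becomes the lower half */
			lowmask = highmask;
			highmask = new_bucket | lowmask;
		}
		printf("maxbucket=%2u lowmask=%u highmask=%u\n",
			   (unsigned) maxbucket, (unsigned) lowmask, (unsigned) highmask);
	}
	return 0;
}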
@@ -600,8 +607,7 @@ _hash_expandtable(Relation rel, Buffer metabuf) if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { /* can't split due to BlockNumber overflow */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + _hash_relbuf(rel, buf_oblkno); goto fail; } } @@ -609,7 +615,8 @@ _hash_expandtable(Relation rel, Buffer metabuf) /* * Physically allocate the new bucket's primary page. We want to do this * before changing the metapage's mapping info, in case we can't get the - * disk space. + * disk space. We don't need to take cleanup lock on new bucket as no + * other backend could find this bucket unless meta page is updated. */ buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); @@ -665,13 +672,9 @@ _hash_expandtable(Relation rel, Buffer metabuf) /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, - start_oblkno, buf_nblkno, + buf_oblkno, buf_nblkno, maxbucket, highmask, lowmask); - /* Release bucket locks, allowing others to access them */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); - return; /* Here if decide not to split or fail to acquire old bucket lock */ @@ -745,6 +748,10 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * + * Split needs to hold pin on primary bucket pages of both old and new + * buckets till end of operation. This is to prevent vacuum to start + * when split is in progress. + * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. That lock and * pin are released here. (The API is set up this way because we must do @@ -756,37 +763,87 @@ _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, - BlockNumber start_oblkno, + Buffer obuf, Buffer nbuf, uint32 maxbucket, uint32 highmask, uint32 lowmask) { - Buffer obuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; - /* - * It should be okay to simultaneously write-lock pages from each bucket, - * since no one else can be trying to acquire buffer lock on pages of - * either bucket. - */ - obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + /* + * Mark the old bucket to indicate that split is in progress and it has + * deletable tuples. At operation end, we clear split in progress flag and + * vacuum will clear page_has_garbage flag after deleting such tuples. + */ + oopaque->hasho_flag |= LH_BUCKET_PAGE_HAS_GARBAGE | LH_BUCKET_OLD_PAGE_SPLIT; + npage = BufferGetPage(nbuf); - /* initialize the new bucket's primary page */ + /* + * initialize the new bucket's primary page and mark it to indicate that + * split is in progress. + */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; - nopaque->hasho_flag = LH_BUCKET_PAGE; + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_NEW_PAGE_SPLIT; nopaque->hasho_page_id = HASHO_PAGE_ID; + _hash_splitbucket_guts(rel, metabuf, obucket, + nbucket, obuf, nbuf, NULL, + maxbucket, highmask, lowmask); + + /* all done, now release the locks and pins on primary buckets. 
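The three new page flags carry the split protocol: before any tuple is copied the old bucket is marked both split-in-progress and containing garbage, the new bucket is marked split-in-progress, the two split flags are cleared when _hash_splitbucket_guts finishes, and the garbage flag survives until cleanup has actually deleted the moved tuples. A compact sketch of that lifecycle, using the flag values this patch adds to hash.h:

#include <stdint.h>
#include <stdio.h>

#define LH_BUCKET_NEW_PAGE_SPLIT	(1 << 4)
#define LH_BUCKET_OLD_PAGE_SPLIT	(1 << 5)
#define LH_BUCKET_PAGE_HAS_GARBAGE	(1 << 6)

int
main(void)
{
	uint16_t	old_flags = 0;	/* hasho_flag of the bucket being split */
	uint16_t	new_flags = 0;	/* hasho_flag of the bucket being populated */

	/* _hash_splitbucket: mark both buckets before moving any tuple */
	old_flags |= LH_BUCKET_PAGE_HAS_GARBAGE | LH_BUCKET_OLD_PAGE_SPLIT;
	new_flags |= LH_BUCKET_NEW_PAGE_SPLIT;

	/* end of _hash_splitbucket_guts: the split itself is complete */
	old_flags &= ~LH_BUCKET_OLD_PAGE_SPLIT;
	new_flags &= ~LH_BUCKET_NEW_PAGE_SPLIT;

	/*
	 * hashbucketcleanup: the garbage flag is cleared only after the
	 * moved-by-split tuples have been deleted from the old bucket (and
	 * before the squeeze phase), so a restarted vacuum does not try to
	 * delete them again.
	 */
	old_flags &= ~LH_BUCKET_PAGE_HAS_GARBAGE;

	printf("old=%#x new=%#x\n", old_flags, new_flags);
	return 0;
}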
*/ + _hash_relbuf(rel, obuf); + _hash_relbuf(rel, nbuf); +} + +/* + * _hash_splitbucket_guts -- Helper function to perform the split operation + * + * This routine is used to partition the tuples between old and new bucket and + * is used to finish the incomplete split operations. To finish the previously + * interrupted split operation, caller needs to fill htab. If htab is set, then + * we skip the movement of tuples that exists in htab, otherwise NULL value of + * htab indicates movement of all the tuples that belong to new bucket. + * + * Caller needs to lock and unlock the old and new primary buckets. + */ +void +_hash_splitbucket_guts(Relation rel, + Buffer metabuf, + Bucket obucket, + Bucket nbucket, + Buffer obuf, + Buffer nbuf, + HTAB *htab, + uint32 maxbucket, + uint32 highmask, + uint32 lowmask) +{ + Buffer bucket_obuf; + Buffer bucket_nbuf; + Page opage; + Page npage; + HashPageOpaque oopaque; + HashPageOpaque nopaque; + + bucket_obuf = obuf; + opage = BufferGetPage(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + bucket_nbuf = nbuf; + npage = BufferGetPage(nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and @@ -798,8 +855,6 @@ _hash_splitbucket(Relation rel, BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); @@ -810,18 +865,45 @@ _hash_splitbucket(Relation rel, IndexTuple itup; Size itemsz; Bucket bucket; + bool found = false; /* - * Fetch the item's hash key (conveniently stored in the item) and - * determine which bucket it now belongs in. + * Before inserting tuple, probe the hash table containing TIDs of + * tuples belonging to new bucket, if we find a match, then skip + * that tuple, else fetch the item's hash key (conveniently stored + * in the item) and determine which bucket it now belongs in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); + + if (htab) + (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); + + if (found) + continue; + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { + Size itupsize = 0; + IndexTuple new_itup; + + /* + * make a copy of index tuple as we have to scribble on it. + */ + new_itup = CopyIndexTuple(itup); + + /* + * mark the index tuple as moved by split, such tuples are + * skipped by scan if there is split in progress for a bucket. + */ + itupsize = new_itup->t_info & INDEX_SIZE_MASK; + new_itup->t_info &= ~INDEX_SIZE_MASK; + new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; + new_itup->t_info |= itupsize; + /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new @@ -832,17 +914,25 @@ _hash_splitbucket(Relation rel, * only partially complete, meaning the index is corrupt, * since searches may fail to find entries they should find. */ - itemsz = IndexTupleDSize(*itup); + itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { + bool retain_pin = false; + + /* + * page flags must be accessed before releasing lock on a + * page. 
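Marking a relocated tuple reuses a spare bit of the index tuple's t_info word next to the size field, which is why the code above saves the size bits, clears them, sets the flag, and puts the size back. A standalone sketch of the same bit manipulation, assuming the usual 0x1FFF size mask from itup.h and the 0x2000 flag value defined later in this patch:

#include <stdint.h>
#include <stdio.h>

#define INDEX_SIZE_MASK				0x1FFF	/* low 13 bits hold the tuple size */
#define INDEX_MOVED_BY_SPLIT_MASK	0x2000	/* spare bit claimed by this patch */

int
main(void)
{
	uint16_t	t_info = 72;	/* hypothetical tuple: size 72, no flags set */
	uint16_t	size = t_info & INDEX_SIZE_MASK;

	/* set the moved-by-split bit without disturbing the stored size */
	t_info &= ~INDEX_SIZE_MASK;
	t_info |= INDEX_MOVED_BY_SPLIT_MASK;
	t_info |= size;

	printf("size = %u, moved by split = %s\n",
		   (unsigned) (t_info & INDEX_SIZE_MASK),
		   (t_info & INDEX_MOVED_BY_SPLIT_MASK) ? "yes" : "no");
	return 0;
}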
+ */ + retain_pin = nopaque->hasho_flag & LH_BUCKET_PAGE; + /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ - nbuf = _hash_addovflpage(rel, metabuf, nbuf); + nbuf = _hash_addovflpage(rel, metabuf, nbuf, retain_pin); npage = BufferGetPage(nbuf); - /* we don't need nopaque within the loop */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } /* @@ -852,12 +942,10 @@ _hash_splitbucket(Relation rel, * Possible future improvement: accumulate all the items for * the new page and qsort them before insertion. */ - (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); + (void) _hash_pgaddtup(rel, nbuf, itemsz, new_itup); - /* - * Mark tuple for deletion from old page. - */ - deletable[ndeletable++] = ooffnum; + /* be tidy */ + pfree(new_itup); } else { @@ -870,15 +958,9 @@ _hash_splitbucket(Relation rel, oblkno = oopaque->hasho_nextblkno; - /* - * Done scanning this old page. If we moved any tuples, delete them - * from the old page. - */ - if (ndeletable > 0) - { - PageIndexMultiDelete(opage, deletable, ndeletable); - _hash_wrtbuf(rel, obuf); - } + /* retain the pin on the old primary bucket */ + if (obuf == bucket_obuf) + _hash_chgbufaccess(rel, obuf, HASH_READ, HASH_NOLOCK); else _hash_relbuf(rel, obuf); @@ -887,18 +969,42 @@ _hash_splitbucket(Relation rel, break; /* Else, advance to next old page */ - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning - * the tuples. Before quitting, call _hash_squeezebucket to ensure the - * tuples remaining in the old bucket (including the overflow pages) are - * packed as tightly as possible. The new bucket is already tight. + * the tuples. Mark the old and new buckets to indicate split is + * finished. + */ + if (!(nopaque->hasho_flag & LH_BUCKET_PAGE)) + _hash_wrtbuf(rel, nbuf); + + _hash_chgbufaccess(rel, bucket_obuf, HASH_NOLOCK, HASH_WRITE); + opage = BufferGetPage(bucket_obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * need to acquire the write lock only if current bucket is not a primary + * bucket, otherwise we already have a lock on it. */ - _hash_wrtbuf(rel, nbuf); + if (!(nopaque->hasho_flag & LH_BUCKET_PAGE)) + { + _hash_chgbufaccess(rel, bucket_nbuf, HASH_NOLOCK, HASH_WRITE); + npage = BufferGetPage(bucket_nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + } - _hash_squeezebucket(rel, obucket, start_oblkno, NULL); + /* indicate that split is finished */ + oopaque->hasho_flag &= ~LH_BUCKET_OLD_PAGE_SPLIT; + nopaque->hasho_flag &= ~LH_BUCKET_NEW_PAGE_SPLIT; + + /* + * now write the buffers, here we don't release the locks as caller is + * responsible to release locks. + */ + MarkBufferDirty(bucket_obuf); + MarkBufferDirty(bucket_nbuf); } diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 4825558..d87cf8b 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -72,7 +72,23 @@ _hash_readnext(Relation rel, BlockNumber blkno; blkno = (*opaquep)->hasho_nextblkno; - _hash_relbuf(rel, *bufp); + + /* + * Retain the pin on primary bucket page till the end of scan to ensure + * that vacuum can't delete the tuples that are moved by split to new + * bucket. 
Such tuples are required by the scans that are started on + * splitted buckets, before a new buckets split in progress flag + * (LH_BUCKET_NEW_PAGE_SPLIT) is cleared. Now the requirement to retain a + * pin on primary bucket can be relaxed for buckets that are not splitted + * by checking has_garbage flag in bucket, but still we need to retain pin + * for squeeze phase otherwise the movement of tuples could lead to change + * the ordering of scan results, so let's keep it for all buckets. + */ + if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE) + _hash_chgbufaccess(rel, *bufp, HASH_READ, HASH_NOLOCK); + else + _hash_relbuf(rel, *bufp); + *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); @@ -94,7 +110,16 @@ _hash_readprev(Relation rel, BlockNumber blkno; blkno = (*opaquep)->hasho_prevblkno; - _hash_relbuf(rel, *bufp); + + /* + * Retain the pin on primary bucket page till the end of scan. See + * comments in _hash_readnext to know the reason of retaining pin. + */ + if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE) + _hash_chgbufaccess(rel, *bufp, HASH_READ, HASH_NOLOCK); + else + _hash_relbuf(rel, *bufp); + *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); @@ -192,43 +217,85 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) metap = HashPageGetMeta(page); /* - * Loop until we get a lock on the correct target bucket. + * Conditionally get the lock on primary bucket page for search while + * holding lock on meta page. If we have to wait, then release the meta + * page lock and retry it in a hard way. */ - for (;;) - { - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); - blkno = BUCKET_TO_BLKNO(metap, bucket); + blkno = BUCKET_TO_BLKNO(metap, bucket); - /* Release metapage lock, but keep pin. */ + /* Fetch the primary bucket page for the bucket */ + buf = ReadBuffer(rel, blkno); + if (!ConditionalLockBufferShared(buf)) + { + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(buf, HASH_READ); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); + oldblkno = blkno; + retry = true; + } + else + { + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + } + if (retry) + { /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. + * Loop until we get a lock on the correct target bucket. We get the + * lock on primary bucket page and retain the pin on it during read + * operation to prevent the concurrent splits. Retaining pin on a + * primary bucket page ensures that split can't happen as it needs to + * acquire the cleanup lock on primary bucket page. Acquiring lock on + * primary bucket and rechecking if it is a target bucket is mandatory + * as otherwise a concurrent split followed by vacuum could remove + * tuples from the selected bucket which otherwise would have been + * visible. */ - if (retry) + for (;;) { - if (oldblkno == blkno) - break; - _hash_droplock(rel, oldblkno, HASH_SHARE); + /* + * Compute the target bucket number, and convert to block number. 
+ */ + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); + + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Release metapage lock, but keep pin. */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + /* + * If the previous iteration of this loop locked what is still the + * correct target bucket, we are done. Otherwise, drop any old + * lock and lock what now appears to be the correct bucket. + */ + if (retry) + { + if (oldblkno == blkno) + break; + _hash_relbuf(rel, buf); + } + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); + + /* + * Reacquire metapage lock and check that no bucket split has + * taken place while we were awaiting the bucket lock. + */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); + oldblkno = blkno; + retry = true; } - _hash_getlock(rel, blkno, HASH_SHARE); - - /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. - */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - retry = true; } /* done with the metapage */ @@ -237,14 +304,60 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; - so->hashso_bucket_blkno = blkno; - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); + page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); + so->hashso_bucket_buf = buf; + + /* + * If the bucket split is in progress, then we need to skip tuples that + * are moved from old bucket. To ensure that vacuum doesn't clean any + * tuples from old or new buckets till this scan is in progress, maintain + * a pin on both of the buckets. Here, we have to be cautious about lock + * ordering, first acquire the lock on old bucket, release the lock on old + * bucket, but not pin, then acuire the lock on new bucket and again + * re-verify whether the bucket split still is in progress. Acquiring lock + * on old bucket first ensures that the vacuum waits for this scan to + * finish. + */ + if (opaque->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) + { + BlockNumber old_blkno; + Buffer old_buf; + + old_blkno = _hash_get_oldblk(rel, opaque); + + /* + * release the lock on new bucket and re-acquire it after acquiring + * the lock on old bucket. + */ + _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); + + old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + + /* + * remember the old bucket buffer so as to use it later for scanning. + */ + so->hashso_old_bucket_buf = old_buf; + _hash_chgbufaccess(rel, old_buf, HASH_READ, HASH_NOLOCK); + + _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_READ); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + + if (opaque->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) + so->hashso_skip_moved_tuples = true; + else + { + _hash_dropbuf(rel, so->hashso_old_bucket_buf); + so->hashso_old_bucket_buf = InvalidBuffer; + } + } + /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { @@ -273,6 +386,13 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) * false. Else, return true and set the hashso_curpos for the * scan to the right thing. 
* + * Here we also scan the old bucket if the split for current bucket + * was in progress at the start of scan. The basic idea is that + * skip the tuples that are moved by split while scanning current + * bucket and then scan the old bucket to cover all such tuples. This + * is done to ensure that we don't miss any tuples in the current scan + * when split was in progress. + * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. @@ -338,6 +458,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Assert(offnum >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_skip_moved_tuples && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) + { + offnum = OffsetNumberNext(offnum); /* move forward */ + continue; + } + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } @@ -353,9 +486,41 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) } else { - /* end of bucket */ - itup = NULL; - break; /* exit for-loop */ + /* + * end of bucket, scan old bucket if there was a split + * in progress at the start of scan. + */ + if (so->hashso_skip_moved_tuples) + { + buf = so->hashso_old_bucket_buf; + + /* + * old buket buffer must be valid as we acquire + * the pin on it before the start of scan and + * retain it till end of scan. + */ + Assert(BufferIsValid(buf)); + + _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_READ); + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + offnum = _hash_binsearch(page, so->hashso_sk_hash); + + /* + * setting hashso_skip_moved_tuples to false + * ensures that we don't check for tuples that are + * moved by split in old bucket and it also + * ensures that we won't retry to scan the old + * bucket once the scan for same is finished. + */ + so->hashso_skip_moved_tuples = false; + } + else + { + itup = NULL; + break; /* exit for-loop */ + } } } break; @@ -379,6 +544,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Assert(offnum <= maxoff); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_skip_moved_tuples && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) + { + offnum = OffsetNumberPrev(offnum); /* move back */ + continue; + } + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } @@ -394,9 +572,41 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) } else { - /* end of bucket */ - itup = NULL; - break; /* exit for-loop */ + /* + * end of bucket, scan old bucket if there was a split + * in progress at the start of scan. + */ + if (so->hashso_skip_moved_tuples) + { + buf = so->hashso_old_bucket_buf; + + /* + * old buket buffer must be valid as we acquire + * the pin on it before the start of scan and + * retain it till end of scan. 
+ */ + Assert(BufferIsValid(buf)); + + _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_READ); + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + offnum = _hash_binsearch(page, so->hashso_sk_hash); + + /* + * setting hashso_skip_moved_tuples to false + * ensures that we don't check for tuples that are + * moved by split in old bucket and it also + * ensures that we won't retry to scan the old + * bucket once the scan for same is finished. + */ + so->hashso_skip_moved_tuples = false; + } + else + { + itup = NULL; + break; /* exit for-loop */ + } } } break; diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 456954b..bdbeb84 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -147,6 +147,23 @@ _hash_log2(uint32 num) } /* + * _hash_msb-- returns most significant bit position. + */ +static uint32 +_hash_msb(uint32 num) +{ + uint32 i = 0; + + while (num) + { + num = num >> 1; + ++i; + } + + return i - 1; +} + +/* * _hash_checkpage -- sanity checks on the format of all hash pages * * If flags is not zero, it is a bitwise OR of the acceptable values of @@ -342,3 +359,123 @@ _hash_binsearch_last(Page page, uint32 hash_value) return lower; } + +/* + * _hash_get_oldblk() -- get the block number from which current bucket + * is being splitted. + */ +BlockNumber +_hash_get_oldblk(Relation rel, HashPageOpaque opaque) +{ + Bucket curr_bucket; + Bucket old_bucket; + uint32 mask; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + /* + * To get the old bucket from the current bucket, we need a mask to modulo + * into lower half of table. This mask is stored in meta page as + * hashm_lowmask, but here we can't rely on the same, because we need a + * value of lowmask that was prevalent at the time when bucket split was + * started. Masking the most significant bit of new bucket would give us + * old bucket. + */ + curr_bucket = opaque->hasho_bucket; + mask = (((uint32) 1) << _hash_msb(curr_bucket)) - 1; + old_bucket = curr_bucket & mask; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + blkno = BUCKET_TO_BLKNO(metap, old_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newblk() -- get the block number of bucket for the new bucket + * that will be generated after split from current bucket. + * + * This is used to find the new bucket from old bucket based on current table + * half. It is mainly required to finsh the incomplete splits where we are + * sure that not more than one bucket could have split in progress from old + * bucket. + */ +BlockNumber +_hash_get_newblk(Relation rel, HashPageOpaque opaque) +{ + Bucket curr_bucket; + Bucket new_bucket; + uint32 lowmask; + uint32 mask; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + curr_bucket = opaque->hasho_bucket; + + /* + * new bucket can be obtained by OR'ing old bucket with most significant + * bit of current table half. There could be multiple buckets that could + * have splitted from curent bucket. We need the first such bucket that + * exists based on current table half. 
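_hash_get_oldblk() recovers the bucket a split started from using only the new bucket's number: clearing the most significant set bit of new bucket N yields the bucket N was split from. A worked standalone sketch, mirroring _hash_msb() above, with arbitrary bucket numbers:

#include <stdint.h>
#include <stdio.h>

/* position of the most significant set bit, as in _hash_msb() */
static uint32_t
msb(uint32_t num)
{
	uint32_t	i = 0;

	while (num)
	{
		num >>= 1;
		++i;
	}
	return i - 1;				/* callers never pass 0: bucket 0 is never "new" */
}

int
main(void)
{
	/* arbitrary example buckets */
	uint32_t	new_buckets[] = {5, 6, 11};

	for (int i = 0; i < 3; i++)
	{
		uint32_t	curr = new_buckets[i];
		uint32_t	mask = (((uint32_t) 1) << msb(curr)) - 1;

		/* bucket 5 -> 1, bucket 6 -> 2, bucket 11 -> 3 */
		printf("bucket %u was split from bucket %u\n",
			   (unsigned) curr, (unsigned) (curr & mask));
	}
	return 0;
}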
+ */ + lowmask = metap->hashm_lowmask; + + for (;;) + { + mask = lowmask + 1; + new_bucket = curr_bucket | mask; + if (new_bucket > metap->hashm_maxbucket) + { + lowmask = lowmask >> 1; + continue; + } + blkno = BUCKET_TO_BLKNO(metap, new_bucket); + break; + } + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newbucket() -- get the new bucket that will be generated after + * a split of the current bucket. + * + * This is used to find the new bucket from the old bucket. The new bucket can + * be obtained by OR'ing the old bucket with the most significant bit of the + * table half for the lowmask passed to this function. There could be multiple + * buckets that could have split from the current bucket. We need the first + * such bucket that exists. The caller must ensure that no more than one split + * has happened from the old bucket. + */ +Bucket +_hash_get_newbucket(Relation rel, Bucket curr_bucket, + uint32 lowmask, uint32 maxbucket) +{ + Bucket new_bucket; + uint32 mask; + + for (;;) + { + mask = lowmask + 1; + new_bucket = curr_bucket | mask; + if (new_bucket > maxbucket) + { + lowmask = lowmask >> 1; + continue; + } + break; + } + + return new_bucket; +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index b7ca9bf..00129ed 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3567,6 +3567,26 @@ ConditionalLockBuffer(Buffer buffer) } /* + * Acquire the content_lock for the buffer, but only if we don't have to wait. + * + * This assumes the caller wants BUFFER_LOCK_SHARED mode. + */ +bool +ConditionalLockBufferShared(Buffer buffer) +{ + BufferDesc *buf; + + Assert(BufferIsValid(buffer)); + if (BufferIsLocal(buffer)) + return true; /* act as though we got it */ + + buf = GetBufferDescriptor(buffer - 1); + + return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED); +} + +/* * LockBufferForCleanup - lock a buffer in preparation for deleting items * * Items may be deleted from a disk page only when the caller (a) holds an diff --git a/src/include/access/hash.h b/src/include/access/hash.h index fa3f9b6..3a64c9d 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/lockdefs.h" +#include "utils/hsearch.h" #include "utils/relcache.h" /* @@ -52,6 +53,9 @@ typedef uint32 Bucket; #define LH_BUCKET_PAGE (1 << 1) #define LH_BITMAP_PAGE (1 << 2) #define LH_META_PAGE (1 << 3) +#define LH_BUCKET_NEW_PAGE_SPLIT (1 << 4) +#define LH_BUCKET_OLD_PAGE_SPLIT (1 << 5) +#define LH_BUCKET_PAGE_HAS_GARBAGE (1 << 6) typedef struct HashPageOpaqueData { @@ -64,6 +68,12 @@ typedef struct HashPageOpaqueData } typedef HashPageOpaqueData *HashPageOpaque; +#define H_HAS_GARBAGE(opaque) ((opaque)->hasho_flag & LH_BUCKET_PAGE_HAS_GARBAGE) +#define H_OLD_INCOMPLETE_SPLIT(opaque) ((opaque)->hasho_flag & LH_BUCKET_OLD_PAGE_SPLIT) +#define H_NEW_INCOMPLETE_SPLIT(opaque) ((opaque)->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) +#define H_INCOMPLETE_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEW_PAGE_SPLIT) || \ + ((opaque)->hasho_flag & LH_BUCKET_OLD_PAGE_SPLIT)) + /* * The page ID is for the convenience of pg_filedump and similar utilities, * which otherwise would have a hard time telling pages of different index @@ -88,12 +98,6 @@ typedef struct HashScanOpaqueData bool hashso_bucket_valid; /* - * If we have a share lock on the bucket, we record it here. When - * hashso_bucket_blkno is zero, we have no such lock.
- */ - BlockNumber hashso_bucket_blkno; - - /* * We also want to remember which buffer we're currently examining in the * scan. We keep the buffer pinned (but not locked) across hashgettuple * calls, in order to avoid doing a ReadBuffer() for every tuple in the @@ -101,11 +105,23 @@ typedef struct HashScanOpaqueData */ Buffer hashso_curbuf; + /* remember the buffer associated with primary bucket */ + Buffer hashso_bucket_buf; + + /* + * remember the buffer associated with old primary bucket which is + * required during the scan of the bucket for which split is in progress. + */ + Buffer hashso_old_bucket_buf; + /* Current position of the scan, as an index TID */ ItemPointerData hashso_curpos; /* Current position of the scan, as a heap TID */ ItemPointerData hashso_heappos; + + /* Whether scan needs to skip tuples that are moved by split */ + bool hashso_skip_moved_tuples; } HashScanOpaqueData; typedef HashScanOpaqueData *HashScanOpaque; @@ -176,6 +192,8 @@ typedef HashMetaPageData *HashMetaPage; sizeof(ItemIdData) - \ MAXALIGN(sizeof(HashPageOpaqueData))) +#define INDEX_MOVED_BY_SPLIT_MASK 0x2000 + #define HASH_MIN_FILLFACTOR 10 #define HASH_DEFAULT_FILLFACTOR 75 @@ -224,9 +242,6 @@ typedef HashMetaPageData *HashMetaPage; #define HASH_WRITE BUFFER_LOCK_EXCLUSIVE #define HASH_NOLOCK (-1) -#define HASH_SHARE ShareLock -#define HASH_EXCLUSIVE ExclusiveLock - /* * Strategy number. There's only one valid strategy for hashing: equality. */ @@ -299,19 +314,17 @@ extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup); /* hashovfl.c */ -extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); +extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, - BufferAccessStrategy bstrategy); + BlockNumber bucket_blkno, BufferAccessStrategy bstrategy); extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum); extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, + Buffer bucket_buf, BufferAccessStrategy bstrategy); /* hashpage.c */ -extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access); -extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access); -extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access); extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags); extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); @@ -329,6 +342,10 @@ extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum); extern void _hash_pageinit(Page page, Size size); extern void _hash_expandtable(Relation rel, Buffer metabuf); +extern void _hash_splitbucket_guts(Relation rel, Buffer metabuf, + Bucket obucket, Bucket nbucket, Buffer obuf, + Buffer nbuf, HTAB *htab, uint32 maxbucket, + uint32 highmask, uint32 lowmask); /* hashscan.c */ extern void _hash_regscan(IndexScanDesc scan); @@ -363,10 +380,20 @@ extern IndexTuple _hash_form_tuple(Relation index, Datum *values, bool *isnull); extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); +extern BlockNumber _hash_get_oldblk(Relation rel, HashPageOpaque opaque); +extern BlockNumber _hash_get_newblk(Relation rel, HashPageOpaque opaque); +extern Bucket _hash_get_newbucket(Relation rel, Bucket curr_bucket, + uint32 lowmask, uint32 maxbucket); /* hash.c */ 
extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char *hash_identify(uint8 info); +extern void hashbucketcleanup(Relation rel, Buffer bucket_buf, + BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool bucket_has_garbage, bool delay, + IndexBulkDeleteCallback callback, void *callback_state); #endif /* HASH_H */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3d5dea7..4b318a8 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -226,6 +226,7 @@ extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); +extern bool ConditionalLockBufferShared(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void);
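The bucket-mapping arithmetic that the new hashutil.c functions rely on is easy to check in isolation: clearing the most significant bit of a bucket number yields the bucket it was split from, and OR'ing the old bucket with successive table-half masks (lowmask + 1, halving lowmask whenever the candidate exceeds maxbucket) yields the first existing bucket split from it. The following standalone sketch is not part of the patch; the helper names msb, old_bucket and new_bucket are illustrative only, but the arithmetic mirrors _hash_msb(), _hash_get_oldblk() and _hash_get_newbucket() as shown in the hunks above.

#include <stdio.h>
#include <stdint.h>

/* Position of the most significant set bit; same idea as _hash_msb(). Assumes num > 0. */
static uint32_t
msb(uint32_t num)
{
    uint32_t i = 0;

    while (num)
    {
        num >>= 1;
        ++i;
    }
    return i - 1;
}

/* Old bucket = new bucket with its most significant bit cleared. */
static uint32_t
old_bucket(uint32_t nbucket)
{
    uint32_t mask = (((uint32_t) 1) << msb(nbucket)) - 1;

    return nbucket & mask;
}

/*
 * First candidate new bucket that actually exists: OR in lowmask + 1,
 * halving lowmask until the candidate does not exceed maxbucket.
 */
static uint32_t
new_bucket(uint32_t cur_bucket, uint32_t lowmask, uint32_t maxbucket)
{
    for (;;)
    {
        uint32_t mask = lowmask + 1;
        uint32_t candidate = cur_bucket | mask;

        if (candidate > maxbucket)
        {
            lowmask >>= 1;
            continue;
        }
        return candidate;
    }
}

int
main(void)
{
    /* Bucket 5 (binary 101) was created by splitting bucket 1 (binary 001). */
    printf("old bucket of 5 -> %u\n", old_bucket(5));        /* prints 1 */

    /* With lowmask 3 and maxbucket 5, bucket 1 splits into bucket 5. */
    printf("new bucket for 1 -> %u\n", new_bucket(1, 3, 5)); /* prints 5 */

    return 0;
}

Because the split that created a bucket may predate the current hashm_lowmask, the old-bucket direction derives its mask from the bucket number itself rather than from the metapage, as the comment in _hash_get_oldblk() explains.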