diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 5f1513b..eaefd90 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -371,8 +371,8 @@ _hash_firstfreebit(uint32 map) * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * - * Returns the block number of the page that followed the given page - * in the bucket, or InvalidBlockNumber if no following page. + * Returns the buffer that followed the given wbuf in the bucket, or + * InvalidBuffer if no following page. * * NB: caller must not hold lock on metapage, nor on page, that's next to * ovflbuf in the bucket chain. We don't acquire the lock on page that's @@ -380,7 +380,7 @@ _hash_firstfreebit(uint32 map) * has a lock on same. This function releases the lock on wbuf and caller * is responsible for releasing the pin on same. */ -BlockNumber +Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, bool wbuf_dirty, BufferAccessStrategy bstrategy) { @@ -388,14 +388,17 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, Buffer metabuf; Buffer mapbuf; Buffer prevbuf = InvalidBuffer; + Buffer next_wbuf = InvalidBuffer; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; BlockNumber writeblkno; HashPageOpaque ovflopaque; + HashPageOpaque wopaque; Page ovflpage; Page mappage; + Page wpage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, @@ -458,13 +461,6 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, wbuf_dirty = true; } } - - /* write and unlock the write buffer */ - if (wbuf_dirty) - _hash_chgbufaccess(rel, wbuf, HASH_WRITE, HASH_NOLOCK); - else - _hash_chgbufaccess(rel, wbuf, HASH_READ, HASH_NOLOCK); - if (BlockNumberIsValid(nextblkno)) { Buffer nextbuf = _hash_getbuf_with_strategy(rel, @@ -481,6 +477,38 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, _hash_relbuf(rel, nextbuf); } + /* + * To maintain lock chaining as described atop hashbucketcleanup, we need + * to lock next bucket buffer in chain before releasing current. This is + * required only if the next overflow page from which to read is not same + * as page to which we need to write. + * + * XXX Here, we are moving to next overflow page for writing without + * ensuring if the previous write page is full. This is annoying, but + * should not hurt much in practice as that space will anyway be consumed + * by future inserts. + */ + if (prevblkno != writeblkno) + { + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + Assert(wopaque->hasho_bucket == bucket); + writeblkno = wopaque->hasho_nextblkno; + Assert(BlockNumberIsValid(writeblkno)); + + next_wbuf = _hash_getbuf_with_strategy(rel, + writeblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + } + + /* write and unlock the write buffer */ + if (wbuf_dirty) + _hash_chgbufaccess(rel, wbuf, HASH_WRITE, HASH_NOLOCK); + else + _hash_chgbufaccess(rel, wbuf, HASH_READ, HASH_NOLOCK); + /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ @@ -520,7 +548,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, } _hash_relbuf(rel, metabuf); - return nextblkno; + return next_wbuf; } @@ -686,6 +714,7 @@ _hash_squeezebucket(Relation rel, OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; + Buffer next_wbuf = InvalidBuffer; /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); @@ -793,19 +822,29 @@ _hash_squeezebucket(Relation rel, Assert(BlockNumberIsValid(rblkno)); /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf, wbuf, wbuf_dirty, bstrategy); + next_wbuf = _hash_freeovflpage(rel, rbuf, wbuf, wbuf_dirty, bstrategy); + + /* retain the pin on primary bucket page till end of bucket scan */ + if (wblkno != bucket_blkno) + _hash_dropbuf(rel, wbuf); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) + return; + + /* are we freeing the page adjacent to next_wbuf? */ + if (BufferIsValid(next_wbuf) && + rblkno == BufferGetBlockNumber(next_wbuf)) { - /* retain the pin on primary bucket page till end of bucket scan */ - if (wblkno != bucket_blkno) - _hash_dropbuf(rel, wbuf); + _hash_relbuf(rel, next_wbuf); return; } - /* lock the overflow page being written, then get the previous one */ - _hash_chgbufaccess(rel, wbuf, HASH_NOLOCK, HASH_WRITE); + wbuf = next_wbuf; + wblkno = BufferGetBlockNumber(wbuf); + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + Assert(wopaque->hasho_bucket == bucket); rbuf = _hash_getbuf_with_strategy(rel, rblkno, diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9ce44a7..5691ee3 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -313,7 +313,7 @@ extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, /* hashovfl.c */ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); -extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, +extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, bool wbuf_dirty, BufferAccessStrategy bstrategy); extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum);