From 564259f35a3d30df8cc1848b5afe06448d028494 Mon Sep 17 00:00:00 2001
From: Amit Kapila
Date: Mon, 13 Feb 2017 16:19:06 +0530
Subject: [PATCH 3/6] Restructure _hash_addovflpage so that the operation can
 be performed atomically and can be WAL-logged.

---
 src/backend/access/hash/hashovfl.c | 232 +++++++++++++++++++++----------------
 src/backend/access/hash/hashpage.c |   2 +-
 2 files changed, 135 insertions(+), 99 deletions(-)

diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index b3cc7b9..051410b 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -22,7 +22,6 @@
 #include "utils/rel.h"
 
 
-static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
 static uint32 _hash_firstfreebit(uint32 map);
 
 
@@ -96,7 +95,9 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
  * dropped before exiting (we assume the caller is not interested in 'buf'
  * anymore) if not asked to retain.  The pin will be retained only for the
  * primary bucket.  The returned overflow page will be pinned and
- * write-locked; it is guaranteed to be empty.
+ * write-locked; it is not guaranteed to be empty, as we release and
+ * reacquire the lock on it, so the caller must ensure that the page has
+ * the required free space.
  *
  * The caller must hold a pin, but no lock, on the metapage buffer.
  * That buffer is returned in the same state.
@@ -114,13 +115,37 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 	Page		ovflpage;
 	HashPageOpaque pageopaque;
 	HashPageOpaque ovflopaque;
-
-	/* allocate and lock an empty overflow page */
-	ovflbuf = _hash_getovflpage(rel, metabuf);
+	HashMetaPage metap;
+	Buffer		mapbuf = InvalidBuffer;
+	Buffer		newmapbuf = InvalidBuffer;
+	BlockNumber blkno;
+	uint32		orig_firstfree;
+	uint32		splitnum;
+	uint32	   *freep = NULL;
+	uint32		max_ovflpg;
+	uint32		bit;
+	uint32		bitmap_page_bit;
+	uint32		first_page;
+	uint32		last_bit;
+	uint32		last_page;
+	uint32		i,
+				j;
+	bool		page_found = false;
 
 	/*
-	 * Write-lock the tail page.  It is okay to hold two buffer locks here
-	 * since there cannot be anyone else contending for access to ovflbuf.
+	 * Write-lock the tail page.  Here, we need to maintain a strict locking
+	 * order: first acquire the lock on the tail page of the bucket, then the
+	 * lock on the meta page to find and lock a bitmap page (the meta page
+	 * lock is released once the bitmap page is found), and finally the lock
+	 * on the new overflow buffer.  We need this locking order to avoid
+	 * deadlock with backends that are doing inserts.
+	 *
+	 * Note: We could have avoided locking many buffers here if we made two
+	 * WAL records for acquiring an overflow page (one to allocate an
+	 * overflow page and another to add it to the overflow bucket chain).
+	 * However, doing so can leak an overflow page if the system crashes
+	 * after allocation.  Needless to say, a single record is also better
+	 * from a performance point of view.
 	 */
 	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -154,60 +179,6 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
 	}
 
-	/* now that we have correct backlink, initialize new overflow page */
-	ovflpage = BufferGetPage(ovflbuf);
-	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
-	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
-	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
-	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
-	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
-	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
-
-	MarkBufferDirty(ovflbuf);
-
-	/* logically chain overflow page to previous page */
-	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
-	MarkBufferDirty(buf);
-	if (retain_pin)
-	{
-		/* pin will be retained only for the primary bucket page */
-		Assert(pageopaque->hasho_flag & LH_BUCKET_PAGE);
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	}
-	else
-		_hash_relbuf(rel, buf);
-
-	return ovflbuf;
-}
-
-/*
- *	_hash_getovflpage()
- *
- *	Find an available overflow page and return it.  The returned buffer
- *	is pinned and write-locked, and has had _hash_pageinit() applied,
- *	but it is caller's responsibility to fill the special space.
- *
- * The caller must hold a pin, but no lock, on the metapage buffer.
- * That buffer is left in the same state at exit.
- */
-static Buffer
-_hash_getovflpage(Relation rel, Buffer metabuf)
-{
-	HashMetaPage metap;
-	Buffer		mapbuf = 0;
-	Buffer		newbuf;
-	BlockNumber blkno;
-	uint32		orig_firstfree;
-	uint32		splitnum;
-	uint32	   *freep = NULL;
-	uint32		max_ovflpg;
-	uint32		bit;
-	uint32		first_page;
-	uint32		last_bit;
-	uint32		last_page;
-	uint32		i,
-				j;
-
 	/* Get exclusive lock on the meta page */
 	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -256,11 +227,31 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
 	{
 		if (freep[j] != ALL_SET)
+		{
+			page_found = true;
+
+			/* Reacquire exclusive lock on the meta page */
+			LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+			/* convert bit to bit number within page */
+			bit += _hash_firstfreebit(freep[j]);
+			bitmap_page_bit = bit;
+
+			/* convert bit to absolute bit number */
+			bit += (i << BMPG_SHIFT(metap));
+			/* Calculate address of the recycled overflow page */
+			blkno = bitno_to_blkno(metap, bit);
+
+			/* Fetch and init the recycled page */
+			ovflbuf = _hash_getinitbuf(rel, blkno);
+
 			goto found;
+		}
 	}
 
 	/* No free space here, try to advance to next map page */
 	_hash_relbuf(rel, mapbuf);
+	mapbuf = InvalidBuffer;
 	i++;
 	j = 0;					/* scan from start of next map page */
 	bit = 0;
@@ -284,8 +275,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 		 * convenient to pre-mark them as "in use" too.
 		 */
 		bit = metap->hashm_spares[splitnum];
-		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
-		metap->hashm_spares[splitnum]++;
+		newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
+
+		/* add the new bitmap page to the metapage's list of bitmaps */
+		/* metapage already has a write lock */
+		if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("out of overflow pages in hash index \"%s\"",
+							RelationGetRelationName(rel))));
 	}
 	else
 	{
@@ -296,7 +294,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	}
 
 	/* Calculate address of the new overflow page */
-	bit = metap->hashm_spares[splitnum];
+	bit = BufferIsValid(newmapbuf) ?
+		metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
 	blkno = bitno_to_blkno(metap, bit);
 
 	/*
@@ -304,41 +303,49 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	 * relation length stays in sync with ours.  XXX It's annoying to do this
 	 * with metapage write lock held; would be better to use a lock that
 	 * doesn't block incoming searches.
+	 *
+	 * It is okay to hold two buffer locks here (one on the tail page of the
+	 * bucket and the other on the new overflow page) since there cannot be
+	 * anyone else contending for access to ovflbuf.
 	 */
-	newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
+	ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
 
-	metap->hashm_spares[splitnum]++;
+found:
 
 	/*
-	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
-	 * changing it if someone moved it while we were searching bitmap pages.
+	 * Do the update.
 	 */
-	if (metap->hashm_firstfree == orig_firstfree)
-		metap->hashm_firstfree = bit + 1;
-
-	/* Write updated metapage and release lock, but not pin */
-	MarkBufferDirty(metabuf);
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-	return newbuf;
+	START_CRIT_SECTION();
 
-found:
-	/* convert bit to bit number within page */
-	bit += _hash_firstfreebit(freep[j]);
+	if (page_found)
+	{
+		Assert(BufferIsValid(mapbuf));
 
-	/* mark page "in use" in the bitmap */
-	SETBIT(freep, bit);
-	MarkBufferDirty(mapbuf);
-	_hash_relbuf(rel, mapbuf);
+		/* mark page "in use" in the bitmap */
+		SETBIT(freep, bitmap_page_bit);
+		MarkBufferDirty(mapbuf);
+	}
+	else
+	{
+		/* update the count to indicate a new overflow page is added */
+		metap->hashm_spares[splitnum]++;
 
-	/* Reacquire exclusive lock on the meta page */
-	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+		if (BufferIsValid(newmapbuf))
+		{
+			_hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
+			MarkBufferDirty(newmapbuf);
 
-	/* convert bit to absolute bit number */
-	bit += (i << BMPG_SHIFT(metap));
+			metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
+			metap->hashm_nmaps++;
+			metap->hashm_spares[splitnum]++;
+			MarkBufferDirty(metabuf);
+		}
 
-	/* Calculate address of the recycled overflow page */
-	blkno = bitno_to_blkno(metap, bit);
+		/*
+		 * For a new overflow page, we don't need to explicitly set the bit
+		 * in the bitmap page, as by default it will be set to "in use".
+		 */
+	}
 
 	/*
 	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
@@ -347,19 +354,48 @@ found:
 	if (metap->hashm_firstfree == orig_firstfree)
 	{
 		metap->hashm_firstfree = bit + 1;
-
-		/* Write updated metapage and release lock, but not pin */
 		MarkBufferDirty(metabuf);
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
 	}
+
+	/* now that we have correct backlink, initialize new overflow page */
+	ovflpage = BufferGetPage(ovflbuf);
+	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
+
+	MarkBufferDirty(ovflbuf);
+
+	/* logically chain overflow page to previous page */
+	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
+
+	MarkBufferDirty(buf);
+
+	END_CRIT_SECTION();
+
+	if (retain_pin)
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	else
-	{
-		/* We didn't change the metapage, so no need to write */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	}
+		_hash_relbuf(rel, buf);
+
+	if (BufferIsValid(mapbuf))
+		_hash_relbuf(rel, mapbuf);
 
-	/* Fetch, init, and return the recycled page */
-	return _hash_getinitbuf(rel, blkno);
+	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+	if (BufferIsValid(newmapbuf))
+		_hash_relbuf(rel, newmapbuf);
+
+	/*
+	 * We need to release and reacquire the lock on the overflow buffer to
+	 * ensure that a standby doesn't see an intermediate state of it.
+	 */
+	LockBuffer(ovflbuf, BUFFER_LOCK_UNLOCK);
+	LockBuffer(ovflbuf, BUFFER_LOCK_EXCLUSIVE);
+
+	return ovflbuf;
 }
 
 /*
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 00f3ea8..ea72892 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -983,7 +983,7 @@ _hash_splitbucket_guts(Relation rel,
 			itemsz = IndexTupleDSize(*new_itup);
 			itemsz = MAXALIGN(itemsz);
 
-			if (PageGetFreeSpace(npage) < itemsz)
+			while (PageGetFreeSpace(npage) < itemsz)
 			{
 				/* write out nbuf and drop lock, but keep pin */
 				MarkBufferDirty(nbuf);
-- 
1.8.4.msysgit.0
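
The bitmap convention this patch leans on is easy to lose in the diff: a clear bit marks a free overflow page, a set bit marks one "in use", and a brand-new page is pre-marked "in use", which is why no SETBIT is needed in the page_found == false path. The standalone sketch below illustrates that convention. It is illustrative C modeled on _hash_firstfreebit() and the freep[j] != ALL_SET scan above, not PostgreSQL code; the 32-bit word size, the sample bitmap, and the names first_free_bit/freep/nwords are simplifying assumptions.

#include <stdint.h>
#include <stdio.h>

#define ALL_SET			((uint32_t) ~0)
#define BITS_PER_MAP	32		/* bits per bitmap word; an assumption here */

/*
 * Return the number of the lowest clear (free) bit in "map".
 * Mirrors what _hash_firstfreebit() does; caller must not pass ALL_SET.
 */
static uint32_t
first_free_bit(uint32_t map)
{
	uint32_t	mask = 0x1;
	uint32_t	i;

	for (i = 0; i < BITS_PER_MAP; i++)
	{
		if (!(mask & map))
			return i;
		mask <<= 1;
	}
	return BITS_PER_MAP;		/* unreachable for valid input */
}

int
main(void)
{
	/* A tiny "bitmap page": words 0 and 1 full, word 2 has bit 5 free. */
	uint32_t	freep[] = {ALL_SET, ALL_SET, 0xFFFFFFDFu};
	uint32_t	nwords = sizeof(freep) / sizeof(freep[0]);
	uint32_t	j;

	for (j = 0; j < nwords; j++)
	{
		if (freep[j] != ALL_SET)	/* the same test the patch uses */
		{
			/* bit number within the page, as in _hash_addovflpage() */
			uint32_t	bit = j * BITS_PER_MAP + first_free_bit(freep[j]);

			/* SETBIT equivalent: mark the recycled page "in use" again */
			freep[j] |= (uint32_t) 1 << (bit % BITS_PER_MAP);
			printf("recycling overflow page at bitmap bit %u\n", bit);
			return 0;
		}
	}
	printf("no free bit: a brand-new page would be allocated instead,\n"
		   "already marked \"in use\", so no SETBIT is needed for it\n");
	return 0;
}

With the sample bitmap above, the sketch prints "recycling overflow page at bitmap bit 69" and then re-marks that bit, mirroring the SETBIT(freep, bitmap_page_bit) the patch performs inside its critical section.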