diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 2d0f8f4b79..0aefb1a5fb 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -550,21 +550,21 @@ right to recover from a "concurrent" page split or page deletion, just like on the primary.) However, there are a couple of differences in how pages are locked by -replay/the startup process as compared to the original write operation -on the primary. The exceptions involve page splits and page deletions. -The first phase and second phase of a page split are processed -independently during replay, since they are independent atomic actions. -We do not attempt to recreate the coupling of parent and child page -write locks that took place on the primary. This is safe because readers -never care about the incomplete split flag anyway. Holding on to an -extra write lock on the primary is only necessary so that a second -writer cannot observe the incomplete split flag before the first writer -finishes the split. If we let concurrent writers on the primary observe -an incomplete split flag on the same page, each writer would attempt to -complete the unfinished split, corrupting the parent page. (Similarly, -replay of page deletion records does not hold a write lock on the target -leaf page throughout; only the primary needs to block out concurrent -writers that insert on to the page being deleted.) +replay/the startup process as compared to the original write operation on +the primary. This is mainly about the page splits. The first phase and +second phase of a page split are processed independently during replay, +since they are independent atomic actions. We do not attempt to recreate +the coupling of parent and child page write locks that took place on the +primary. This is safe because readers never care about the incomplete +split flag anyway. 
Holding on to an extra write lock on the primary is +only necessary so that a second writer cannot observe the incomplete split +flag before the first writer finishes the split. If we let concurrent +writers on the primary observe an incomplete split flag on the same page, +each writer would attempt to complete the unfinished split, corrupting the +parent page. (However, page deletion is handled differently during replay: +the pages involved are locked in the same order as on the primary. +Although only the primary needs to block out concurrent writers that +insert on to the page being deleted, the lock coupling order is still +required on the standby to avoid a race condition against index scans +moving in a backward direction.) During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. We do this because the oldest xmin diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 99d0914e72..679fc5866e 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -772,7 +772,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; - Buffer buffer; + Buffer leafbuf, lbuff = InvalidBuffer, rbuff, buff; Page page; BTPageOpaque pageop; @@ -780,47 +780,29 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) rightsib = xlrec->rightsib; /* - * In normal operation, we would lock all the pages this WAL record - * touches before changing any of them. In WAL replay, it should be okay - * to lock just one page at a time, since no concurrent index updates can - * be happening, and readers should not care whether they arrive at the - * target page or not (since it's surely empty). + * We must lock the pages we need to modify in left-to-right order; + * otherwise we could race against a concurrent _bt_walk_left. 
*/ - /* Fix left-link of right sibling */ - if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) - { - page = (Page) BufferGetPage(buffer); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - pageop->btpo_prev = leftsib; - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); - /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { - if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &lbuff) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(lbuff); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + MarkBufferDirty(lbuff); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); } /* Rewrite target page as empty deleted page */ - buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + buff = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buff); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(buff)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; @@ -830,8 +812,25 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(buff); + + /* Fix left-link of right sibling */ + if (XLogReadBufferForRedo(record, 2, &rbuff) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(rbuff); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = leftsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(rbuff); + } + + /* Release all buffers */ + if (BufferIsValid(lbuff)) + UnlockReleaseBuffer(lbuff); + UnlockReleaseBuffer(buff); + if (BufferIsValid(rbuff)) + UnlockReleaseBuffer(rbuff); /* * If we deleted a parent of the targeted leaf page, instead of 
the leaf @@ -846,10 +845,10 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) */ IndexTupleData trunctuple; - buffer = XLogInitBufferForRedo(record, 3); - page = (Page) BufferGetPage(buffer); + leafbuf = XLogInitBufferForRedo(record, 3); + page = (Page) BufferGetPage(leafbuf); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(leafbuf)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; @@ -868,8 +867,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(leafbuf); + UnlockReleaseBuffer(leafbuf); } /* Update metapage if needed */