diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 2f85867..50f2dd5 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -423,23 +423,18 @@ btree_xlog_vacuum(XLogReaderState *record) for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* - * We use RBM_NORMAL_NO_LOG mode because it's not an error - * condition to see all-zero pages. The original btvacuumpage - * scan would have skipped over all-zero pages, noting them in FSM - * but not bothering to initialize them just yet; so we mustn't - * throw an error here. (We could skip acquiring the cleanup lock - * if PageIsNew, but it's probably not worth the cycles to test.) - * - * XXX we don't actually need to read the block, we just need to - * confirm it is unpinned. If we had a special call into the - * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. + * We use RBM_ZERO_NO_BM_VALID mode because we don't actually + * need to read the block, we just need to confirm it is unpinned. + * In this mode the buffer is returned if it is already in shared + * buffers. If not, zeroed page is returned without BM_VALID flag + * so that no one could pin it. In both cases the returned buffer + * is pinned and with grabbed buffer content lock. So that we + * call UnlockReleaseBuffer on it. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, - RBM_NORMAL_NO_LOG); + RBM_ZERO_NO_BM_VALID); if (BufferIsValid(buffer)) { - LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index fa98b82..3c53c0e 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -459,8 +459,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, { if (buffer != InvalidBuffer) { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || + mode == RBM_ZERO_NO_BM_VALID) + { LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, @@ -470,8 +473,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, /* Handle the corner case that P_NEW returns non-consecutive pages */ if (BufferGetBlockNumber(buffer) != blkno) { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || + mode == RBM_ZERO_NO_BM_VALID) + { LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, mode, NULL); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a68eae8..3c300c3 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -527,6 +527,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires * a cleanup-strength lock on the page. * + * RBM_ZERO_NO_BM_VALID is the same as RBM_ZERO_AND_LOCK, but does not set + * BM_VALID bit before returning buffer. This is done to ensure that no one + * can pin the buffer without actually reading the buffer contents in. It is + * useful when the caller does not need to read the page contents (since this + * saves I/O) but also is not going to change page contents. + * * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. * * If strategy is not NULL, a nondefault buffer access strategy is used. @@ -674,7 +680,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ if (!isLocalBuf) { - if (mode == RBM_ZERO_AND_LOCK) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_NO_BM_VALID) LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); @@ -761,8 +767,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || + mode == RBM_ZERO_NO_BM_VALID) + { MemSet((char *) bufBlock, 0, BLCKSZ); + } else { instr_time io_start, @@ -813,8 +822,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * (Note that we cannot use LockBuffer() of LockBufferForCleanup() here, * because they assert that the buffer is already valid.) */ - if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && - !isLocalBuf) + if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || + mode == RBM_ZERO_NO_BM_VALID) && !isLocalBuf) { LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); } @@ -826,8 +835,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } else { - /* Set BM_VALID, terminate IO, and wake up any waiters */ - TerminateBufferIO(bufHdr, false, BM_VALID); + /* + * Set BM_VALID (in all modes except RBM_ZERO_NO_BM_VALID), + * terminate IO, and wake up any waiters. + */ + if (mode == RBM_ZERO_NO_BM_VALID) + TerminateBufferIO(bufHdr, false, 0); + else + TerminateBufferIO(bufHdr, false, BM_VALID); } VacuumPageMiss++; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index ec0a254..62c0348 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -40,6 +40,8 @@ typedef enum * initialize. Also locks the page. */ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ + RBM_ZERO_NO_BM_VALID, /* Like RBM_ZERO_AND_LOCK, but do not set BM_VALID + * bit before returning buffer */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL * replay; otherwise same as RBM_NORMAL */