From 8751db6e46166a3c54c456c6d981c74cdcd526e3 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Mon, 3 Dec 2018 10:12:34 +0530 Subject: [PATCH 2/3] Provide access to undo log data via the buffer manager. In ancient Berkeley POSTGRES, smgr.c allowed for different storage engines, of which only md.c survives. Revive this mechanism to provide access to undo log data through the existing buffer manager. Undo logs exist in a pseudo-database whose OID is used to dispatch IO requests to undofile.c instead of md.c. Note: a separate proposal generalizes the fsync request machinery, see https://commitfest.postgresql.org/20/1829/. This patch has some stand-in fsync machinery, but will be rebased on that other one depending on progress. It seems better to avoid tangling up too many concurrent proposals so for now this patch has its own fsync queue, duplicating some code from md.c. Author: Thomas Munro, though ForgetBuffer() was contributed by Robert Haas Discussion: https://postgr.es/m/CAEepm%3D2EqROYJ_xYz4v5kfr4b0qw_Lq_6Pe8RTEC8rx3upWsSQ%40mail.gmail.com --- src/backend/access/transam/xlogutils.c | 10 +- src/backend/postmaster/checkpointer.c | 2 +- src/backend/postmaster/pgstat.c | 24 +- src/backend/storage/buffer/bufmgr.c | 82 ++++- src/backend/storage/smgr/Makefile | 2 +- src/backend/storage/smgr/md.c | 15 +- src/backend/storage/smgr/smgr.c | 49 ++- src/backend/storage/smgr/undofile.c | 546 +++++++++++++++++++++++++++++++++ src/include/pgstat.h | 16 +- src/include/storage/bufmgr.h | 14 +- src/include/storage/smgr.h | 35 ++- src/include/storage/undofile.h | 50 +++ 12 files changed, 810 insertions(+), 35 deletions(-) create mode 100644 src/backend/storage/smgr/undofile.c create mode 100644 src/include/storage/undofile.h diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4ecdc92..8fed7b1 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -346,7 +346,7 @@ 
XLogReadBufferForRedoExtended(XLogReaderState *record, * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. */ - zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + zeromode = (mode == RBM_ZERO || mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); @@ -462,7 +462,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, { /* page exists in file */ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, RELPERSISTENCE_PERMANENT); } else { @@ -487,7 +487,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL); + P_NEW, mode, NULL, + RELPERSISTENCE_PERMANENT); } while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ @@ -497,7 +498,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, + RELPERSISTENCE_PERMANENT); } } diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index b9c118e..b2505c8 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -1314,7 +1314,7 @@ AbsorbFsyncRequests(void) LWLockRelease(CheckpointerCommLock); for (request = requests; n > 0; request++, n--) - RememberFsyncRequest(request->rnode, request->forknum, request->segno); + smgrrequestsync(request->rnode, request->forknum, request->segno); END_CRIT_SECTION(); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8676088..9d717d9 100644 --- 
a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3515,7 +3515,7 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_WAL_WRITER_MAIN: event_name = "WalWriterMain"; break; - /* no default case, so that compiler will warn */ + /* no default case, so that compiler will warn */ } return event_name; @@ -3897,6 +3897,28 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_UNDO_CHECKPOINT_READ: + event_name = "UndoCheckpointRead"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_WRITE: + event_name = "UndoCheckpointWrite"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_SYNC: + event_name = "UndoCheckpointSync"; + break; + case WAIT_EVENT_UNDO_FILE_READ: + event_name = "UndoFileRead"; + break; + case WAIT_EVENT_UNDO_FILE_WRITE: + event_name = "UndoFileWrite"; + break; + case WAIT_EVENT_UNDO_FILE_FLUSH: + event_name = "UndoFileFlush"; + break; + case WAIT_EVENT_UNDO_FILE_SYNC: + event_name = "UndoFileSync"; + break; + case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9817770..bf2408a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -176,6 +176,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +static void InvalidateBuffer(BufferDesc *buf); /* * Ensure that the PrivateRefCountArray has sufficient space to store one more @@ -618,10 +619,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. 
* - * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * In RBM_ZERO mode, if the page isn't in buffer cache already, it's * filled with zeros instead of reading it from disk. Useful when the caller * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * + * In RBM_ZERO_AND_LOCK mode, the page is zeroed and also locked. * The page is returned locked to ensure that the caller has a chance to * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's @@ -672,24 +675,20 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, /* * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require * a relcache entry for the relation. - * - * NB: At present, this function may only be used on permanent relations, which - * is OK, because we only use it during XLOG replay. If in the future we - * want to use it on temporary or unlogged relations, we could pass additional - * parameters. */ Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, - BufferAccessStrategy strategy) + BufferAccessStrategy strategy, + char relpersistence) { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); - - Assert(InRecovery); + SMgrRelation smgr = smgropen(rnode, + relpersistence == RELPERSISTENCE_TEMP + ? MyBackendId : InvalidBackendId); - return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, + return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum, mode, strategy, &hit); } @@ -883,7 +882,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. 
 */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO || + mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK) MemSet((char *) bufBlock, 0, BLCKSZ); else { @@ -1338,6 +1339,61 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* + * ForgetBuffer -- drop a buffer from shared buffers + * + * If the buffer isn't present in shared buffers, nothing happens. If it is + * present, it is discarded without making any attempt to write it back out to + * the operating system. The caller must therefore somehow be sure that the + * data won't be needed for anything now or in the future. It assumes that + * there is no concurrent access to the block, except that it might be being + * concurrently written. + */ +void +ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + BufferTag tag; /* identity of target block */ + uint32 hash; /* hash value for tag */ + LWLock *partitionLock; /* buffer partition lock for it */ + int buf_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* see if the block is in the buffer pool */ + LWLockAcquire(partitionLock, LW_SHARED); + buf_id = BufTableLookup(&tag, hash); + LWLockRelease(partitionLock); + + /* didn't find it, so nothing to do */ + if (buf_id < 0) + return; + + /* take the buffer header lock */ + bufHdr = GetBufferDescriptor(buf_id); + buf_state = LockBufHdr(bufHdr); + + /* + * The buffer might have been evicted after we released the partition lock and + * before we acquired the buffer header lock. If so, the buffer we've + * locked might contain some other data which we shouldn't touch. 
If the + * buffer hasn't been recycled, we proceed to invalidate it. + */ + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum == blockNum && + bufHdr->tag.forkNum == forkNum) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); +} + +/* * InvalidateBuffer -- mark a shared buffer invalid and return it to the * freelist. * @@ -1412,7 +1468,7 @@ retry: LWLockRelease(oldPartitionLock); /* safety check: should definitely not be our *own* pin */ if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0) - elog(ERROR, "buffer is pinned in InvalidateBuffer"); + elog(PANIC, "buffer is pinned in InvalidateBuffer"); WaitIO(buf); goto retry; } diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index 2b95cb0..b657eb2 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = md.o smgr.o smgrtype.o +OBJS = md.o smgr.o smgrtype.o undofile.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 4c6a505..4c489a2 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -45,7 +45,7 @@ #define UNLINKS_PER_ABSORB 10 /* - * Special values for the segno arg to RememberFsyncRequest. + * Special values for the segno arg to mdrequestsync. * * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. 
@@ -1420,7 +1420,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); + mdrequestsync(reln->smgr_rnode.node, forknum, seg->mdfd_segno); } else { @@ -1456,8 +1456,7 @@ register_unlink(RelFileNodeBackend rnode) if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST); + mdrequestsync(rnode.node, MAIN_FORKNUM, UNLINK_RELATION_REQUEST); } else { @@ -1476,7 +1475,7 @@ register_unlink(RelFileNodeBackend rnode) } /* - * RememberFsyncRequest() -- callback from checkpointer side of fsync request + * mdrequestsync() -- callback from checkpointer side of fsync request * * We stuff fsync requests into the local hash table for execution * during the checkpointer's next checkpoint. UNLINK requests go into a @@ -1497,7 +1496,7 @@ register_unlink(RelFileNodeBackend rnode) * heavyweight operation anyhow, so we'll live with it.) 
*/ void -RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) +mdrequestsync(RelFileNode rnode, ForkNumber forknum, int segno) { Assert(pendingOpsTable); @@ -1640,7 +1639,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) if (pendingOpsTable) { /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC); + mdrequestsync(rnode, forknum, FORGET_RELATION_FSYNC); } else if (IsUnderPostmaster) { @@ -1679,7 +1678,7 @@ ForgetDatabaseFsyncRequests(Oid dbid) if (pendingOpsTable) { /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); + mdrequestsync(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); } else if (IsUnderPostmaster) { diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 189342e..d0b2c0d 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -58,6 +58,8 @@ typedef struct f_smgr BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); + void (*smgr_requestsync) (RelFileNode rnode, ForkNumber forknum, + int segno); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); void (*smgr_pre_ckpt) (void); /* may be NULL */ void (*smgr_sync) (void); /* may be NULL */ @@ -81,15 +83,45 @@ static const f_smgr smgrsw[] = { .smgr_writeback = mdwriteback, .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, + .smgr_requestsync = mdrequestsync, .smgr_immedsync = mdimmedsync, .smgr_pre_ckpt = mdpreckpt, .smgr_sync = mdsync, .smgr_post_ckpt = mdpostckpt + }, + /* undo logs */ + { + .smgr_init = undofile_init, + .smgr_shutdown = undofile_shutdown, + .smgr_close = undofile_close, + .smgr_create = undofile_create, + .smgr_exists = undofile_exists, + .smgr_unlink = undofile_unlink, + .smgr_extend = undofile_extend, + 
.smgr_prefetch = undofile_prefetch, + .smgr_read = undofile_read, + .smgr_write = undofile_write, + .smgr_writeback = undofile_writeback, + .smgr_nblocks = undofile_nblocks, + .smgr_truncate = undofile_truncate, + .smgr_requestsync = undofile_requestsync, + .smgr_immedsync = undofile_immedsync, + .smgr_pre_ckpt = undofile_preckpt, + .smgr_sync = undofile_sync, + .smgr_post_ckpt = undofile_postckpt } }; static const int NSmgr = lengthof(smgrsw); +/* + * In ancient Postgres the catalog entry for each relation controlled the + * choice of storage manager implementation. Now we have only md.c for + * regular relations, and undofile.c for undo log storage in the undolog + * pseudo-database. + */ +#define SmgrWhichForRelFileNode(rfn) \ + ((rfn).dbNode == 9 ? 1 : 0) /* * Each backend has a hashtable that stores all extant SMgrRelation objects. @@ -185,11 +217,18 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_targblock = InvalidBlockNumber; reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + /* Which storage manager implementation? */ + reln->smgr_which = SmgrWhichForRelFileNode(rnode); /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { reln->md_num_open_segs[forknum] = 0; + reln->md_seg_fds[forknum] = NULL; + } + + reln->private_data = NULL; /* it has no owner yet */ add_to_unowned_list(reln); @@ -723,6 +762,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } /* + * smgrrequestsync() -- Enqueue a request for smgrsync() to flush data. + */ +void smgrrequestsync(RelFileNode rnode, ForkNumber forknum, int segno) +{ + smgrsw[SmgrWhichForRelFileNode(rnode)].smgr_requestsync(rnode, forknum, segno); +} + +/* * smgrimmedsync() -- Force the specified relation to stable storage. 
 * * Synchronously force all previous writes to the specified relation diff --git a/src/backend/storage/smgr/undofile.c b/src/backend/storage/smgr/undofile.c new file mode 100644 index 0000000..afba64e --- /dev/null +++ b/src/backend/storage/smgr/undofile.c @@ -0,0 +1,546 @@ +/* + * undofile.c + * + * PostgreSQL undo file manager. This module provides SMGR-compatible + * interface to the files that back undo logs on the filesystem, so that undo + * log data can use the shared buffer pool. Other aspects of undo log + * management are provided by undolog.c, so the SMGR interfaces not directly + * concerned with reading, writing and flushing data are unimplemented. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/smgr/undofile.c + */ + +#include "postgres.h" + +#include "access/undolog.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/fd.h" +#include "storage/undofile.h" +#include "utils/memutils.h" + +/* intervals for calling AbsorbFsyncRequests in undofile_sync */ +#define FSYNCS_PER_ABSORB 10 + +/* + * Special values for the fork arg to undofile_requestsync. + */ +#define FORGET_UNDO_SEGMENT_FSYNC (InvalidBlockNumber) + +/* + * While md.c expects random access and has a small number of huge + * segments, undofile.c manages a potentially very large number of smaller + * segments and has a less random access pattern. Therefore, instead of + * keeping a potentially huge array of vfds we'll just keep the most + * recently accessed N. + * + * For now, N == 1, so we just need to hold onto one 'File' handle. + */ +typedef struct UndoFileState +{ + int mru_segno; + File mru_file; +} UndoFileState; + +static MemoryContext UndoFileCxt; + +typedef uint16 CycleCtr; + +/* + * An entry recording the segments that need to be fsynced by undofile_sync(). 
+ * This is a bit simpler than md.c's version, though it could perhaps be + * merged into a common struct. One difference is that we can have much + * larger segment numbers, so we'll adjust for that to avoid having a lot of + * leading zero bits. + */ +typedef struct +{ + RelFileNode rnode; + Bitmapset *requests; + CycleCtr cycle_ctr; +} PendingOperationEntry; + +static HTAB *pendingOpsTable = NULL; +static MemoryContext pendingOpsCxt; + +static CycleCtr undofile_sync_cycle_ctr = 0; + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, int segno, + bool missing_ok); +static File undofile_get_segment_file(SMgrRelation reln, int segno); + +void +undofile_init(void) +{ + UndoFileCxt = AllocSetContextCreate(TopMemoryContext, + "UndoFileSmgr", + ALLOCSET_DEFAULT_SIZES); + + if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) + { + HASHCTL hash_ctl; + + pendingOpsCxt = AllocSetContextCreate(UndoFileCxt, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(RelFileNode); + hash_ctl.entrysize = sizeof(PendingOperationEntry); + hash_ctl.hcxt = pendingOpsCxt; + pendingOpsTable = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } +} + +void +undofile_shutdown(void) +{ +} + +void +undofile_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +void +undofile_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_create is not supported"); +} + +bool +undofile_exists(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_exists is not supported"); +} + +void +undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_unlink is not supported"); +} + +void +undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + elog(ERROR, 
"undofile_extend is not supported"); +} + +void +undofile_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + elog(ERROR, "undofile_prefetch is not supported"); +} + +void +undofile_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileRead(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_READ); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } +} + +static void +register_dirty_segment(SMgrRelation reln, ForkNumber forknum, int segno, File file) +{ + /* Temp relations should never be fsync'd */ + Assert(!SmgrIsTemp(reln)); + + if (pendingOpsTable) + { + /* push it into local pending-ops table */ + undofile_requestsync(reln->smgr_rnode.node, forknum, segno); + } + else + { + if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, segno)) + return; /* passed it off successfully */ + + ereport(DEBUG1, + (errmsg("could not forward fsync request because request queue is full"))); + + if (FileSync(file, WAIT_EVENT_DATA_FILE_SYNC) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(file)))); + } +} + +void +undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); 
 + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileWrite(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_WRITE); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + /* + * short write: unexpected, because this should be overwriting an + * entirely pre-allocated segment file + */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, blocknum / UNDOSEG_SIZE, file); +} + +void +undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + while (nblocks > 0) + { + File file; + int nflush; + + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + + /* compute number of desired writes within the current segment */ + nflush = Min(nblocks, + UNDOSEG_SIZE - (blocknum % UNDOSEG_SIZE)); + + FileWriteback(file, + (blocknum % UNDOSEG_SIZE) * BLCKSZ, + nflush * BLCKSZ, WAIT_EVENT_UNDO_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +BlockNumber +undofile_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_nblocks is not supported"); + return 0; +} + +void +undofile_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + elog(ERROR, "undofile_truncate is not supported"); +} + +void +undofile_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_immedsync is not supported"); +} + +void +undofile_preckpt(void) +{ +} + +void +undofile_requestsync(RelFileNode rnode, ForkNumber forknum, int segno) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingOperationEntry *entry; + bool found; + + 
Assert(pendingOpsTable); + + if (forknum == FORGET_UNDO_SEGMENT_FSYNC) + { + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_FIND, + NULL); + if (entry) + entry->requests = bms_del_member(entry->requests, segno); + } + else + { + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_ENTER, + &found); + if (!found) + { + entry->cycle_ctr = undofile_sync_cycle_ctr; + entry->requests = bms_make_singleton(segno); + } + else + entry->requests = bms_add_member(entry->requests, segno); + } + + MemoryContextSwitchTo(oldcxt); +} + +void +undofile_forgetsync(Oid logno, Oid tablespace, int segno) +{ + RelFileNode rnode; + + rnode.dbNode = 9; + rnode.spcNode = tablespace; + rnode.relNode = logno; + + if (pendingOpsTable) + undofile_requestsync(rnode, FORGET_UNDO_SEGMENT_FSYNC, segno); + else if (IsUnderPostmaster) + { + while (!ForwardFsyncRequest(rnode, FORGET_UNDO_SEGMENT_FSYNC, segno)) + pg_usleep(10000L); + } +} + +void +undofile_sync(void) +{ + static bool undofile_sync_in_progress = false; + + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + int absorb_counter; + int segno; + + if (!pendingOpsTable) + elog(ERROR, "cannot sync without a pendingOpsTable"); + + AbsorbFsyncRequests(); + + if (undofile_sync_in_progress) + { + /* prior try failed, so update any stale cycle_ctr values */ + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + entry->cycle_ctr = undofile_sync_cycle_ctr; + } + + undofile_sync_cycle_ctr++; + undofile_sync_in_progress = true; + + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + Bitmapset *requests; + + /* Skip entries that arrived after we arrived. 
*/ + if (entry->cycle_ctr == undofile_sync_cycle_ctr) + continue; + + Assert((CycleCtr) (entry->cycle_ctr + 1) == undofile_sync_cycle_ctr); + + if (!enableFsync) + continue; + + requests = entry->requests; + entry->requests = NULL; + + segno = -1; + while ((segno = bms_next_member(requests, segno)) >= 0) + { + File file; + + if (!enableFsync) + continue; + + file = undofile_open_segment_file(entry->rnode.relNode, + entry->rnode.spcNode, + segno, true /* missing_ok */); + + /* + * The file may be gone due to concurrent discard. We'll ignore + * that, but only if we find a cancel request for this segment in + * the queue. + * + * It's also possible that we succeed in opening a segment file + * that is subsequently recycled (renamed to represent a new range + * of undo log), in which case we'll fsync that later file + * instead. That is rare and harmless. + */ + if (file <= 0) + { + char name[MAXPGPATH]; + + /* + * Put the request back into the bitset in a way that can't + * fail due to memory allocation. + */ + entry->requests = bms_join(entry->requests, requests); + /* + * Check if a forgetsync request has arrived to delete that + * segment. + */ + AbsorbFsyncRequests(); + if (bms_is_member(segno, entry->requests)) + { + UndoLogSegmentPath(entry->rnode.relNode, + segno, + entry->rnode.spcNode, + name); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", name))); + } + /* It must have been removed, so we can safely skip it. */ + continue; + } + + elog(LOG, "fsync()ing %s", FilePathName(file)); /* TODO: remove me */ + if (FileSync(file, WAIT_EVENT_UNDO_FILE_SYNC) < 0) + { + char name[MAXPGPATH]; + + strcpy(name, FilePathName(file)); + FileClose(file); + + /* + * Keep the failed requests, but merge with any new ones. The + * requirement to be able to do this without risk of failure + * prevents us from using a smaller bitmap that doesn't bother + * tracking leading zeros. Perhaps another data structure + * would be better. 
 + */ + entry->requests = bms_join(entry->requests, requests); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", name))); + } + requests = bms_del_member(requests, segno); + FileClose(file); + + if (--absorb_counter <= 0) + { + AbsorbFsyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } + } + + bms_free(requests); + } + + undofile_sync_in_progress = false; +} + +void undofile_postckpt(void) +{ +} + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, int segno, + bool missing_ok) +{ + File file; + char path[MAXPGPATH]; + + UndoLogSegmentPath(relNode, segno, spcNode, path); + file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + + if (file <= 0 && (!missing_ok || errno != ENOENT)) + elog(ERROR, "cannot open undo segment file '%s': %m", path); + + return file; +} + +/* + * Get a File for a particular segment of a SMgrRelation representing an undo + * log. + */ +static File undofile_get_segment_file(SMgrRelation reln, int segno) +{ + UndoFileState *state; + + + /* + * Create private state space on demand. + * + * XXX There should probably be a smgr 'open' or 'init' interface that + * would do this. smgr.c currently initializes reln->md_XXX stuff + * directly... + */ + state = (UndoFileState *) reln->private_data; + if (unlikely(state == NULL)) + { + state = MemoryContextAllocZero(UndoFileCxt, sizeof(UndoFileState)); + reln->private_data = state; + } + + /* If we have a file open already, check if we need to close it. */ + if (state->mru_file > 0 && state->mru_segno != segno) + { + /* These are not the blocks we're looking for. */ + FileClose(state->mru_file); + state->mru_file = 0; + } + + /* Check if we need to open a new file. 
*/ + if (state->mru_file <= 0) + { + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, InRecovery); + if (InRecovery && state->mru_file <= 0) + { + /* + * If in recovery, we may be trying to access a file that will + * later be unlinked. Tolerate missing files, creating a new + * zero-filled file as required. + */ + UndoLogNewSegment(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno); + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, false); + Assert(state->mru_file > 0); + } + state->mru_segno = segno; + } + + return state->mru_file; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f1c10d1..763379e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -624,6 +624,11 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_inserted; PgStat_Counter tuples_updated; PgStat_Counter tuples_deleted; + + /* + * Counter tuples_hot_updated stores number of hot updates for heap table + * and the number of inplace updates for zheap table. 
+ */ PgStat_Counter tuples_hot_updated; PgStat_Counter n_live_tuples; @@ -743,6 +748,7 @@ typedef enum BackendState #define PG_WAIT_IPC 0x08000000U #define PG_WAIT_TIMEOUT 0x09000000U #define PG_WAIT_IO 0x0A000000U +#define PG_WAIT_PAGE_TRANS_SLOT 0x0B000000U /* ---------- * Wait Events - Activity @@ -767,7 +773,7 @@ typedef enum WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, WAIT_EVENT_WAL_SENDER_MAIN, - WAIT_EVENT_WAL_WRITER_MAIN + WAIT_EVENT_WAL_WRITER_MAIN, } WaitEventActivity; /* ---------- @@ -913,6 +919,13 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_READ, + WAIT_EVENT_UNDO_CHECKPOINT_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_SYNC, + WAIT_EVENT_UNDO_FILE_READ, + WAIT_EVENT_UNDO_FILE_WRITE, + WAIT_EVENT_UNDO_FILE_FLUSH, + WAIT_EVENT_UNDO_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, @@ -1317,6 +1330,7 @@ pgstat_report_wait_end(void) extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); +extern void pgstat_count_zheap_update(Relation rel); extern void pgstat_count_heap_delete(Relation rel); extern void pgstat_count_truncate(Relation rel); extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3cce390..5b13556 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -38,8 +38,9 @@ typedef enum BufferAccessStrategyType typedef enum { RBM_NORMAL, /* Normal read */ - RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will - * initialize. Also locks the page. */ + RBM_ZERO, /* Don't read from disk, caller will + * initialize. */ + RBM_ZERO_AND_LOCK, /* Like RBM_ZERO, but also locks the page. 
*/ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ @@ -171,7 +172,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, - ReadBufferMode mode, BufferAccessStrategy strategy); + ReadBufferMode mode, BufferAccessStrategy strategy, + char relpersistence); +extern void ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer); @@ -228,6 +232,10 @@ extern void AtProcExit_LocalBuffers(void); extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); +/* in localbuf.c */ +extern void ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); + /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index c843bbc..65d164b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -71,6 +71,9 @@ typedef struct SMgrRelationData int md_num_open_segs[MAX_FORKNUM + 1]; struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + /* For use by implementations. 
*/ + void *private_data; + /* if unowned, list link in list of all unowned SMgrRelations */ struct SMgrRelationData *next_unowned_reln; } SMgrRelationData; @@ -105,6 +108,7 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); +extern void smgrrequestsync(RelFileNode rnode, ForkNumber forknum, int segno); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void smgrpreckpt(void); extern void smgrsync(void); @@ -133,14 +137,41 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); +extern void mdrequestsync(RelFileNode rnode, ForkNumber forknum, int segno); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdpreckpt(void); extern void mdsync(void); extern void mdpostckpt(void); +/* in undofile.c */ +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + 
BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_requestsync(RelFileNode rnode, ForkNumber forknum, int segno); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void undofile_preckpt(void); +extern void undofile_sync(void); +extern void undofile_postckpt(void); + extern void SetForwardFsyncRequests(void); -extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, - BlockNumber segno); extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); extern void ForgetDatabaseFsyncRequests(Oid dbid); extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo); diff --git a/src/include/storage/undofile.h b/src/include/storage/undofile.h new file mode 100644 index 0000000..7544be3 --- /dev/null +++ b/src/include/storage/undofile.h @@ -0,0 +1,50 @@ +/* + * undofile.h + * + * PostgreSQL undo file manager. This module manages the files that back undo + * logs on the filesystem. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/undofile.h + */ + +#ifndef UNDOFILE_H +#define UNDOFILE_H + +#include "storage/smgr.h" + +/* Prototypes of functions exposed to SMgr. 
*/ +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void undofile_preckpt(void); +extern void undofile_sync(void); +extern void undofile_postckpt(void); + +/* Functions used by undolog.c. */ +extern void undofile_forgetsync(Oid logno, Oid tablespace, int segno); + +#endif -- 1.8.3.1