From 24129934c491179edcdd02c6085d74074ecb702c Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 25 May 2018 09:43:16 +1200 Subject: [PATCH 2/6] Provide access to undo log data via the buffer manager. In ancient Berkeley POSTGRES, smgr.c allowed for different storage engines, of which only md.c survives. Revive this mechanism to provide access to undo log data through the existing buffer manager. Undo logs exist in a pseudo-database whose OID is used to dispatch IO requests to undofile.c instead of md.c. Convert RemeberFsyncRequest() into a first class smgr function smgrrequestsync() so that it can be dispatched to md.c or undofile.c as appropriate. XXX Status: WIP. Some details around fsync queuing are likely to change. Author: Thomas Munro, though ForgetBuffer() was contributed by Robert Haas Reviewed-By: Discussion: --- src/backend/access/transam/xlogutils.c | 8 +- src/backend/postmaster/checkpointer.c | 2 +- src/backend/postmaster/pgstat.c | 22 + src/backend/storage/buffer/bufmgr.c | 80 +++- src/backend/storage/smgr/Makefile | 2 +- src/backend/storage/smgr/md.c | 15 +- src/backend/storage/smgr/smgr.c | 37 +- src/backend/storage/smgr/undofile.c | 547 +++++++++++++++++++++++++ src/include/pgstat.h | 7 + src/include/storage/bufmgr.h | 14 +- src/include/storage/smgr.h | 35 +- src/include/storage/undofile.h | 50 +++ 12 files changed, 788 insertions(+), 31 deletions(-) create mode 100644 src/backend/storage/smgr/undofile.c create mode 100644 src/include/storage/undofile.h diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 52fe55e2afb..4888242604b 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -462,7 +462,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, { /* page exists in file */ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, RELPERSISTENCE_PERMANENT); } else { @@ -487,7 +487,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL); + P_NEW, mode, NULL, + RELPERSISTENCE_PERMANENT); } while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ @@ -497,7 +498,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, + RELPERSISTENCE_PERMANENT); } } diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 0950ada6019..c0bde4d0566 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -1342,7 +1342,7 @@ AbsorbFsyncRequests(void) LWLockRelease(CheckpointerCommLock); for (request = requests; n > 0; request++, n--) - RememberFsyncRequest(request->rnode, request->forknum, request->segno); + smgrrequestsync(request->rnode, request->forknum, request->segno); END_CRIT_SECTION(); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 084573e77c0..0c5f3daa69d 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3898,6 +3898,28 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_UNDO_CHECKPOINT_READ: + event_name = "UndoCheckpointRead"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_WRITE: + event_name = "UndoCheckpointWrite"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_SYNC: + event_name = "UndoCheckpointSync"; + break; + case WAIT_EVENT_UNDO_FILE_READ: + event_name = "UndoFileRead"; + break; + case WAIT_EVENT_UNDO_FILE_WRITE: + event_name = "UndoFileWrite"; + break; + case WAIT_EVENT_UNDO_FILE_FLUSH: + event_name = "UndoFileFlush"; + break; + case WAIT_EVENT_UNDO_FILE_SYNC: + event_name = "UndoFileSync"; + break; + case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 01eabe57063..31a9b54080e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -176,6 +176,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +static void InvalidateBuffer(BufferDesc *buf); /* * Ensure that the PrivateRefCountArray has sufficient space to store one more @@ -618,10 +619,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * - * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * In RBM_ZERO mode, if the page isn't in buffer cache already, it's * filled with zeros instead of reading it from disk. Useful when the caller * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * + * In RBM_ZERO_AND_LOCK mode, the page is zeroed and also locked. * The page is returned locked to ensure that the caller has a chance to * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's @@ -672,24 +675,20 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, /* * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require * a relcache entry for the relation. - * - * NB: At present, this function may only be used on permanent relations, which - * is OK, because we only use it during XLOG replay. If in the future we - * want to use it on temporary or unlogged relations, we could pass additional - * parameters. */ Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, - BufferAccessStrategy strategy) + BufferAccessStrategy strategy, + char relpersistence) { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); - - Assert(InRecovery); + SMgrRelation smgr = smgropen(rnode, + relpersistence == RELPERSISTENCE_TEMP + ? MyBackendId : InvalidBackendId); - return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, + return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum, mode, strategy, &hit); } @@ -877,7 +876,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO || + mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK) MemSet((char *) bufBlock, 0, BLCKSZ); else { @@ -1331,6 +1332,61 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, return buf; } +/* + * ForgetBuffer -- drop a buffer from shared buffers + * + * If the buffer isn't present in shared buffers, nothing happens. If it is + * present, it is discarded without making any attempt to write it back out to + * the operating system. The caller must therefore somehow be sure that the + * data won't be needed for anything now or in the future. It assumes that + * there is no concurrent access to the block, except that it might be being + * concurrently written. + */ +void +ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + BufferTag tag; /* identity of target block */ + uint32 hash; /* hash value for tag */ + LWLock *partitionLock; /* buffer partition lock for it */ + int buf_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* see if the block is in the buffer pool */ + LWLockAcquire(partitionLock, LW_SHARED); + buf_id = BufTableLookup(&tag, hash); + LWLockRelease(partitionLock); + + /* didn't find it, so nothing to do */ + if (buf_id < 0) + return; + + /* take the buffer header lock */ + bufHdr = GetBufferDescriptor(buf_id); + buf_state = LockBufHdr(bufHdr); + + /* + * The buffer might been evicted after we released the partition lock and + * before we acquired the buffer header lock. If so, the buffer we've + * locked might contain some other data which we shouldn't touch. If the + * buffer hasn't been recycled, we proceed to invalidate it. + */ + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum == blockNum && + bufHdr->tag.forkNum == forkNum) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); +} + /* * InvalidateBuffer -- mark a shared buffer invalid and return it to the * freelist. diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index 2b95cb0df16..b657eb275fa 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = md.o smgr.o smgrtype.o +OBJS = md.o smgr.o smgrtype.o undofile.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ec103e6047..930a2a8c74b 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -44,7 +44,7 @@ #define UNLINKS_PER_ABSORB 10 /* - * Special values for the segno arg to RememberFsyncRequest. + * Special values for the segno arg to mdrequestsync. * * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. @@ -1433,7 +1433,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); + mdrequestsync(reln->smgr_rnode.node, forknum, seg->mdfd_segno); } else { @@ -1469,8 +1469,7 @@ register_unlink(RelFileNodeBackend rnode) if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST); + mdrequestsync(rnode.node, MAIN_FORKNUM, UNLINK_RELATION_REQUEST); } else { @@ -1489,7 +1488,7 @@ register_unlink(RelFileNodeBackend rnode) } /* - * RememberFsyncRequest() -- callback from checkpointer side of fsync request + * mdrequestsync() -- callback from checkpointer side of fsync request * * We stuff fsync requests into the local hash table for execution * during the checkpointer's next checkpoint. UNLINK requests go into a @@ -1510,7 +1509,7 @@ register_unlink(RelFileNodeBackend rnode) * heavyweight operation anyhow, so we'll live with it.) */ void -RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) +mdrequestsync(RelFileNode rnode, ForkNumber forknum, int segno) { Assert(pendingOpsTable); @@ -1653,7 +1652,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) if (pendingOpsTable) { /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC); + mdrequestsync(rnode, forknum, FORGET_RELATION_FSYNC); } else if (IsUnderPostmaster) { @@ -1692,7 +1691,7 @@ ForgetDatabaseFsyncRequests(Oid dbid) if (pendingOpsTable) { /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); + mdrequestsync(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); } else if (IsUnderPostmaster) { diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 08f06bade25..c456ea1a12b 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -58,6 +58,8 @@ typedef struct f_smgr BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); + void (*smgr_requestsync) (RelFileNode rnode, ForkNumber forknum, + int segno); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); void (*smgr_pre_ckpt) (void); /* may be NULL */ void (*smgr_sync) (void); /* may be NULL */ @@ -69,12 +71,30 @@ static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, mdprefetch, mdread, mdwrite, mdwriteback, mdnblocks, mdtruncate, + mdrequestsync, mdimmedsync, mdpreckpt, mdsync, mdpostckpt + }, + /* undo logs */ + {undofile_init, undofile_shutdown, undofile_close, undofile_create, + undofile_exists, undofile_unlink, undofile_extend, undofile_prefetch, + undofile_read, undofile_write, undofile_writeback, undofile_nblocks, + undofile_truncate, + undofile_requestsync, + undofile_immedsync, undofile_preckpt, undofile_sync, + undofile_postckpt } }; static const int NSmgr = lengthof(smgrsw); +/* + * In ancient Postgres the catalog entry for each relation controlled the + * choice of storage manager implementation. Now we have only md.c for + * regular relations, and undofile.c for undo log storage in the undolog + * pseudo-database. + */ +#define SmgrWhichForRelFileNode(rfn) \ + ((rfn).dbNode == 9 ? 1 : 0) /* * Each backend has a hashtable that stores all extant SMgrRelation objects. @@ -170,11 +190,18 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_targblock = InvalidBlockNumber; reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + /* Which storage manager implementation? */ + reln->smgr_which = SmgrWhichForRelFileNode(rnode); /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { reln->md_num_open_segs[forknum] = 0; + reln->md_seg_fds[forknum] = NULL; + } + + reln->private_data = NULL; /* it has no owner yet */ add_to_unowned_list(reln); @@ -707,6 +734,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks); } +/* + * smgrrequestsync() -- Enqueue a request for smgrsync() to flush data. + */ +void smgrrequestsync(RelFileNode rnode, ForkNumber forknum, int segno) +{ + smgrsw[SmgrWhichForRelFileNode(rnode)].smgr_requestsync(rnode, forknum, segno); +} + /* * smgrimmedsync() -- Force the specified relation to stable storage. * diff --git a/src/backend/storage/smgr/undofile.c b/src/backend/storage/smgr/undofile.c new file mode 100644 index 00000000000..dbf98acb227 --- /dev/null +++ b/src/backend/storage/smgr/undofile.c @@ -0,0 +1,547 @@ +/* + * undofile.h + * + * PostgreSQL undo file manager. This module provides SMGR-compatible + * interface to the files that back undo logs on the filesystem, so that undo + * log data can use the shared buffer pool. Other aspects of undo log + * management are provided by undolog.c, so the SMGR interfaces not directly + * concerned with reading, writing and flushing data are unimplemented. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/storage/smgr/undofile.c + */ + +#include "postgres.h" + +#include "access/undolog.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/fd.h" +#include "storage/undofile.h" +#include "utils/memutils.h" + +/* intervals for calling AbsorbFsyncRequests in undofile_sync */ +#define FSYNCS_PER_ABSORB 10 + +/* + * Special values for the fork arg to undofile_requestsync. + */ +#define FORGET_UNDO_SEGMENT_FSYNC (InvalidBlockNumber) + +/* + * While md.c expects random access and has a small number of huge + * segments, undofile.c manages a potentially very large number of smaller + * segments and has a less random access pattern. Therefore, instead of + * keeping a potentially huge array of vfds we'll just keep the most + * recently accessed N. + * + * For now, N == 1, so we just need to hold onto one 'File' handle. + */ +typedef struct UndoFileState +{ + int mru_segno; + File mru_file; +} UndoFileState; + +static MemoryContext UndoFileCxt; + +typedef uint16 CycleCtr; + +/* + * An entry recording the segments that need to be fsynced by undofile_sync(). + * This is a bit simpler than md.c's version, though it could perhaps be + * merged into a common struct. One difference is that we can have much + * larger segment numbers, so we'll adjust for that to avoid having a lot of + * leading zero bits. + */ +typedef struct +{ + RelFileNode rnode; + Bitmapset *requests; + CycleCtr cycle_ctr; +} PendingOperationEntry; + +static HTAB *pendingOpsTable = NULL; +static MemoryContext pendingOpsCxt; + +static CycleCtr undofile_sync_cycle_ctr = 0; + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, int segno, + bool missing_ok); +static File undofile_get_segment_file(SMgrRelation reln, int segno); + +void +undofile_init(void) +{ + UndoFileCxt = AllocSetContextCreate(TopMemoryContext, + "UndoFileSmgr", + ALLOCSET_DEFAULT_SIZES); + + if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) + { + HASHCTL hash_ctl; + + pendingOpsCxt = AllocSetContextCreate(UndoFileCxt, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(RelFileNode); + hash_ctl.entrysize = sizeof(PendingOperationEntry); + hash_ctl.hcxt = pendingOpsCxt; + pendingOpsTable = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } +} + +void +undofile_shutdown(void) +{ +} + +void +undofile_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +void +undofile_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_create is not supported"); +} + +bool +undofile_exists(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_exists is not supported"); +} + +void +undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_unlink is not supported"); +} + +void +undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + elog(ERROR, "undofile_extend is not supported"); +} + +void +undofile_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + elog(ERROR, "undofile_prefetch is not supported"); +} + +void +undofile_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + if (FileSeek(file, seekpos, SEEK_SET) != seekpos) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + nbytes = FileRead(file, buffer, BLCKSZ, WAIT_EVENT_UNDO_FILE_READ); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } +} + +static void +register_dirty_segment(SMgrRelation reln, ForkNumber forknum, int segno, File file) +{ + /* Temp relations should never be fsync'd */ + Assert(!SmgrIsTemp(reln)); + + if (pendingOpsTable) + { + /* push it into local pending-ops table */ + undofile_requestsync(reln->smgr_rnode.node, forknum, segno); + } + else + { + if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, segno)) + return; /* passed it off successfully */ + + ereport(DEBUG1, + (errmsg("could not forward fsync request because request queue is full"))); + + if (FileSync(file, WAIT_EVENT_DATA_FILE_SYNC) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(file)))); + } +} + +void +undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + if (FileSeek(file, seekpos, SEEK_SET) != seekpos) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + nbytes = FileWrite(file, buffer, BLCKSZ, WAIT_EVENT_UNDO_FILE_WRITE); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + /* + * short write: unexpected, because this should be overwriting an + * entirely pre-allocated segment file + */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, blocknum / UNDOSEG_SIZE, file); +} + +void +undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + while (nblocks > 0) + { + File file; + int nflush; + + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + + /* compute number of desired writes within the current segment */ + nflush = Min(nblocks, + 1 + UNDOSEG_SIZE - (blocknum % UNDOSEG_SIZE)); + + FileWriteback(file, + (blocknum % UNDOSEG_SIZE) * BLCKSZ, + nflush * BLCKSZ, WAIT_EVENT_UNDO_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +BlockNumber +undofile_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_nblocks is not supported"); + return 0; +} + +void +undofile_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + elog(ERROR, "undofile_truncate is not supported"); +} + +void +undofile_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_immedsync is not supported"); +} + +void +undofile_preckpt(void) +{ +} + +void +undofile_requestsync(RelFileNode rnode, ForkNumber forknum, int segno) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingOperationEntry *entry; + bool found; + + Assert(pendingOpsTable); + + if (forknum == FORGET_UNDO_SEGMENT_FSYNC) + { + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_FIND, + NULL); + if (entry) + entry->requests = bms_del_member(entry->requests, segno); + } + else + { + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_ENTER, + &found); + if (!found) + { + entry->cycle_ctr = undofile_sync_cycle_ctr; + entry->requests = bms_make_singleton(segno); + } + else + entry->requests = bms_add_member(entry->requests, segno); + } + + MemoryContextSwitchTo(oldcxt); +} + +void +undofile_forgetsync(Oid logno, Oid tablespace, int segno) +{ + RelFileNode rnode; + + rnode.dbNode = 9; + rnode.spcNode = tablespace; + rnode.relNode = logno; + + if (pendingOpsTable) + undofile_requestsync(rnode, FORGET_UNDO_SEGMENT_FSYNC, segno); + else if (IsUnderPostmaster) + { + while (!ForwardFsyncRequest(rnode, FORGET_UNDO_SEGMENT_FSYNC, segno)) + pg_usleep(10000L); + } +} + +void +undofile_sync(void) +{ + static bool undofile_sync_in_progress = false; + + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + int absorb_counter; + int segno; + + if (!pendingOpsTable) + elog(ERROR, "cannot sync without a pendingOpsTable"); + + AbsorbFsyncRequests(); + + if (undofile_sync_in_progress) + { + /* prior try failed, so update any stale cycle_ctr values */ + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + entry->cycle_ctr = undofile_sync_cycle_ctr; + } + + undofile_sync_cycle_ctr++; + undofile_sync_in_progress = true; + + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + Bitmapset *requests; + + /* Skip entries that arrived after we arrived. */ + if (entry->cycle_ctr == undofile_sync_cycle_ctr) + continue; + + Assert((CycleCtr) (entry->cycle_ctr + 1) == undofile_sync_cycle_ctr); + + if (!enableFsync) + continue; + + requests = entry->requests; + entry->requests = NULL; + + segno = -1; + while ((segno = bms_next_member(requests, segno)) >= 0) + { + File file; + + if (!enableFsync) + continue; + + file = undofile_open_segment_file(entry->rnode.relNode, + entry->rnode.spcNode, + segno, true /* missing_ok */); + + /* + * The file may be gone due to concurrent discard. We'll ignore + * that, but only if we find a cancel request for this segment in + * the queue. + * + * It's also possible that we succeed in opening a segment file + * that is subsequently recycled (renamed to represent a new range + * of undo log), in which case we'll fsync that later file + * instead. That is rare and harmless. + */ + if (file <= 0) + { + /* + * Put the request back into the bitset in a way that can't + * fail due to memory allocation. + */ + entry->requests = bms_join(entry->requests, requests); + /* + * Check if a forgetsync request has arrived to delete that + * segment. + */ + AbsorbFsyncRequests(); + if (bms_is_member(segno, entry->requests)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(file)))); + /* It must have been removed, so we can safely skip it. */ + continue; + } + + elog(LOG, "fsync()ing %s", FilePathName(file)); /* TODO: remove me */ + if (FileSync(file, WAIT_EVENT_UNDO_FILE_SYNC) < 0) + { + FileClose(file); + + /* + * Keep the failed requests, but merge with any new ones. The + * requirement to be able to do this without risk of failure + * prevents us from using a smaller bitmap that doesn't bother + * tracking leading zeros. Perhaps another data structure + * would be better. + */ + entry->requests = bms_join(entry->requests, requests); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(file)))); + } + requests = bms_del_member(requests, segno); + FileClose(file); + + if (--absorb_counter <= 0) + { + AbsorbFsyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } + } + + bms_free(requests); + } + + undofile_sync_in_progress = true; +} + +void undofile_postckpt(void) +{ +} + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, int segno, + bool missing_ok) +{ + File file; + char path[MAXPGPATH]; + + UndoLogSegmentPath(relNode, segno, spcNode, path); + file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + + if (file <= 0 && (!missing_ok || errno != ENOENT)) + elog(ERROR, "cannot open undo segment file '%s': %m", path); + + return file; +} + +/* + * Get a File for a particular segment of a SMgrRelation representing an undo + * log. + */ +static File undofile_get_segment_file(SMgrRelation reln, int segno) +{ + UndoFileState *state; + + + /* + * Create private state space on demand. + * + * XXX There should probably be a smgr 'open' or 'init' interface that + * would do this. smgr.c currently initializes reln->md_XXX stuff + * directly... + */ + state = (UndoFileState *) reln->private_data; + if (unlikely(state == NULL)) + { + state = MemoryContextAllocZero(UndoFileCxt, sizeof(UndoFileState)); + reln->private_data = state; + } + + /* If we have a file open already, check if we need to close it. */ + if (state->mru_file > 0 && state->mru_segno != segno) + { + /* These are not the blocks we're looking for. */ + FileClose(state->mru_file); + state->mru_file = 0; + } + + /* Check if we need to open a new file. */ + if (state->mru_file <= 0) + { + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, InRecovery); + if (InRecovery && state->mru_file <= 0) + { + /* + * If in recovery, we may be trying to access a file that will + * later be unlinked. Tolerate missing files, creating a new + * zero-filled file as required. + */ + UndoLogNewSegment(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno); + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, false); + Assert(state->mru_file > 0); + } + state->mru_segno = segno; + } + + return state->mru_file; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239bf..f02d9a39935 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -912,6 +912,13 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_READ, + WAIT_EVENT_UNDO_CHECKPOINT_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_SYNC, + WAIT_EVENT_UNDO_FILE_READ, + WAIT_EVENT_UNDO_FILE_WRITE, + WAIT_EVENT_UNDO_FILE_FLUSH, + WAIT_EVENT_UNDO_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3cce3906a0e..5b135565527 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -38,8 +38,9 @@ typedef enum BufferAccessStrategyType typedef enum { RBM_NORMAL, /* Normal read */ - RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will - * initialize. Also locks the page. */ + RBM_ZERO, /* Don't read from disk, caller will + * initialize. */ + RBM_ZERO_AND_LOCK, /* Like RBM_ZERO, but also locks the page. */ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ @@ -171,7 +172,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, - ReadBufferMode mode, BufferAccessStrategy strategy); + ReadBufferMode mode, BufferAccessStrategy strategy, + char relpersistence); +extern void ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer); @@ -228,6 +232,10 @@ extern void AtProcExit_LocalBuffers(void); extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); +/* in localbuf.c */ +extern void ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); + /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 558e4d8518b..002ae4c5e32 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -71,6 +71,9 @@ typedef struct SMgrRelationData int md_num_open_segs[MAX_FORKNUM + 1]; struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + /* For use by implementations. */ + void *private_data; + /* if unowned, list link in list of all unowned SMgrRelations */ struct SMgrRelationData *next_unowned_reln; } SMgrRelationData; @@ -105,6 +108,7 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); +extern void smgrrequestsync(RelFileNode rnode, ForkNumber forknum, int segno); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void smgrpreckpt(void); extern void smgrsync(void); @@ -133,14 +137,41 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); +extern void mdrequestsync(RelFileNode rnode, ForkNumber forknum, int segno); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdpreckpt(void); extern void mdsync(void); extern void mdpostckpt(void); +/* in undofile.c */ +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_requestsync(RelFileNode rnode, ForkNumber forknum, int segno); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void undofile_preckpt(void); +extern void undofile_sync(void); +extern void undofile_postckpt(void); + extern void SetForwardFsyncRequests(void); -extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, - BlockNumber segno); extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); extern void ForgetDatabaseFsyncRequests(Oid dbid); diff --git a/src/include/storage/undofile.h b/src/include/storage/undofile.h new file mode 100644 index 00000000000..7544be3522b --- /dev/null +++ b/src/include/storage/undofile.h @@ -0,0 +1,50 @@ +/* + * undofile.h + * + * PostgreSQL undo file manager. This module manages the files that back undo + * logs on the filesystem. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/undofile.h + */ + +#ifndef UNDOFILE_H +#define UNDOFILE_H + +#include "storage/smgr.h" + +/* Prototypes of functions exposed to SMgr. */ +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void undofile_pre_ckpt(void); +extern void undofile_sync(void); +extern void undofile_post_ckpt(void); + +/* Functions used by undolog.c. */ +extern void undofile_forgetsync(Oid logno, Oid tablespace, int segno); + +#endif -- 2.17.0