From db1edf775f550567e0008b3e89d1ad3eee0c276e Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Mon, 5 Nov 2018 00:16:33 -0800 Subject: [PATCH 1/4] Add undo log manager. Add a new subsystem to manage undo logs. Undo logs allow data to be appended efficiently, like logs. They also allow data to be discarded efficiently from the other end, like a queue. Thirdly, they allow efficient buffered random access, like a relation. Undo logs physically consist of a set of 1MB segment files under $PGDATA/base/undo (or per-tablespace equivalent) that are created, deleted or renamed as required, similarly to the way that WAL segments are managed. Meta-data about the set of undo logs is stored in shared memory, and written to per-checkpoint files under $PGDATA/pg_undo. This commit provides an API for allocating and discarding undo log storage space and managing the files in a crash-safe way. A later commit will provide support for accessing the data stored inside them. XXX Status: WIP. Some details around WAL are being reconsidered, as noted in comments. 
Author: Thomas Munro, with contributions from Dilip Kumar and input from Amit Kapila and Robert Haas Tested-By: Neha Sharma Discussion: https://postgr.es/m/CAEepm%3D2EqROYJ_xYz4v5kfr4b0qw_Lq_6Pe8RTEC8rx3upWsSQ%40mail.gmail.com --- src/backend/access/Makefile | 2 +- src/backend/access/rmgrdesc/Makefile | 2 +- src/backend/access/rmgrdesc/undologdesc.c | 88 + src/backend/access/transam/rmgr.c | 1 + src/backend/access/transam/xlog.c | 17 + src/backend/access/undo/Makefile | 17 + src/backend/access/undo/undolog.c | 2643 +++++++++++++++++++++++++++++ src/backend/catalog/system_views.sql | 4 + src/backend/commands/tablespace.c | 23 + src/backend/replication/logical/decode.c | 1 + src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/lmgr/lwlock.c | 2 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/init/postinit.c | 1 + src/backend/utils/misc/guc.c | 12 + src/bin/initdb/initdb.c | 2 + src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/rmgrlist.h | 1 + src/include/access/undolog.h | 405 +++++ src/include/access/undolog_xlog.h | 72 + src/include/catalog/pg_proc.dat | 7 + src/include/storage/lwlock.h | 2 + src/include/utils/guc.h | 2 + src/test/regress/expected/rules.out | 11 + 24 files changed, 3318 insertions(+), 2 deletions(-) create mode 100644 src/backend/access/rmgrdesc/undologdesc.c create mode 100644 src/backend/access/undo/Makefile create mode 100644 src/backend/access/undo/undolog.c create mode 100644 src/include/access/undolog.h create mode 100644 src/include/access/undolog_xlog.h diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index bd93a6a..7f7380c 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - tablesample transam + tablesample transam undo include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index 5514db1..91ad1ef 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -11,6 +11,6 @@ include $(top_builddir)/src/Makefile.global OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o genericdesc.o \ gindesc.o gistdesc.o hashdesc.o heapdesc.o logicalmsgdesc.o \ mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o seqdesc.o \ - smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o + smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o undologdesc.o xactdesc.o xlogdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c new file mode 100644 index 0000000..6cf32f4 --- /dev/null +++ b/src/backend/access/rmgrdesc/undologdesc.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * undologdesc.c + * rmgr descriptor routines for access/undo/undolog.c + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/undologdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/undolog.h" +#include "access/undolog_xlog.h" + +void +undolog_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_UNDOLOG_CREATE) + { + xl_undolog_create *xlrec = (xl_undolog_create *) rec; + + appendStringInfo(buf, "logno %u", xlrec->logno); + } + else if (info == 
XLOG_UNDOLOG_EXTEND) + { + xl_undolog_extend *xlrec = (xl_undolog_extend *) rec; + + appendStringInfo(buf, "logno %u end " UndoLogOffsetFormat, + xlrec->logno, xlrec->end); + } + else if (info == XLOG_UNDOLOG_ATTACH) + { + xl_undolog_attach *xlrec = (xl_undolog_attach *) rec; + + appendStringInfo(buf, "logno %u xid %u", xlrec->logno, xlrec->xid); + } + else if (info == XLOG_UNDOLOG_DISCARD) + { + xl_undolog_discard *xlrec = (xl_undolog_discard *) rec; + + appendStringInfo(buf, "logno %u discard " UndoLogOffsetFormat " end " + UndoLogOffsetFormat, + xlrec->logno, xlrec->discard, xlrec->end); + } + else if (info == XLOG_UNDOLOG_REWIND) + { + xl_undolog_rewind *xlrec = (xl_undolog_rewind *) rec; + + appendStringInfo(buf, "logno %u insert " UndoLogOffsetFormat " prevlen %d", + xlrec->logno, xlrec->insert, xlrec->prevlen); + } + +} + +const char * +undolog_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_UNDOLOG_CREATE: + id = "CREATE"; + break; + case XLOG_UNDOLOG_EXTEND: + id = "EXTEND"; + break; + case XLOG_UNDOLOG_ATTACH: + id = "ATTACH"; + break; + case XLOG_UNDOLOG_DISCARD: + id = "DISCARD"; + break; + case XLOG_UNDOLOG_REWIND: + id = "REWIND"; + break; + } + + return id; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 9368b56..8b05374 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -18,6 +18,7 @@ #include "access/multixact.h" #include "access/nbtxlog.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 246869b..dce4c01 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/twophase.h" +#include "access/undolog.h" 
#include "access/xact.h" #include "access/xlog_internal.h" #include "access/xloginsert.h" @@ -6881,6 +6882,9 @@ StartupXLOG(void) */ restoreTwoPhaseData(); + /* Recover undo log meta data corresponding to this checkpoint. */ + StartupUndoLogs(ControlFile->checkPointCopy.redo); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -7503,7 +7507,13 @@ StartupXLOG(void) * end-of-recovery steps fail. */ if (InRecovery) + { ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + ResetUndoLogs(UNDO_UNLOGGED); + } + + /* Always reset temporary undo logs. */ + ResetUndoLogs(UNDO_TEMP); /* * We don't need the latch anymore. It's not strictly necessary to disown @@ -9208,6 +9218,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); CheckPointBuffers(flags); /* performs all required fsyncs */ + CheckPointUndoLogs(checkPointRedo, ControlFile->checkPointCopy.redo); CheckPointReplicationOrigin(); /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); @@ -9914,6 +9925,9 @@ xlog_redo(XLogReaderState *record) XLogCtl->ckptXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); + /* Write an undo log metadata snapshot. */ + CheckPointUndoLogs(checkPoint.redo, ControlFile->checkPointCopy.redo); + /* * We should've already switched to the new TLI before replaying this * record. @@ -9973,6 +9987,9 @@ xlog_redo(XLogReaderState *record) XLogCtl->ckptXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); + /* Write an undo log metadata snapshot. 
*/ + CheckPointUndoLogs(checkPoint.redo, ControlFile->checkPointCopy.redo); + /* TLI should not change in an on-line checkpoint */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) ereport(PANIC, diff --git a/src/backend/access/undo/Makefile b/src/backend/access/undo/Makefile new file mode 100644 index 0000000..219c696 --- /dev/null +++ b/src/backend/access/undo/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/undo +# +# IDENTIFICATION +# src/backend/access/undo/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/undo +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = undolog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/undo/undolog.c b/src/backend/access/undo/undolog.c new file mode 100644 index 0000000..48dd662 --- /dev/null +++ b/src/backend/access/undo/undolog.c @@ -0,0 +1,2643 @@ +/*------------------------------------------------------------------------- + * + * undolog.c + * management of undo logs + * + * PostgreSQL undo log manager. This module is responsible for managing the + * lifecycle of undo logs and their segment files, associating undo logs with + * backends, and allocating space within undo logs. + * + * For the code that reads and writes blocks of data, see undofile.c. 
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/undo/undolog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "access/undolog.h" +#include "access/undolog_xlog.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlogreader.h" +#include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "storage/standby.h" +#include "storage/undofile.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/varlena.h" + +#include +#include + +/* + * During recovery we maintain a mapping of transaction ID to undo logs + * numbers. We do this with a two-level array, so that we use memory only for + * chunks of the array that overlap with the range of active xids. + */ +#define UndoLogXidLowBits 16 + +/* + * Number of high bits. + */ +#define UndoLogXidHighBits \ + (sizeof(TransactionId) * CHAR_BIT - UndoLogXidLowBits) + +/* Extract the upper bits of an xid, for undo log mapping purposes. */ +#define UndoLogGetXidHigh(xid) ((xid) >> UndoLogXidLowBits) + +/* Extract the lower bits of an xid, for undo log mapping purposes. */ +#define UndoLogGetXidLow(xid) ((xid) & ((1 << UndoLogXidLowBits) - 1)) + +/* + * Main control structure for undo log management in shared memory. + * UndoLogControl objects are arranged in a fixed-size array, at a position + * determined by the undo log number. 
+ */ +typedef struct UndoLogSharedData +{ + UndoLogNumber free_lists[UndoPersistenceLevels]; + UndoLogNumber low_logno; /* the lowest logno */ + UndoLogNumber next_logno; /* one past the highest logno */ + UndoLogNumber array_size; /* how many UndoLogControl objects do we have? */ + UndoLogControl logs[FLEXIBLE_ARRAY_MEMBER]; +} UndoLogSharedData; + +/* + * Per-backend state for the undo log module. + * Backend-local pointers to undo subsystem state in shared memory. + */ +typedef struct UndoLogSession +{ + UndoLogSharedData *shared; + + /* + * The control object for the undo logs that this session is currently + * attached to at each persistence level. This is where it will write new + * undo data. + */ + UndoLogControl *logs[UndoPersistenceLevels]; + + /* + * If the undo_tablespaces GUC changes we'll remember to examine it and + * attach to a new undo log using this flag. + */ + bool need_to_choose_tablespace; + + /* + * During recovery, the startup process maintains a mapping of xid to undo + * log number, instead of using 'log' above. This is not used in regular + * backends and can be in backend-private memory so long as recovery is + * single-process. This map references UNDO_PERMANENT logs only, since + * temporary and unlogged relations don't have WAL to replay. + */ + UndoLogNumber **xid_map; + + /* + * The slot for the oldest xids still running. We advance this during + * checkpoints to free up chunks of the map. + */ + uint16 xid_map_oldest_chunk; + + /* Current dbid. Used during recovery. 
*/ + Oid dbid; +} UndoLogSession; + +UndoLogSession MyUndoLogState; + +undologtable_hash *undologtable_cache; + +/* GUC variables */ +char *undo_tablespaces = NULL; + +static UndoLogControl *get_undo_log(UndoLogNumber logno, bool locked); +static UndoLogControl *allocate_undo_log(void); +static void free_undo_log(UndoLogControl *log); +static void attach_undo_log(UndoPersistence level, Oid tablespace); +static void detach_current_undo_log(UndoPersistence level, bool full); +static void extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end); +static void undo_log_before_exit(int code, Datum value); +static void forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, + bool drop_tail); +static bool choose_undo_tablespace(bool force_detach, Oid *oid); +static void undolog_xid_map_gc(void); + +PG_FUNCTION_INFO_V1(pg_stat_get_undo_logs); + +/* + * How many undo logs can be active at a time? This creates a theoretical + * maximum transaction size, but if we set it to a multiple of the maximum + * number of backends it will be a very high limit. Alternative designs involving + * demand paging or dynamic shared memory could remove this limit but + * introduce other problems. + */ +static inline size_t +UndoLogNumSlots(void) +{ + return MaxBackends * 4; +} + +/* + * Return the amount of traditional shmem required for undo log management. + * Extra shared memory will be managed using DSM segments. + */ +Size +UndoLogShmemSize(void) +{ + return sizeof(UndoLogSharedData) + + UndoLogNumSlots() * sizeof(UndoLogControl); +} + +/* + * Initialize the undo log subsystem. Called in each backend. + */ +void +UndoLogShmemInit(void) +{ + bool found; + + MyUndoLogState.shared = (UndoLogSharedData *) + ShmemInitStruct("UndoLogShared", UndoLogShmemSize(), &found); + + /* The postmaster initialized the shared memory state.
*/ + if (!IsUnderPostmaster) + { + UndoLogSharedData *shared = MyUndoLogState.shared; + int i; + + Assert(!found); + + /* + * We start with no active undo logs. StartUpUndoLogs() will recreate + * the undo logs that were known at the last checkpoint. + */ + memset(shared, 0, sizeof(*shared)); + shared->array_size = UndoLogNumSlots(); + for (i = 0; i < UndoPersistenceLevels; ++i) + shared->free_lists[i] = InvalidUndoLogNumber; + for (i = 0; i < shared->array_size; ++i) + { + memset(&shared->logs[i], 0, sizeof(shared->logs[i])); + shared->logs[i].logno = InvalidUndoLogNumber; + LWLockInitialize(&shared->logs[i].mutex, + LWTRANCHE_UNDOLOG); + LWLockInitialize(&shared->logs[i].discard_lock, + LWTRANCHE_UNDODISCARD); + } + } + else + Assert(found); + + /* All backends prepare their per-backend lookup table. */ + undologtable_cache = undologtable_create(TopMemoryContext, + UndoLogNumSlots(), + NULL); +} + +void +UndoLogInit(void) +{ + before_shmem_exit(undo_log_before_exit, 0); +} + +/* + * Figure out which directory holds an undo log based on tablespace. + */ +static void +UndoLogDirectory(Oid tablespace, char *dir) +{ + if (tablespace == DEFAULTTABLESPACE_OID || + tablespace == InvalidOid) + snprintf(dir, MAXPGPATH, "base/undo"); + else + snprintf(dir, MAXPGPATH, "pg_tblspc/%u/%s/undo", + tablespace, TABLESPACE_VERSION_DIRECTORY); +} + +/* + * Compute the pathname to use for an undo log segment file. + */ +void +UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, char *path) +{ + char dir[MAXPGPATH]; + + /* Figure out which directory holds the segment, based on tablespace. */ + UndoLogDirectory(tablespace, dir); + + /* + * Build the path from log number and offset. The pathname is the + * UndoRecPtr of the first byte in the segment in hexadecimal, with a + * period inserted between the components. + */ + snprintf(path, MAXPGPATH, "%s/%06X.%010zX", dir, logno, + segno * UndoLogSegmentSize); +} + +/* + * Iterate through the set of currently active logs. 
Pass in NULL to get the + * first undo log. NULL indicates the end of the set of logs. The caller + * must lock the returned log before accessing its members, and must skip if + * logno is not valid. + */ +UndoLogControl * +UndoLogNext(UndoLogControl *log) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + + LWLockAcquire(UndoLogLock, LW_SHARED); + for (;;) + { + /* Advance to the next log. */ + if (log == NULL) + { + /* Start at the beginning. */ + log = &shared->logs[0]; + } + else if (++log == &shared->logs[shared->array_size]) + { + /* Past the end. */ + log = NULL; + break; + } + /* Have we found a slot with a valid log? */ + if (log->logno != InvalidUndoLogNumber) + break; + } + LWLockRelease(UndoLogLock); + + /* XXX: erm, which lock should the caller hold!? */ + return log; +} + +/* + * Check if an undo log position has been discarded. 'point' must be an undo + * log pointer that was allocated at some point in the past, otherwise the + * result is undefined. + */ +bool +UndoLogIsDiscarded(UndoRecPtr point) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(point); + UndoLogControl *log; + bool result; + + log = get_undo_log(logno, false); + + /* + * If we couldn't find the undo log number, then it must be entirely + * discarded. + */ + if (log == NULL) + return true; + + LWLockAcquire(&log->mutex, LW_SHARED); + if (unlikely(logno != log->logno)) + { + /* + * The undo log has been entirely discarded since we looked it up, and + * the UndoLogControl slot is now unused or being used for some other + * undo log. That means that any pointer within it must be discarded. + */ + result = true; + } + else + { + /* Check if this point is before the discard pointer. */ + result = UndoRecPtrGetOffset(point) < log->meta.discard; + } + LWLockRelease(&log->mutex); + + return result; +} + +/* + * Store latest transaction's start undo record point in undo meta data. It + * will fetched by the backend when it's reusing the undo log and preparing + * its first undo. 
+ */ +void +UndoLogSetLastXactStartPoint(UndoRecPtr point) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(point); + UndoLogControl *log = get_undo_log(logno, false); + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + /* TODO: review */ + log->meta.last_xact_start = UndoRecPtrGetOffset(point); + LWLockRelease(&log->mutex); +} + +/* + * Fetch the previous transaction's start undo record point. + */ +UndoRecPtr +UndoLogGetLastXactStartPoint(UndoLogNumber logno) +{ + UndoLogControl *log = get_undo_log(logno, false); + uint64 last_xact_start = 0; + + if (unlikely(log == NULL)) + return InvalidUndoRecPtr; + + LWLockAcquire(&log->mutex, LW_SHARED); + /* TODO: review */ + last_xact_start = log->meta.last_xact_start; + LWLockRelease(&log->mutex); + + if (last_xact_start == 0) + return InvalidUndoRecPtr; + + return MakeUndoRecPtr(logno, last_xact_start); +} + +/* + * Store the last undo record's length in undo meta-data so that it can be + * persistent across restart. + */ +void +UndoLogSetPrevLen(UndoLogNumber logno, uint16 prevlen) +{ + UndoLogControl *log = get_undo_log(logno, false); + + Assert(log != NULL); + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + /* TODO review */ + log->meta.prevlen = prevlen; + LWLockRelease(&log->mutex); +} + +/* + * Get the last undo record's length. + */ +uint16 +UndoLogGetPrevLen(UndoLogNumber logno) +{ + UndoLogControl *log = get_undo_log(logno, false); + uint16 prevlen; + + Assert(log != NULL); + + LWLockAcquire(&log->mutex, LW_SHARED); + /* TODO review */ + prevlen = log->meta.prevlen; + LWLockRelease(&log->mutex); + + return prevlen; +} + +/* + * Is this record is the first record for any transaction. 
+ */ +bool +IsTransactionFirstRec(TransactionId xid) +{ + uint16 high_bits = UndoLogGetXidHigh(xid); + uint16 low_bits = UndoLogGetXidLow(xid); + UndoLogNumber logno; + UndoLogControl *log; + + Assert(InRecovery); + + if (MyUndoLogState.xid_map == NULL) + elog(ERROR, "xid to undo log number map not initialized"); + if (MyUndoLogState.xid_map[high_bits] == NULL) + elog(ERROR, "cannot find undo log number for xid %u", xid); + + logno = MyUndoLogState.xid_map[high_bits][low_bits]; + log = get_undo_log(logno, false); + if (log == NULL) + elog(ERROR, "cannot find undo log number %d for xid %u", logno, xid); + + /* TODO review */ + return log->meta.is_first_rec; +} + +/* + * Detach from the undo log we are currently attached to, returning it to the + * appropriate free list if it still has space. + */ +static void +detach_current_undo_log(UndoPersistence persistence, bool full) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + UndoLogControl *log = MyUndoLogState.logs[persistence]; + + Assert(log != NULL); + + MyUndoLogState.logs[persistence] = NULL; + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->pid = InvalidPid; + log->xid = InvalidTransactionId; + if (full) + log->meta.status = UNDO_LOG_STATUS_FULL; + LWLockRelease(&log->mutex); + + /* Push back onto the appropriate free list, unless it's full. */ + if (!full) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + log->next_free = shared->free_lists[persistence]; + shared->free_lists[persistence] = log->logno; + LWLockRelease(UndoLogLock); + } +} + +/* + * Exit handler, detaching from all undo logs. + */ +static void +undo_log_before_exit(int code, Datum arg) +{ + int i; + + for (i = 0; i < UndoPersistenceLevels; ++i) + { + if (MyUndoLogState.logs[i] != NULL) + detach_current_undo_log(i, false); + } +} + +/* + * Create a new empty segment file on disk for the byte starting at 'end'. 
+ */ +static void +allocate_empty_undo_segment(UndoLogNumber logno, Oid tablespace, + UndoLogOffset end) +{ + struct stat stat_buffer; + off_t size; + char path[MAXPGPATH]; + void *zeroes; + size_t nzeroes = 8192; + int fd; + + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, tablespace, path); + + /* + * Create and fully allocate a new file. If we crashed and recovered + * then the file might already exist, so use flags that tolerate that. + * It's also possible that it exists but is too short, in which case + * we'll write the rest. We don't really care what's in the file, we + * just want to make sure that the filesystem has allocated physical + * blocks for it, so that non-COW filesystems will report ENOSPC now + * rather than later when the space is needed and we'll avoid creating + * files with holes. + */ + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0 && tablespace != 0) + { + char undo_path[MAXPGPATH]; + + /* Try creating the undo directory for this tablespace. */ + UndoLogDirectory(tablespace, undo_path); + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + { + char *parentdir; + + if (errno != ENOENT || !InRecovery) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + + /* + * In recovery, it's possible that the tablespace directory + * doesn't exist because a later WAL record removed the whole + * tablespace. In that case we create a regular directory to + * stand in for it. This is similar to the logic in + * TablespaceCreateDbspace(). + */ + + /* create two parents up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? 
*/ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + /* create one parent up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? */ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + } + + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + } + if (fd < 0) + elog(ERROR, "could not create new file \"%s\": %m", path); + if (fstat(fd, &stat_buffer) < 0) + elog(ERROR, "could not stat \"%s\": %m", path); + size = stat_buffer.st_size; + + /* A buffer full of zeroes we'll use to fill up new segment files. */ + zeroes = palloc0(nzeroes); + + while (size < UndoLogSegmentSize) + { + ssize_t written; + + written = write(fd, zeroes, Min(nzeroes, UndoLogSegmentSize - size)); + if (written < 0) + elog(ERROR, "cannot initialize undo log segment file \"%s\": %m", + path); + size += written; + } + + /* Flush the contents of the file to disk. */ + if (pg_fsync(fd) != 0) + elog(ERROR, "cannot fsync file \"%s\": %m", path); + CloseTransientFile(fd); + + pfree(zeroes); + + elog(LOG, "created undo segment \"%s\"", path); /* XXX: remove me */ +} + +/* + * Create a new undo segment, when it is unexpectedly not present. + */ +void +UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno) +{ + Assert(InRecovery); + allocate_empty_undo_segment(logno, tablespace, segno * UndoLogSegmentSize); +} + +/* + * Create and zero-fill a new segment for a given undo log number. 
+ */ +static void +extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end) +{ + UndoLogControl *log; + char dir[MAXPGPATH]; + size_t end; + + log = get_undo_log(logno, false); + + /* TODO review interlocking */ + + Assert(log != NULL); + Assert(log->meta.end % UndoLogSegmentSize == 0); + Assert(new_end % UndoLogSegmentSize == 0); + Assert(MyUndoLogState.logs[log->meta.persistence] == log || InRecovery); + + /* + * Create all the segments needed to increase 'end' to the requested + * size. This is quite expensive, so we will try to avoid it completely + * by renaming files into place in UndoLogDiscard instead. + */ + end = log->meta.end; + while (end < new_end) + { + allocate_empty_undo_segment(logno, log->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + Assert(end == new_end); + + /* + * Flush the parent dir so that the directory metadata survives a crash + * after this point. + */ + UndoLogDirectory(log->meta.tablespace, dir); + fsync_fname(dir, true); + + /* + * If we're not in recovery, we need to WAL-log the creation of the new + * file(s). We do that after the above filesystem modifications, in + * violation of the data-before-WAL rule as exempted by + * src/backend/access/transam/README. This means that it's possible for + * us to crash having made some or all of the filesystem changes but + * before WAL logging, but in that case we'll eventually try to create the + * same segment(s) again, which is tolerated. + */ + if (!InRecovery) + { + xl_undolog_extend xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.end = end; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_EXTEND); + XLogFlush(ptr); + } + + /* + * We didn't need to acquire the mutex to read 'end' above because only + * we write to it. But we need the mutex to update it, because the + * checkpointer might read it concurrently. 
+ * + * XXX It's possible for meta.end to be higher already during + * recovery, because of the timing of a checkpoint; in that case we did + * nothing above and we shouldn't update shmem here. That interaction + * needs more analysis. + */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + if (log->meta.end < end) + log->meta.end = end; + LWLockRelease(&log->mutex); +} + +/* + * Get an insertion point that is guaranteed to be backed by enough space to + * hold 'size' bytes of data. To actually write into the undo log, client + * code should call this first and then use bufmgr routines to access buffers + * and provide WAL logs and redo handlers. In other words, while this module + * looks after making sure the undo log has sufficient space and the undo meta + * data is crash safe, the *contents* of the undo log and (indirectly) the + * insertion point are the responsibility of client code. + * + * Return an undo log insertion point that can be converted to a buffer tag + * and an insertion point within a buffer page. + * + * XXX For now an xl_undolog_meta object is filled in, in case it turns out + * to be necessary to write it into the WAL record (like FPI, this must be + * logged once for each undo log after each checkpoint). I think this should + * be moved out of this interface and done differently -- to review. + */ +UndoRecPtr +UndoLogAllocate(size_t size, UndoPersistence persistence) +{ + UndoLogControl *log = MyUndoLogState.logs[persistence]; + UndoLogOffset new_insert; + UndoLogNumber prevlogno = InvalidUndoLogNumber; + TransactionId logxid; + + /* + * We may need to attach to an undo log, either because this is the first + * time this backend as needed to write to an undo log at all or because + * the undo_tablespaces GUC was changed. When doing that, we'll need + * interlocking against tablespaces being concurrently dropped. + */ + + retry: + /* See if we need to check the undo_tablespaces GUC. 
*/ + if (unlikely(MyUndoLogState.need_to_choose_tablespace || log == NULL)) + { + Oid tablespace; + bool need_to_unlock; + + need_to_unlock = + choose_undo_tablespace(MyUndoLogState.need_to_choose_tablespace, + &tablespace); + attach_undo_log(persistence, tablespace); + if (need_to_unlock) + LWLockRelease(TablespaceCreateLock); + log = MyUndoLogState.logs[persistence]; + log->meta.prevlogno = prevlogno; + MyUndoLogState.need_to_choose_tablespace = false; + } + + /* + * If this is the first time we've allocated undo log space in this + * transaction, we'll record the xid->undo log association so that it can + * be replayed correctly. Before that, we set the first record flag to + * false. + */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->meta.is_first_rec = false; + logxid = log->xid; + + if (logxid != GetTopTransactionId()) + { + xl_undolog_attach xlrec; + + /* + * While we have the lock, check if we have been forcibly detached by + * DROP TABLESPACE. That can only happen between transactions (see + * DropUndoLogsInsTablespace()). + */ + if (log->pid == InvalidPid) + { + LWLockRelease(&log->mutex); + log = NULL; + goto retry; + } + log->xid = GetTopTransactionId(); + log->meta.is_first_rec = true; + LWLockRelease(&log->mutex); + + /* Skip the attach record for unlogged and temporary tables. */ + if (persistence == UNDO_PERMANENT) + { + xlrec.xid = GetTopTransactionId(); + xlrec.logno = log->logno; + xlrec.dbid = MyDatabaseId; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_ATTACH); + } + } + else + { + LWLockRelease(&log->mutex); + } + + /* + * 'size' is expressed in usable non-header bytes. Figure out how far we + * have to move insert to create space for 'size' usable bytes, stepping + * over any intervening headers. 
+ */ + Assert(log->meta.insert % BLCKSZ >= UndoLogBlockHeaderSize); + new_insert = UndoLogOffsetPlusUsableBytes(log->meta.insert, size); + Assert(new_insert % BLCKSZ >= UndoLogBlockHeaderSize); + + /* + * We don't need to acquire log->mutex to read log->meta.insert and + * log->meta.end, because this backend is the only one that can + * modify them. + */ + if (unlikely(new_insert > log->meta.end)) + { + if (new_insert > UndoLogMaxSize) + { + /* This undo log is entirely full. Get a new one. */ + if (logxid == GetTopTransactionId()) + { + /* + * If the same transaction is split over two undo logs then + * store the previous log number in new log. See detailed + * comments in undorecord.c file header. + */ + prevlogno = log->logno; + } + elog(LOG, "undo log %u is full, switching to a new one", log->logno); + log = NULL; + detach_current_undo_log(persistence, true); + goto retry; + } + /* + * Extend the end of this undo log to cover new_insert (in other words + * round up to the segment size). + */ + extend_undo_log(log->logno, + new_insert + UndoLogSegmentSize - + new_insert % UndoLogSegmentSize); + Assert(new_insert <= log->meta.end); + } + + return MakeUndoRecPtr(log->logno, log->meta.insert); +} + +/* + * In recovery, we expect the xid to map to a known log which already has + * enough space in it. + */ +UndoRecPtr +UndoLogAllocateInRecovery(TransactionId xid, size_t size, + UndoPersistence level) +{ + uint16 high_bits = UndoLogGetXidHigh(xid); + uint16 low_bits = UndoLogGetXidLow(xid); + UndoLogNumber logno; + UndoLogControl *log; + + /* + * The sequence of calls to UndoLogAllocateRecovery() during REDO + * (recovery) must match the sequence of calls to UndoLogAllocate during + * DO, for any given session. The XXX_redo code for any UNDO-generating + * operation must use UndoLogAllocateRecovery() rather than + * UndoLogAllocate(), because it must supply the extra 'xid' argument so + * that we can find out which undo log number to use. 
During DO, that's
+ * tracked per-backend, but during REDO the original backends/sessions are
+ * lost and we have only the Xids.
+ */
+	Assert(InRecovery);
+
+	/*
+	 * Look up the undo log number for this xid.  The mapping must already
+	 * have been created by an XLOG_UNDOLOG_ATTACH record emitted during the
+	 * first call to UndoLogAllocate for this xid after the most recent
+	 * checkpoint.
+	 */
+	if (MyUndoLogState.xid_map == NULL)
+		elog(ERROR, "xid to undo log number map not initialized");
+	if (MyUndoLogState.xid_map[high_bits] == NULL)
+		elog(ERROR, "cannot find undo log number for xid %u", xid);
+	logno = MyUndoLogState.xid_map[high_bits][low_bits];
+	if (logno == InvalidUndoLogNumber)
+		elog(ERROR, "cannot find undo log number for xid %u", xid);
+
+	/*
+	 * This log must already have been created by an XLOG_UNDOLOG_CREATE
+	 * record emitted by UndoLogAllocate().
+	 */
+	log = get_undo_log(logno, false);
+	if (log == NULL)
+		elog(ERROR, "cannot find undo log number %d for xid %u", logno, xid);
+
+	/*
+	 * This log must already have been extended to cover the requested size by
+	 * XLOG_UNDOLOG_EXTEND records emitted by UndoLogAllocate(), or by
+	 * XLOG_UNDOLOG_DISCARD records recycling segments.
+	 */
+	if (log->meta.end < UndoLogOffsetPlusUsableBytes(log->meta.insert, size))
+		elog(ERROR,
+			 "unexpectedly couldn't allocate %zu bytes in undo log number %d",
+			 size, logno);
+
+	/*
+	 * By this time we have allocated an undo log for this transaction, so any
+	 * record inserted after this point is not the first undo record of the
+	 * transaction.
+	 */
+	log->meta.is_first_rec = false;
+
+	return MakeUndoRecPtr(logno, log->meta.insert);
+}
+
+/*
+ * Advance the insertion pointer by 'size' usable (non-header) bytes.
+ *
+ * 'insertion_point' must be the value returned by the preceding call to
+ * UndoLogAllocate() or UndoLogAllocateInRecovery(); we assert below that it
+ * still matches the log's current insert position.
+ */
+void
+UndoLogAdvance(UndoRecPtr insertion_point, size_t size, UndoPersistence persistence)
+{
+	UndoLogControl *log = NULL;
+	UndoLogNumber logno = UndoRecPtrGetLogNo(insertion_point) ;
+
+	/*
+	 * During recovery, MyUndoLogState is uninitialized.  Hence, we need to
+	 * work
+	 * more.
+ */ + log = (InRecovery) ? get_undo_log(logno, false) + : MyUndoLogState.logs[persistence]; + + Assert(log != NULL); + Assert(InRecovery || logno == log->logno); + Assert(UndoRecPtrGetOffset(insertion_point) == log->meta.insert); + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->meta.insert = UndoLogOffsetPlusUsableBytes(log->meta.insert, size); + LWLockRelease(&log->mutex); +} + +/* + * Advance the discard pointer in one undo log, discarding all undo data + * relating to one or more whole transactions. The passed in undo pointer is + * the address of the oldest data that the called would like to keep, and the + * affected undo log is implied by this pointer, ie + * UndoRecPtrGetLogNo(discard_pointer). + * + * The caller asserts that there will be no attempts to access the undo log + * region being discarded after this moment. This operation will cause the + * relevant buffers to be dropped immediately, without writing any data out to + * disk. Any attempt to read the buffers (except a partial buffer at the end + * of this range which will remain) may result in IO errors, because the + * underlying segment file may have been physically removed. + * + * Only one backend should call this for a given undo log concurrently, or + * data structures will become corrupted. It is expected that the caller will + * be an undo worker; only one undo worker should be working on a given undo + * log at a time. 
+ */ +void +UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(discard_point); + UndoLogOffset discard = UndoRecPtrGetOffset(discard_point); + UndoLogOffset old_discard; + UndoLogOffset end; + UndoLogControl *log; + int segno; + int new_segno; + bool need_to_flush_wal = false; + bool entirely_discarded = false; + + log = get_undo_log(logno, false); + if (unlikely(log == NULL)) + elog(ERROR, + "cannot advance discard pointer for undo log %d because it is already entirely discarded", + logno); + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + if (unlikely(log->logno != logno)) + elog(ERROR, + "cannot advance discard pointer for undo log %d because it is entirely discarded", + logno); + if (discard > log->meta.insert) + elog(ERROR, "cannot move discard point past insert point"); + old_discard = log->meta.discard; + if (discard < old_discard) + elog(ERROR, "cannot move discard pointer backwards"); + end = log->meta.end; + /* Are we discarding the last remaining data in a log marked as full? */ + if (log->meta.status == UNDO_LOG_STATUS_FULL && + discard == log->meta.insert) + { + /* + * Adjust the discard and insert pointers so that the final segment is + * deleted from disk, and remember not to recycle it. + */ + entirely_discarded = true; + log->meta.insert = log->meta.end; + discard = log->meta.end; + } + LWLockRelease(&log->mutex); + + /* + * Drop all buffers holding this undo data out of the buffer pool (except + * the last one, if the new location is in the middle of it somewhere), so + * that the contained data doesn't ever touch the disk. The caller + * promises that this data will not be needed again. We have to drop the + * buffers from the buffer pool before removing files, otherwise a + * concurrent session might try to write the block to evict the buffer. 
+ */ + forget_undo_buffers(logno, old_discard, discard, entirely_discarded); + + /* + * Check if we crossed a segment boundary and need to do some synchronous + * filesystem operations. + */ + segno = old_discard / UndoLogSegmentSize; + new_segno = discard / UndoLogSegmentSize; + if (segno < new_segno) + { + int recycle; + UndoLogOffset pointer; + + /* + * We always WAL-log discards, but we only need to flush the WAL if we + * have performed a filesystem operation. + */ + need_to_flush_wal = true; + + /* + * XXX When we rename or unlink a file, it's possible that some + * backend still has it open because it has recently read a page from + * it. smgr/undofile.c in any such backend will eventually close it, + * because it considers that fd to belong to the file with the name + * that we're unlinking or renaming and it doesn't like to keep more + * than one open at a time. No backend should ever try to read from + * such a file descriptor; that is what it means when we say that the + * caller of UndoLogDiscard() asserts that there will be no attempts + * to access the discarded range of undo log. In the case of a + * rename, if a backend were to attempt to read undo data in the range + * being discarded, it would read entirely the wrong data. + */ + + /* + * How many segments should we recycle (= rename from tail position to + * head position)? For now it's always 1 unless there is already a + * spare one, but we could have an adaptive algorithm that recycles + * multiple segments at a time and pays just one fsync(). + */ + LWLockAcquire(&log->mutex, LW_SHARED); + if ((log->meta.end - log->meta.insert) < UndoLogSegmentSize && + log->meta.status == UNDO_LOG_STATUS_ACTIVE) + recycle = 1; + else + recycle = 0; + LWLockRelease(&log->mutex); + + /* Rewind to the start of the segment. 
*/ + pointer = segno * UndoLogSegmentSize; + + while (pointer < new_segno * UndoLogSegmentSize) + { + char discard_path[MAXPGPATH]; + + /* + * Before removing the file, make sure that undofile_sync knows + * that it might be missing. + */ + undofile_forgetsync(log->logno, + log->meta.tablespace, + pointer / UndoLogSegmentSize); + + UndoLogSegmentPath(logno, pointer / UndoLogSegmentSize, + log->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? */ + if (recycle > 0) + { + char recycle_path[MAXPGPATH]; + + /* + * End points one byte past the end of the current undo space, + * ie to the first byte of the segment file we want to create. + */ + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, + log->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(LOG, "recycled undo segment \"%s\" -> \"%s\"", discard_path, recycle_path); /* XXX: remove me */ + end += UndoLogSegmentSize; + --recycle; + } + else + { + elog(ERROR, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(LOG, "unlinked undo segment \"%s\"", discard_path); /* XXX: remove me */ + else + elog(ERROR, "could not unlink \"%s\": %m", discard_path); + } + pointer += UndoLogSegmentSize; + } + } + + /* WAL log the discard. */ + { + xl_undolog_discard xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.discard = discard; + xlrec.end = end; + xlrec.latestxid = xid; + xlrec.entirely_discarded = entirely_discarded; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_DISCARD); + + if (need_to_flush_wal) + XLogFlush(ptr); + } + + /* Update shmem to show the new discard and end pointers. */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->meta.discard = discard; + log->meta.end = end; + LWLockRelease(&log->mutex); + + /* If we discarded everything, the slot can be given up. 
 */
+	if (entirely_discarded)
+		free_undo_log(log);
+}
+
+/*
+ * Return an UndoRecPtr to the oldest valid data in an undo log, or
+ * InvalidUndoRecPtr if it is empty (discard == insert).  Also reports via
+ * '*full' whether the log has been marked UNDO_LOG_STATUS_FULL.
+ */
+UndoRecPtr
+UndoLogGetFirstValidRecord(UndoLogControl *log, bool *full)
+{
+	UndoRecPtr result;
+
+	LWLockAcquire(&log->mutex, LW_SHARED);
+	if (log->meta.discard == log->meta.insert)
+		result = InvalidUndoRecPtr;
+	else
+		result = MakeUndoRecPtr(log->logno, log->meta.discard);
+	*full = log->meta.status == UNDO_LOG_STATUS_FULL;
+	LWLockRelease(&log->mutex);
+
+	return result;
+}
+
+/*
+ * Return the next insert location for the given undo log.  This also
+ * validates the input xid: if the latest insert point does not belong to the
+ * same transaction id, InvalidUndoRecPtr is returned instead.
+ *
+ * NOTE(review): get_undo_log() returns NULL for an entirely discarded log,
+ * which would crash below -- confirm callers only pass live lognos.
+ */
+UndoRecPtr
+UndoLogGetNextInsertPtr(UndoLogNumber logno, TransactionId xid)
+{
+	UndoLogControl *log = get_undo_log(logno, false);
+	TransactionId logxid;
+	UndoRecPtr insert;
+
+	LWLockAcquire(&log->mutex, LW_SHARED);
+	insert = log->meta.insert;
+	logxid = log->xid;
+	LWLockRelease(&log->mutex);
+
+	/* If some other transaction wrote to this log last, refuse. */
+	if (TransactionIdIsValid(logxid) && !TransactionIdEquals(logxid, xid))
+		return InvalidUndoRecPtr;
+
+	return MakeUndoRecPtr(logno, insert);
+}
+
+/*
+ * Get the address of the most recently inserted record.
+ */
+UndoRecPtr
+UndoLogGetLastRecordPtr(UndoLogNumber logno, TransactionId xid)
+{
+	UndoLogControl *log = get_undo_log(logno, false);
+	TransactionId logxid;
+	UndoRecPtr insert;
+	uint16		prevlen;
+
+	/*
+	 * NOTE(review): get_undo_log() returns NULL for an entirely discarded
+	 * log, which would crash below -- confirm callers only pass live lognos.
+	 */
+	LWLockAcquire(&log->mutex, LW_SHARED);
+	insert = log->meta.insert;
+	logxid = log->xid;
+	prevlen = log->meta.prevlen;
+	LWLockRelease(&log->mutex);
+
+	/*
+	 * If both xids are valid but don't match, the last record in this log
+	 * belongs to some other transaction.
+	 */
+	if (TransactionIdIsValid(logxid) &&
+		TransactionIdIsValid(xid) &&
+		!TransactionIdEquals(logxid, xid))
+		return InvalidUndoRecPtr;
+
+	/* A zero previous-record length means nothing has been inserted yet. */
+	if (prevlen == 0)
+		return InvalidUndoRecPtr;
+
+	return MakeUndoRecPtr(logno, insert - prevlen);
+}
+
+/*
+ * Rewind the undo log's insert position, and also set the prevlen in the
+ * meta data.
+ */
+void
+UndoLogRewind(UndoRecPtr insert_urp, uint16 prevlen)
+{
+	UndoLogNumber logno = UndoRecPtrGetLogNo(insert_urp);
+	UndoLogControl *log = get_undo_log(logno, false);
+	UndoLogOffset insert = UndoRecPtrGetOffset(insert_urp);
+
+	LWLockAcquire(&log->mutex, LW_EXCLUSIVE);
+	log->meta.insert = insert;
+	log->meta.prevlen = prevlen;
+
+	/*
+	 * Force a WAL attach record to be written on the next undo allocation,
+	 * so that during recovery the undo insert location is consistent with
+	 * normal allocation.
+	 */
+	log->need_attach_wal_record = true;
+	LWLockRelease(&log->mutex);
+
+	/* WAL log the rewind. */
+	{
+		xl_undolog_rewind xlrec;
+
+		xlrec.logno = logno;
+		xlrec.insert = insert;
+		xlrec.prevlen = prevlen;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+		XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_REWIND);
+	}
+}
+
+/*
+ * Delete unreachable files under pg_undo.  Any files corresponding to LSN
+ * positions before the previous checkpoint are no longer needed.
+ */ +static void +CleanUpUndoCheckPointFiles(XLogRecPtr checkPointRedo) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char oldest_path[MAXPGPATH]; + + /* + * If a base backup is in progress, we can't delete any checkpoint + * snapshot files because one of them corresponds to the backup label but + * there could be any number of checkpoints during the backup. + */ + if (BackupInProgress()) + return; + + /* Otherwise keep only those >= the previous checkpoint's redo point. */ + snprintf(oldest_path, MAXPGPATH, "%016" INT64_MODIFIER "X", + checkPointRedo); + dir = AllocateDir("pg_undo"); + while ((de = ReadDir(dir, "pg_undo")) != NULL) + { + /* + * Assume that fixed width uppercase hex strings sort the same way as + * the values they represent, so we can use strcmp to identify undo + * log snapshot files corresponding to checkpoints that we don't need + * anymore. This assumption holds for ASCII. + */ + if (!(strlen(de->d_name) == UNDO_CHECKPOINT_FILENAME_LENGTH)) + continue; + + if (UndoCheckPointFilenamePrecedes(de->d_name, oldest_path)) + { + snprintf(path, MAXPGPATH, "pg_undo/%s", de->d_name); + if (unlink(path) != 0) + elog(ERROR, "could not unlink file \"%s\": %m", path); + } + } + FreeDir(dir); +} + +/* + * Write out the undo log meta data to the pg_undo directory. The actual + * contents of undo logs is in shared buffers and therefore handled by + * CheckPointBuffers(), but here we record the table of undo logs and their + * properties. + */ +void +CheckPointUndoLogs(XLogRecPtr checkPointRedo, XLogRecPtr priorCheckPointRedo) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + UndoLogMetaData *serialized = NULL; + size_t serialized_size = 0; + char *data; + char path[MAXPGPATH]; + int num_logs; + int fd; + int i; + pg_crc32c crc; + + /* + * We acquire UndoLogLock to prevent any undo logs from being created or + * discarded while we build a snapshot of them. 
This isn't expected to + * take long on a healthy system because the number of active logs should + * be around the number of backends. Holding this lock won't prevent + * concurrent access to the undo log, except when segments need to be + * added or removed. + */ + LWLockAcquire(UndoLogLock, LW_SHARED); + + /* + * Rather than doing the file IO while we hold locks, we'll copy the + * meta-data into a palloc'd buffer. + */ + serialized_size = sizeof(UndoLogMetaData) * UndoLogNumSlots(); + serialized = (UndoLogMetaData *) palloc0(serialized_size); + + /* Scan through all slots looking for non-empty ones. */ + num_logs = 0; + for (i = 0; i < UndoLogNumSlots(); ++i) + { + UndoLogControl *slot = &shared->logs[i]; + + /* Skip empty slots. */ + if (slot->logno == InvalidUndoLogNumber) + continue; + + /* Capture snapshot while holding each mutex. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + serialized[num_logs++] = slot->meta; + slot->need_attach_wal_record = true; /* XXX: ?!? */ + LWLockRelease(&slot->mutex); + } + + LWLockRelease(UndoLogLock); + + /* Dump into a file under pg_undo. */ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_WRITE); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + /* Compute header checksum. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, &shared->low_logno, sizeof(shared->low_logno)); + COMP_CRC32C(crc, &shared->next_logno, sizeof(shared->next_logno)); + COMP_CRC32C(crc, &num_logs, sizeof(num_logs)); + FIN_CRC32C(crc); + + /* Write out the number of active logs + crc. 
*/ + if ((write(fd, &shared->low_logno, sizeof(shared->low_logno)) != sizeof(shared->low_logno)) || + (write(fd, &shared->next_logno, sizeof(shared->next_logno)) != sizeof(shared->next_logno)) || + (write(fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) || + (write(fd, &crc, sizeof(crc)) != sizeof(crc))) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + /* Write out the meta data for all active undo logs. */ + data = (char *) serialized; + INIT_CRC32C(crc); + serialized_size = num_logs * sizeof(UndoLogMetaData); + while (serialized_size > 0) + { + ssize_t written; + + written = write(fd, data, serialized_size); + if (written < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + COMP_CRC32C(crc, data, written); + serialized_size -= written; + data += written; + } + FIN_CRC32C(crc); + + if (write(fd, &crc, sizeof(crc)) != sizeof(crc)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + + /* Flush file and directory entry. */ + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_SYNC); + pg_fsync(fd); + CloseTransientFile(fd); + fsync_fname("pg_undo", true); + pgstat_report_wait_end(); + + if (serialized) + pfree(serialized); + + CleanUpUndoCheckPointFiles(priorCheckPointRedo); + undolog_xid_map_gc(); +} + +void +StartupUndoLogs(XLogRecPtr checkPointRedo) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + char path[MAXPGPATH]; + int i; + int fd; + int nlogs; + pg_crc32c crc; + pg_crc32c new_crc; + + /* If initdb is calling, there is no file to read yet. */ + if (IsBootstrapProcessingMode()) + return; + + /* Open the pg_undo file corresponding to the given checkpoint. 
*/ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_READ); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + elog(ERROR, "cannot open undo checkpoint snapshot \"%s\": %m", path); + + /* Read the active log number range. */ + if ((read(fd, &shared->low_logno, sizeof(shared->low_logno)) + != sizeof(shared->low_logno)) || + (read(fd, &shared->next_logno, sizeof(shared->next_logno)) + != sizeof(shared->next_logno)) || + (read(fd, &nlogs, sizeof(nlogs)) != sizeof(nlogs)) || + (read(fd, &crc, sizeof(crc)) != sizeof(crc))) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + + /* Verify the header checksum. */ + INIT_CRC32C(new_crc); + COMP_CRC32C(new_crc, &shared->low_logno, sizeof(shared->low_logno)); + COMP_CRC32C(new_crc, &shared->next_logno, sizeof(shared->next_logno)); + COMP_CRC32C(new_crc, &nlogs, sizeof(shared->next_logno)); + FIN_CRC32C(new_crc); + + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + /* + * We'll acquire UndoLogLock just because allocate_undo_log() asserts we + * hold it (we don't actually expect concurrent access yet). + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* Initialize all the logs and set up the freelist. */ + INIT_CRC32C(new_crc); + for (i = 0; i < nlogs; ++i) + { + ssize_t size; + UndoLogControl *log; + + /* + * Get a new slot to hold this UndoLogControl object. If this + * checkpoint was created on a system with a higher max_connections + * setting, it's theoretically possible that we don't have enough + * space and cannot start up. + */ + log = allocate_undo_log(); + if (!log) + ereport(ERROR, + (errmsg("not enough undo log slots to recover from checkpoint: need at least %d, have %zu", + nlogs, UndoLogNumSlots()), + errhint("Consider increasing max_connections"))); + + /* Read in the meta data for this undo log. 
*/ + if ((size = read(fd, &log->meta, sizeof(log->meta))) != sizeof(log->meta)) + elog(ERROR, "short read of pg_undo meta data in file \"%s\": %m (got %zu, wanted %zu)", + path, size, sizeof(log->meta)); + COMP_CRC32C(new_crc, &log->meta, sizeof(log->meta)); + + /* + * At normal start-up, or during recovery, all active undo logs start + * out on the appropriate free list. + */ + log->logno = log->meta.logno; + log->pid = InvalidPid; + log->xid = InvalidTransactionId; + if (log->meta.status == UNDO_LOG_STATUS_ACTIVE) + { + log->next_free = shared->free_lists[log->meta.persistence]; + shared->free_lists[log->meta.persistence] = log->logno; + } + } + FIN_CRC32C(new_crc); + + LWLockRelease(UndoLogLock); + + /* Verify body checksum. */ + if (read(fd, &crc, sizeof(crc)) != sizeof(crc)) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + CloseTransientFile(fd); + pgstat_report_wait_end(); +} + +/* + * Return a pointer to a newly allocated UndoLogControl object in shared + * memory, or return NULL if there are no free slots. The caller should + * acquire the mutex and set up the object. + */ +static UndoLogControl * +allocate_undo_log(void) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + UndoLogControl *log; + int i; + + Assert(LWLockHeldByMeInMode(UndoLogLock, LW_EXCLUSIVE)); + + for (i = 0; i < UndoLogNumSlots(); ++i) + { + log = &shared->logs[i]; + if (log->logno == InvalidUndoLogNumber) + { + memset(&log->meta, 0, sizeof(log->meta)); + log->next_free = InvalidUndoLogNumber; + /* TODO: oldest_xid etc? */ + return log; + } + } + + return NULL; +} + +/* + * Free an UndoLogControl object in shared memory, so that it can be reused. 
+ */ +static void +free_undo_log(UndoLogControl *log) +{ + /* + * When removing an undo log from a slot in shared memory, we acquire + * UndoLogLock and log->mutex, so that other code can hold either lock to + * prevent the object from disappearing. + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + Assert(log->logno != InvalidUndoLogNumber); + log->logno = InvalidUndoLogNumber; + memset(&log->meta, 0, sizeof(log->meta)); + LWLockRelease(&log->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * Get the UndoLogControl object for a given log number. + * + * The caller may or may not already hold UndoLogLock, and should indicate + * this by passing 'locked'. We'll acquire it in the slow path if necessary. + * Either way, the caller must deal with the possibility that the returned + * UndoLogControl object pointed to no longer contains the requested logno by + * the time it is accessed. + * + * To do that, one of the following approaches must be taken by the calling + * code: + * + * 1. If it is known that the calling backend is attached to the log, then it + * can be assumed that the UndoLogControl slot still holds the same undo log + * number. The UndoLogControl slot can only change with the cooperation of + * the undo log that is attached to it (it must first be marked as + * UNDO_LOG_STATUS_FULL, which happens when a backend detaches). Calling + * code should probably assert that it is attached and the logno is as + * expected, however. + * + * 2. Acquire log->mutex before accessing any members, and after doing so, + * check that the logno is as expected. If it is not, the entire undo log + * must be assumed to be discarded and the caller must behave accordingly. + * + * Return NULL if the undo log has been entirely discarded. It is an error to + * ask for undo logs that have never been created. 
+ */
+static UndoLogControl *
+get_undo_log(UndoLogNumber logno, bool locked)
+{
+	UndoLogControl *result = NULL;
+	UndoLogTableEntry *entry;
+	bool		found;
+
+	Assert(locked == LWLockHeldByMe(UndoLogLock));
+
+	/* First see if we already have it in our cache. */
+	entry = undologtable_lookup(undologtable_cache, logno);
+	if (likely(entry))
+		result = entry->control;
+	else
+	{
+		UndoLogSharedData *shared = MyUndoLogState.shared;
+		int			i;
+
+		/* Nope.  Linear search for the slot in shared memory. */
+		if (!locked)
+			LWLockAcquire(UndoLogLock, LW_SHARED);
+		for (i = 0; i < UndoLogNumSlots(); ++i)
+		{
+			if (shared->logs[i].logno == logno)
+			{
+				/* Found it.  Cache the slot for fast lookup next time. */
+
+				/*
+				 * TODO: Should this function be usable in a critical section?
+				 * Would it make sense to detect that we are in a critical
+				 * section and just return the pointer to the log without
+				 * updating the cache, to avoid any chance of allocating
+				 * memory?
+				 */
+
+				entry = undologtable_insert(undologtable_cache, logno, &found);
+				entry->number = logno;
+				entry->control = &shared->logs[i];
+				entry->tablespace = entry->control->meta.tablespace;
+				result = entry->control;
+				break;
+			}
+		}
+
+		/*
+		 * If we didn't find it, then it must already have been entirely
+		 * discarded.  We create a negative cache entry so that we can answer
+		 * this question quickly next time.
+		 *
+		 * TODO: We could track the lowest known undo log number, to reduce
+		 * the negative cache entry bloat.
+		 */
+		if (result == NULL)
+		{
+			/*
+			 * Sanity check: the caller should not be asking about undo logs
+			 * that have never existed.
+			 */
+			if (logno >= shared->next_logno)
+				elog(PANIC, "undo log %u hasn't been created yet", logno);
+			entry = undologtable_insert(undologtable_cache, logno, &found);
+			entry->number = logno;
+			entry->control = NULL;
+			entry->tablespace = 0;
+		}
+		if (!locked)
+			LWLockRelease(UndoLogLock);
+	}
+
+	return result;
+}
+
+/*
+ * Get a pointer to an UndoLogControl object corresponding to a given logno.
+ * + * In general, the caller must acquire the UndoLogControl's mutex to access + * the contents, and at that time must consider that the logno might have + * changed because the undo log it contained has been entirely discarded. + * + * If the calling backend is currently attached to the undo log, that is not + * possible, because logs can only reach UNDO_LOG_STATUS_DISCARDED after first + * reaching UNDO_LOG_STATUS_FULL, and that only happens while detaching. + */ +UndoLogControl * +UndoLogGet(UndoLogNumber logno, bool missing_ok) +{ + UndoLogControl *log = get_undo_log(logno, false); + + if (log == NULL && !missing_ok) + elog(ERROR, "unknown undo log number %d", logno); + + return log; +} + +/* + * Attach to an undo log, possibly creating or recycling one as required. + */ +static void +attach_undo_log(UndoPersistence persistence, Oid tablespace) +{ + UndoLogSharedData *shared = MyUndoLogState.shared; + UndoLogControl *log = NULL; + UndoLogNumber logno; + UndoLogNumber *place; + + Assert(!InRecovery); + Assert(MyUndoLogState.logs[persistence] == NULL); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* + * For now we have a simple linked list of unattached undo logs for each + * persistence level. We'll grovel though it to find something for the + * tablespace you asked for. If you're not using multiple tablespaces + * it'll be able to pop one off the front. We might need a hash table + * keyed by tablespace if this simple scheme turns out to be too slow when + * using many tablespaces and many undo logs, but that seems like an + * unusual use case not worth optimizing for. + */ + place = &shared->free_lists[persistence]; + while (*place != InvalidUndoLogNumber) + { + UndoLogControl *candidate = get_undo_log(*place, true); + + /* + * There should never be an undo log on the freelist that has been + * entirely discarded, or hasn't been created yet. The persistence + * level should match the freelist. 
+ */ + if (unlikely(candidate == NULL)) + elog(ERROR, + "corrupted undo log freelist, no such undo log %u", *place); + if (unlikely(candidate->meta.persistence != persistence)) + elog(ERROR, + "corrupted undo log freelist, undo log %u with persistence %d found on freelist %d", + *place, candidate->meta.persistence, persistence); + + if (candidate->meta.tablespace == tablespace) + { + logno = *place; + log = candidate; + *place = candidate->next_free; + break; + } + place = &candidate->next_free; + } + + /* + * All existing undo logs for this tablespace and persistence level are + * busy, so we'll have to create a new one. + */ + if (log == NULL) + { + if (shared->next_logno > MaxUndoLogNumber) + { + /* + * You've used up all 16 exabytes of undo log addressing space. + * This is a difficult state to reach using only 16 exabytes of + * WAL. + */ + elog(ERROR, "undo log address space exhausted"); + } + + /* Allocate a slot from the UndoLogControl pool. */ + log = allocate_undo_log(); + if (unlikely(!log)) + ereport(ERROR, + (errmsg("could not create new undo log"), + errdetail("The maximum number of active undo logs is %zu.", + UndoLogNumSlots()), + errhint("Consider increasing max_connections."))); + log->logno = logno = shared->next_logno; + + /* + * The insert and discard pointers start after the first block's + * header. XXX That means that insert is > end for a short time in a + * newly created undo log. Is there any problem with that? + */ + log->meta.insert = UndoLogBlockHeaderSize; + log->meta.discard = UndoLogBlockHeaderSize; + + log->meta.logno = logno; + log->meta.tablespace = tablespace; + log->meta.persistence = persistence; + log->meta.status = UNDO_LOG_STATUS_ACTIVE; + + /* Move the high log number pointer past this one. */ + ++shared->next_logno; + + /* WAL-log the creation of this new undo log. 
*/ + { + xl_undolog_create xlrec; + + xlrec.logno = logno; + xlrec.tablespace = log->meta.tablespace; + xlrec.persistence = log->meta.persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_CREATE); + } + + /* + * This undo log has no segments. UndoLogAllocate will create the + * first one on demand. + */ + } + LWLockRelease(UndoLogLock); + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->pid = MyProcPid; + log->xid = InvalidTransactionId; + log->need_attach_wal_record = true; + LWLockRelease(&log->mutex); + + MyUndoLogState.logs[persistence] = log; +} + +/* + * Free chunks of the xid/undo log map that relate to transactions that are no + * longer running. This is run at each checkpoint. + */ +static void +undolog_xid_map_gc(void) +{ + UndoLogNumber **xid_map = MyUndoLogState.xid_map; + TransactionId oldest_xid; + uint16 new_oldest_chunk; + uint16 oldest_chunk; + + if (xid_map == NULL) + return; + + /* + * During crash recovery, it may not be possible to call GetOldestXmin() + * yet because latestCompletedXid is invalid. + */ + if (!TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)) + return; + + oldest_xid = GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT); + new_oldest_chunk = UndoLogGetXidHigh(oldest_xid); + oldest_chunk = MyUndoLogState.xid_map_oldest_chunk; + + while (oldest_chunk != new_oldest_chunk) + { + if (xid_map[oldest_chunk]) + { + pfree(xid_map[oldest_chunk]); + xid_map[oldest_chunk] = NULL; + } + oldest_chunk = (oldest_chunk + 1) % (1 << UndoLogXidHighBits); + } + MyUndoLogState.xid_map_oldest_chunk = new_oldest_chunk; +} + +/* + * Associate a xid with an undo log, during recovery. In a primary server, + * this isn't necessary because backends know which undo log they're attached + * to. During recovery, the natural association between backends and xids is + * lost, so we need to manage that explicitly. 
+ */ +static void +undolog_xid_map_add(TransactionId xid, UndoLogNumber logno) +{ + uint16 high_bits; + uint16 low_bits; + + high_bits = UndoLogGetXidHigh(xid); + low_bits = UndoLogGetXidLow(xid); + + if (unlikely(MyUndoLogState.xid_map == NULL)) + { + /* First time through. Create mapping array. */ + MyUndoLogState.xid_map = + MemoryContextAllocZero(TopMemoryContext, + sizeof(UndoLogNumber *) * + (1 << (32 - UndoLogXidLowBits))); + MyUndoLogState.xid_map_oldest_chunk = high_bits; + } + + if (unlikely(MyUndoLogState.xid_map[high_bits] == NULL)) + { + /* This bank of mappings doesn't exist yet. Create it. */ + MyUndoLogState.xid_map[high_bits] = + MemoryContextAllocZero(TopMemoryContext, + sizeof(UndoLogNumber) * + (1 << UndoLogXidLowBits)); + } + + /* Associate this xid with this undo log number. */ + MyUndoLogState.xid_map[high_bits][low_bits] = logno; +} + +/* check_hook: validate new undo_tablespaces */ +bool +check_undo_tablespaces(char **newval, void **extra, GucSource source) +{ + char *rawname; + List *namelist; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* + * Parse string into list of identifiers, just to check for + * well-formedness (unfortunately we can't validate the names in the + * catalog yet). + */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + { + /* syntax error in name list */ + GUC_check_errdetail("List syntax is invalid."); + pfree(rawname); + list_free(namelist); + return false; + } + + /* + * Make sure we aren't already in a transaction that has been assigned an + * XID. This ensures we don't detach from an undo log that we might have + * started writing undo data into for this transaction. 
+ */ + if (GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("undo_tablespaces cannot be changed while a transaction is in progress")))); + list_free(namelist); + + return true; +} + +/* assign_hook: do extra actions as needed */ +void +assign_undo_tablespaces(const char *newval, void *extra) +{ + /* + * This is normally called only when GetTopTransactionIdIfAny() == + * InvalidTransactionId (because you can't change undo_tablespaces in the + * middle of a transaction that's been assigned an xid), but we can't + * assert that because it's also called at the end of a transaction that's + * rolling back, to reset the GUC if it was set inside the transaction. + */ + + /* Tell UndoLogAllocate() to reexamine undo_tablespaces. */ + MyUndoLogState.need_to_choose_tablespace = true; +} + +static bool +choose_undo_tablespace(bool force_detach, Oid *tablespace) +{ + char *rawname; + List *namelist; + bool need_to_unlock; + int length; + int i; + + /* We need a modifiable copy of string. */ + rawname = pstrdup(undo_tablespaces); + + /* Break string into list of identifiers. */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + elog(ERROR, "undo_tablespaces is unexpectedly malformed"); + + length = list_length(namelist); + if (length == 0 || + (length == 1 && ((char *) linitial(namelist))[0] == '\0')) + { + /* + * If it's an empty string, then we'll use the default tablespace. No + * locking is required because it can't be dropped. + */ + *tablespace = DEFAULTTABLESPACE_OID; + need_to_unlock = false; + } + else + { + /* + * Choose an OID using our pid, so that if several backends have the + * same multi-tablespace setting they'll spread out. We could easily + * do better than this if more serious load balancing is judged + * useful. 
+ */ + int index = MyProcPid % length; + int first_index = index; + Oid oid = InvalidOid; + + /* + * Take the tablespace create/drop lock while we look the name up. + * This prevents the tablespace from being dropped while we're trying + * to resolve the name, or while the caller is trying to create an + * undo log in it. The caller will have to release this lock. + */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + for (;;) + { + const char *name = list_nth(namelist, index); + + oid = get_tablespace_oid(name, true); + if (oid == InvalidOid) + { + /* Unknown tablespace, try the next one. */ + index = (index + 1) % length; + /* + * But if we've tried them all, it's time to complain. We'll + * arbitrarily complain about the last one we tried in the + * error message. + */ + if (index == first_index) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablespace \"%s\" does not exist", name), + errhint("Create the tablespace or set undo_tablespaces to a valid or empty list."))); + continue; + } + if (oid == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("undo logs cannot be placed in pg_global tablespace"))); + /* If we got here we succeeded in finding one. */ + break; + } + + Assert(oid != InvalidOid); + *tablespace = oid; + need_to_unlock = true; + } + + /* + * If we came here because the user changed undo_tablespaces, then detach + * from any undo logs we happen to be attached to. 
+ */ + if (force_detach) + { + for (i = 0; i < UndoPersistenceLevels; ++i) + { + UndoLogControl *log = MyUndoLogState.logs[i]; + UndoLogSharedData *shared = MyUndoLogState.shared; + + if (log != NULL) + { + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->pid = InvalidPid; + log->xid = InvalidTransactionId; + LWLockRelease(&log->mutex); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + log->next_free = shared->free_lists[i]; + shared->free_lists[i] = log->logno; + LWLockRelease(UndoLogLock); + + MyUndoLogState.logs[i] = NULL; + } + } + } + + return need_to_unlock; +} + +bool +DropUndoLogsInTablespace(Oid tablespace) +{ + DIR *dir; + char undo_path[MAXPGPATH]; + UndoLogSharedData *shared = MyUndoLogState.shared; + UndoLogControl *log; + int i; + + Assert(LWLockHeldByMe(TablespaceCreateLock)); + Assert(tablespace != DEFAULTTABLESPACE_OID); + + /* First, try to kick everyone off any undo logs in this tablespace. */ + for (log = UndoLogNext(NULL); log != NULL; log = UndoLogNext(log)) + { + bool ok; + bool return_to_freelist = false; + + /* Skip undo logs in other tablespaces. */ + if (log->meta.tablespace != tablespace) + continue; + + /* Check if this undo log can be forcibly detached. */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + if (log->meta.discard == log->meta.insert && + (log->xid == InvalidTransactionId || + !TransactionIdIsInProgress(log->xid))) + { + log->xid = InvalidTransactionId; + if (log->pid != InvalidPid) + { + log->pid = InvalidPid; + return_to_freelist = true; + } + ok = true; + } + else + { + /* + * There is data we need in this undo log. We can't force it to + * be detached. + */ + ok = false; + } + LWLockRelease(&log->mutex); + + /* If we failed, then give up now and report failure. */ + if (!ok) + return false; + + /* + * Put this undo log back on the appropriate free-list. No one can + * attach to it while we hold TablespaceCreateLock, but if we return + * earlier in a future go around this loop, we need the undo log to + * remain usable. 
We'll remove all appropriate logs from the + * free-lists in a separate step below. + */ + if (return_to_freelist) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + log->next_free = shared->free_lists[log->meta.persistence]; + shared->free_lists[log->meta.persistence] = log->logno; + LWLockRelease(UndoLogLock); + } + } + + /* + * We detached all backends from undo logs in this tablespace, and no one + * can attach to any non-default-tablespace undo logs while we hold + * TablespaceCreateLock. We can now drop the undo logs. + */ + for (log = UndoLogNext(NULL); log != NULL; log = UndoLogNext(log)) + { + /* Skip undo logs in other tablespaces. */ + if (log->meta.tablespace != tablespace) + continue; + + /* + * Make sure no buffers remain. When that is done by UndoDiscard(), + * the final page is left in shared_buffers because it may contain + * data, or at least be needed again very soon. Here we need to drop + * even that page from the buffer pool. + */ + forget_undo_buffers(log->logno, log->meta.discard, log->meta.discard, true); + + /* + * TODO: For now we drop the undo log, meaning that it will never be + * used again. That wastes the rest of its address space. Instead, + * we should put it onto a special list of 'offline' undo logs, ready + * to be reactivated in some other tablespace. Then we can keep the + * unused portion of its address space. + */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->meta.status = UNDO_LOG_STATUS_DISCARDED; + LWLockRelease(&log->mutex); + } + + /* Unlink all undo segment files in this tablespace. 
*/ + UndoLogDirectory(tablespace, undo_path); + + dir = AllocateDir(undo_path); + if (dir != NULL) + { + struct dirent *de; + + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + } + + /* Remove all dropped undo logs from the free-lists. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + for (i = 0; i < UndoPersistenceLevels; ++i) + { + UndoLogControl *log; + UndoLogNumber *place; + + place = &shared->free_lists[i]; + while (*place != InvalidUndoLogNumber) + { + log = get_undo_log(*place, true); + if (!log) + elog(ERROR, + "corrupted undo log freelist, unknown log %u", *place); + if (log->meta.status == UNDO_LOG_STATUS_DISCARDED) + *place = log->next_free; + else + place = &log->next_free; + } + } + LWLockRelease(UndoLogLock); + + return true; +} + +void +ResetUndoLogs(UndoPersistence persistence) +{ + UndoLogControl *log; + + for (log = UndoLogNext(NULL); log != NULL; log = UndoLogNext(log)) + { + DIR *dir; + struct dirent *de; + char undo_path[MAXPGPATH]; + char segment_prefix[MAXPGPATH]; + size_t segment_prefix_size; + + if (log->meta.persistence != persistence) + continue; + + /* Scan the directory for files belonging to this undo log. 
*/ + snprintf(segment_prefix, sizeof(segment_prefix), "%06X.", log->logno); + segment_prefix_size = strlen(segment_prefix); + UndoLogDirectory(log->meta.tablespace, undo_path); + dir = AllocateDir(undo_path); + if (dir == NULL) + continue; + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strncmp(de->d_name, segment_prefix, segment_prefix_size) != 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + elog(LOG, "unlinked undo segment \"%s\"", segment_path); /* XXX: remove me */ + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + + /* + * We have no segment files. Set the pointers to indicate that there + * is no data. The discard and insert pointers point to the first + * usable byte in the segment we will create when we next try to + * allocate. This is a bit strange, because it means that they are + * past the end pointer. That's the same as when new undo logs are + * created. + * + * TODO: Should we rewind to zero instead, so we can reuse that (now) + * unreferenced address space? 
+ */ + log->meta.insert = log->meta.discard = log->meta.end + + UndoLogBlockHeaderSize; + } +} + +Datum +pg_stat_get_undo_logs(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_UNDO_LOGS_COLS 10 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + UndoLogSharedData *shared = MyUndoLogState.shared; + char *tablespace_name = NULL; + Oid last_tablespace = InvalidOid; + int i; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* Scan all undo logs to build the results. */ + for (i = 0; i < shared->array_size; ++i) + { + UndoLogControl *log = &shared->logs[i]; + char buffer[17]; + Datum values[PG_STAT_GET_UNDO_LOGS_COLS]; + bool nulls[PG_STAT_GET_UNDO_LOGS_COLS] = { false }; + Oid tablespace; + + if (log == NULL) + continue; + + /* + * This won't be a consistent result overall, but the values for each + * log will be consistent because we'll take the per-log lock while + * copying them. 
+ */ + LWLockAcquire(&log->mutex, LW_SHARED); + + /* Skip unused slots and entirely discarded undo logs. */ + if (log->logno == InvalidUndoLogNumber || + log->meta.status == UNDO_LOG_STATUS_DISCARDED) + { + LWLockRelease(&log->mutex); + continue; + } + + values[0] = ObjectIdGetDatum((Oid) log->logno); + values[1] = CStringGetTextDatum( + log->meta.persistence == UNDO_PERMANENT ? "permanent" : + log->meta.persistence == UNDO_UNLOGGED ? "unlogged" : + log->meta.persistence == UNDO_TEMP ? "temporary" : ""); + tablespace = log->meta.tablespace; + + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(log->logno, log->meta.discard)); + values[3] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(log->logno, log->meta.insert)); + values[4] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(log->logno, log->meta.end)); + values[5] = CStringGetTextDatum(buffer); + if (log->xid == InvalidTransactionId) + nulls[6] = true; + else + values[6] = TransactionIdGetDatum(log->xid); + if (log->pid == InvalidPid) + nulls[7] = true; + else + values[7] = Int32GetDatum((int64) log->pid); + if (log->meta.prevlogno == InvalidUndoLogNumber) + nulls[8] = true; + else + values[8] = ObjectIdGetDatum((Oid) log->meta.prevlogno); + switch (log->meta.status) + { + case UNDO_LOG_STATUS_ACTIVE: + values[9] = CStringGetTextDatum("ACTIVE"); break; + case UNDO_LOG_STATUS_FULL: + values[9] = CStringGetTextDatum("FULL"); break; + default: + nulls[9] = true; + } + LWLockRelease(&log->mutex); + + /* + * Deal with potentially slow tablespace name lookup without the lock. + * Avoid making multiple calls to that expensive function for the + * common case of repeating tablespace. 
+ */ + if (tablespace != last_tablespace) + { + if (tablespace_name) + pfree(tablespace_name); + tablespace_name = get_tablespace_name(tablespace); + last_tablespace = tablespace; + } + if (tablespace_name) + { + values[2] = CStringGetTextDatum(tablespace_name); + nulls[2] = false; + } + else + nulls[2] = true; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + if (tablespace_name) + pfree(tablespace_name); + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +/* + * replay the creation of a new undo log + */ +static void +undolog_xlog_create(XLogReaderState *record) +{ + xl_undolog_create *xlrec = (xl_undolog_create *) XLogRecGetData(record); + UndoLogControl *log; + UndoLogSharedData *shared = MyUndoLogState.shared; + + /* Create meta-data space in shared memory. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + /* TODO: assert that it doesn't exist already? */ + log = allocate_undo_log(); + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->logno = xlrec->logno; + log->meta.logno = xlrec->logno; + log->meta.status = UNDO_LOG_STATUS_ACTIVE; + log->meta.persistence = xlrec->persistence; + log->meta.tablespace = xlrec->tablespace; + log->meta.insert = UndoLogBlockHeaderSize; + log->meta.discard = UndoLogBlockHeaderSize; + shared->next_logno = Max(xlrec->logno + 1, shared->next_logno); + LWLockRelease(&log->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * replay the addition of a new segment to an undo log + */ +static void +undolog_xlog_extend(XLogReaderState *record) +{ + xl_undolog_extend *xlrec = (xl_undolog_extend *) XLogRecGetData(record); + + /* Extend exactly as we would during DO phase. 
*/ + extend_undo_log(xlrec->logno, xlrec->end); +} + +/* + * replay the association of an xid with a specific undo log + */ +static void +undolog_xlog_attach(XLogReaderState *record) +{ + xl_undolog_attach *xlrec = (xl_undolog_attach *) XLogRecGetData(record); + UndoLogControl *log; + + undolog_xid_map_add(xlrec->xid, xlrec->logno); + + /* Restore current dbid */ + MyUndoLogState.dbid = xlrec->dbid; + + /* + * Whatever follows is the first record for this transaction. Zheap will + * use this to add UREC_INFO_TRANSACTION. + */ + log = get_undo_log(xlrec->logno, false); + /* TODO */ + log->meta.is_first_rec = true; + log->xid = xlrec->xid; +} + +/* + * Drop all buffers for the given undo log, from the old_discard to up + * new_discard. If drop_tail is true, also drop the buffer that holds + * new_discard; this is used when discarding undo logs completely, for example + * via DROP TABLESPACE. If it is false, then the final buffer is not dropped + * because it may contain data. + * + */ +static void +forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, bool drop_tail) +{ + BlockNumber old_blockno; + BlockNumber new_blockno; + RelFileNode rnode; + + UndoRecPtrAssignRelFileNode(rnode, MakeUndoRecPtr(logno, old_discard)); + old_blockno = old_discard / BLCKSZ; + new_blockno = new_discard / BLCKSZ; + if (drop_tail) + ++new_blockno; + while (old_blockno < new_blockno) + ForgetBuffer(rnode, UndoLogForkNum, old_blockno++); +} + +/* + * replay an undo segment discard record + */ +static void +undolog_xlog_discard(XLogReaderState *record) +{ + xl_undolog_discard *xlrec = (xl_undolog_discard *) XLogRecGetData(record); + UndoLogControl *log; + UndoLogOffset discard; + UndoLogOffset end; + UndoLogOffset old_segment_begin; + UndoLogOffset new_segment_begin; + RelFileNode rnode = {0}; + char dir[MAXPGPATH]; + + log = get_undo_log(xlrec->logno, false); + if (log == NULL) + elog(ERROR, "unknown undo log %d", xlrec->logno); + + /* + * We're about to 
discard undologs. In Hot Standby mode, ensure that + * there's no queries running which need to get tuple from discarded undo. + * + * XXX we are passing empty rnode to the conflict function so that it can + * check conflict in all the backend regardless of which database the + * backend is connected. + */ + if (InHotStandby && TransactionIdIsValid(xlrec->latestxid)) + ResolveRecoveryConflictWithSnapshot(xlrec->latestxid, rnode); + + /* + * See if we need to unlink or rename any files, but don't consider it an + * error if we find that files are missing. Since UndoLogDiscard() + * performs filesystem operations before WAL logging or updating shmem + * which could be checkpointed, a crash could have left files already + * deleted, but we could replay WAL that expects the files to be there. + */ + + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + Assert(log->logno == xlrec->logno); + discard = log->meta.discard; + end = log->meta.end; + LWLockRelease(&log->mutex); + + /* Drop buffers before we remove/recycle any files. */ + forget_undo_buffers(xlrec->logno, discard, xlrec->discard, + xlrec->entirely_discarded); + + /* Rewind to the start of the segment. */ + old_segment_begin = discard - discard % UndoLogSegmentSize; + new_segment_begin = xlrec->discard - xlrec->discard % UndoLogSegmentSize; + + /* Unlink or rename segments that are no longer in range. */ + while (old_segment_begin < new_segment_begin) + { + char discard_path[MAXPGPATH]; + + /* + * Before removing the file, make sure that undofile_sync knows that + * it might be missing. + */ + undofile_forgetsync(log->logno, + log->meta.tablespace, + old_segment_begin / UndoLogSegmentSize); + + UndoLogSegmentPath(xlrec->logno, old_segment_begin / UndoLogSegmentSize, + log->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? 
*/ + if (end < xlrec->end) + { + char recycle_path[MAXPGPATH]; + + UndoLogSegmentPath(xlrec->logno, end / UndoLogSegmentSize, + log->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(LOG, "recycled undo segment \"%s\" -> \"%s\"", discard_path, recycle_path); /* XXX: remove me */ + end += UndoLogSegmentSize; + } + else + { + elog(LOG, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(LOG, "unlinked undo segment \"%s\"", discard_path); /* XXX: remove me */ + else + elog(LOG, "could not unlink \"%s\": %m", discard_path); + } + old_segment_begin += UndoLogSegmentSize; + } + + /* Create any further new segments that are needed the slow way. */ + while (end < xlrec->end) + { + allocate_empty_undo_segment(xlrec->logno, log->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + /* Flush the directory entries. */ + UndoLogDirectory(log->meta.tablespace, dir); + fsync_fname(dir, true); + + /* Update shmem. */ + LWLockAcquire(&log->mutex, LW_EXCLUSIVE); + log->meta.discard = xlrec->discard; + log->meta.end = end; + LWLockRelease(&log->mutex); + + /* If we discarded everything, the slot can be given up. 
+ */ + if (xlrec->entirely_discarded) + free_undo_log(log); +} + +/* + * replay the rewind of an undo log + */ +static void +undolog_xlog_rewind(XLogReaderState *record) +{ + xl_undolog_rewind *xlrec = (xl_undolog_rewind *) XLogRecGetData(record); + UndoLogControl *log; + + log = get_undo_log(xlrec->logno, false); + log->meta.insert = xlrec->insert; + log->meta.prevlen = xlrec->prevlen; +} + +void +undolog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_UNDOLOG_CREATE: + undolog_xlog_create(record); + break; + case XLOG_UNDOLOG_EXTEND: + undolog_xlog_extend(record); + break; + case XLOG_UNDOLOG_ATTACH: + undolog_xlog_attach(record); + break; + case XLOG_UNDOLOG_DISCARD: + undolog_xlog_discard(record); + break; + case XLOG_UNDOLOG_REWIND: + undolog_xlog_rewind(record); + break; + default: + elog(PANIC, "undo_redo: unknown op code %u", info); + } +} + +/* + * For assertions only. + */ +bool +AmAttachedToUndoLog(UndoLogControl *log) +{ + /* + * In general, we can't access log's members without locking. But this + * function is intended only for asserting that you are attached, and + * while you're attached the slot can't be recycled, so don't bother + * locking. + */ + return MyUndoLogState.logs[log->meta.persistence] == log; +} + +/* + * For testing use only. This function is only used by the test_undo module. 
+ */ +void +UndoLogDetachFull(void) +{ + int i; + + for (i = 0; i < UndoPersistenceLevels; ++i) + if (MyUndoLogState.logs[i]) + detach_current_undo_log(i, true); +} + +/* + * Fetch database id from the undo log state + */ +Oid +UndoLogStateGetDatabaseId() +{ + Assert(InRecovery); + return MyUndoLogState.dbid; +} diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 53ddc59..17cbc8e 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -939,6 +939,10 @@ GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublicatio ON pg_subscription TO public; +CREATE VIEW pg_stat_undo_logs AS + SELECT * + FROM pg_stat_get_undo_logs(); + -- -- We have a few function definitions in here, too. -- At some point there might be enough to justify breaking them out into diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index f7e9160..b9daba4 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -54,6 +54,7 @@ #include "access/reloptions.h" #include "access/htup_details.h" #include "access/sysattr.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -483,6 +484,20 @@ DropTableSpace(DropTableSpaceStmt *stmt) LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); /* + * Drop the undo logs in this tablespace. This will fail (without + * dropping anything) if there are undo logs that we can't afford to drop + * because they contain non-discarded data or a transaction is in + * progress. Since we hold TablespaceCreateLock, no other session will be + * able to attach to an undo log in this tablespace (or any tablespace + * except default) concurrently. 
+ */ + if (!DropUndoLogsInTablespace(tablespaceoid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace \"%s\" cannot be dropped because it contains non-empty undo logs", + tablespacename))); + + /* * Try to remove the physical infrastructure. */ if (!destroy_tablespace_directories(tablespaceoid, false)) @@ -1482,6 +1497,14 @@ tblspc_redo(XLogReaderState *record) { xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record); + /* This shouldn't be able to fail in recovery. */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + if (!DropUndoLogsInTablespace(xlrec->ts_id)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace cannot be dropped because it contains non-empty undo logs"))); + LWLockRelease(TablespaceCreateLock); + /* * If we issued a WAL record for a drop tablespace it implies that * there were no files in it at all when the DROP was done. That means diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index afb4972..f60ecc5 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor case RM_COMMIT_TS_ID: case RM_REPLORIGIN_ID: case RM_GENERIC_ID: + case RM_UNDOLOG_ID: /* just deal with xid, and done */ ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), buf.origptr); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0c86a58..4725cbe 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -21,6 +21,7 @@ #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" +#include "access/undolog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -127,6 +128,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ProcGlobalShmemSize()); 
size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); + size = add_size(size, UndoLogShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); @@ -219,6 +221,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) */ XLOGShmemInit(); CLOGShmemInit(); + UndoLogShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index a6fda81..b6c0b00 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -521,6 +521,8 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_TBM, "tbm"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join"); + LWLockRegisterTranche(LWTRANCHE_UNDOLOG, "undo_log"); + LWLockRegisterTranche(LWTRANCHE_UNDODISCARD, "undo_discard"); /* Register named tranches. 
*/ for (i = 0; i < NamedLWLockTrancheRequests; i++) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6025ec..554af46 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,4 @@ OldSnapshotTimeMapLock 42 BackendRandomLock 43 LogicalRepWorkerLock 44 CLogTruncationLock 45 +UndoLogLock 46 diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 4f1d2a0..a3fc997 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -556,6 +556,7 @@ BaseInit(void) InitFileAccess(); smgrinit(); InitBufferPoolAccess(); + UndoLogInit(); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e471d7f..287ca00 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -118,6 +118,7 @@ extern int CommitDelay; extern int CommitSiblings; extern char *default_tablespace; extern char *temp_tablespaces; +extern char *undo_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; @@ -3350,6 +3351,17 @@ static struct config_string ConfigureNamesString[] = }, { + {"undo_tablespaces", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Sets the tablespace(s) to use for undo logs."), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &undo_tablespaces, + "", + check_undo_tablespaces, assign_undo_tablespaces, NULL + }, + + { {"dynamic_library_path", PGC_SUSET, CLIENT_CONN_OTHER, gettext_noop("Sets the path for dynamically loadable modules."), gettext_noop("If a dynamically loadable module needs to be opened and " diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ab5cb7f..a64d936 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -209,11 +209,13 @@ static const char *const subdirs[] = { "pg_snapshots", "pg_subtrans", "pg_twophase", + "pg_undo", "pg_multixact", "pg_multixact/members", "pg_multixact/offsets", "base", 
"base/1", + "base/undo", "pg_replslot", "pg_tblspc", "pg_stat", diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca..938150d 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -20,6 +20,7 @@ #include "access/nbtxlog.h" #include "access/rmgr.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 0bbe9879..9c6fca4 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_UNDOLOG_ID, "UndoLog", undolog_redo, undolog_desc, undolog_identify, NULL, NULL, NULL) diff --git a/src/include/access/undolog.h b/src/include/access/undolog.h new file mode 100644 index 0000000..10bd502 --- /dev/null +++ b/src/include/access/undolog.h @@ -0,0 +1,405 @@ +/*------------------------------------------------------------------------- + * + * undolog.h + * + * PostgreSQL undo log manager. This module is responsible for lifecycle + * management of undo logs and backing files, associating undo logs with + * backends, allocating and managing space within undo logs. 
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_H +#define UNDOLOG_H + +#include "access/xlogreader.h" +#include "catalog/pg_class.h" +#include "common/relpath.h" +#include "storage/bufpage.h" + +#ifndef FRONTEND +#include "storage/lwlock.h" +#endif + +/* The type used to identify an undo log and position within it. */ +typedef uint64 UndoRecPtr; + +/* The type used for undo record lengths. */ +typedef uint16 UndoRecordSize; + +/* Undo log statuses. */ +typedef enum +{ + UNDO_LOG_STATUS_UNUSED = 0, + UNDO_LOG_STATUS_ACTIVE, + UNDO_LOG_STATUS_FULL, + UNDO_LOG_STATUS_DISCARDED +} UndoLogStatus; + +/* + * Undo log persistence levels. These have a one-to-one correspondence with + * relpersistence values, but are small integers so that we can use them as an + * index into the "logs" and "lognos" arrays. + */ +typedef enum +{ + UNDO_PERMANENT = 0, + UNDO_UNLOGGED = 1, + UNDO_TEMP = 2 +} UndoPersistence; + +#define UndoPersistenceLevels 3 + +/* + * Convert from relpersistence ('p', 'u', 't') to an UndoPersistence + * enumerator. + */ +#define UndoPersistenceForRelPersistence(rp) \ + ((rp) == RELPERSISTENCE_PERMANENT ? UNDO_PERMANENT : \ + (rp) == RELPERSISTENCE_UNLOGGED ? UNDO_UNLOGGED : UNDO_TEMP) + +/* + * Convert from UndoPersistence to a relpersistence value. + */ +#define RelPersistenceForUndoPersistence(up) \ + ((up) == UNDO_PERMANENT ? RELPERSISTENCE_PERMANENT : \ + (up) == UNDO_UNLOGGED ? RELPERSISTENCE_UNLOGGED : \ + RELPERSISTENCE_TEMP) + +/* + * Get the appropriate UndoPersistence value from a Relation. 
+ */ +#define UndoPersistenceForRelation(rel) \ + (UndoPersistenceForRelPersistence((rel)->rd_rel->relpersistence)) + +/* Type for offsets within undo logs */ +typedef uint64 UndoLogOffset; + +/* printf-family format string for UndoRecPtr. */ +#define UndoRecPtrFormat "%016" INT64_MODIFIER "X" + +/* printf-family format string for UndoLogOffset. */ +#define UndoLogOffsetFormat UINT64_FORMAT + +/* Number of blocks of BLCKSZ in an undo log segment file. 128 = 1MB. */ +#define UNDOSEG_SIZE 128 + +/* Size of an undo log segment file in bytes. */ +#define UndoLogSegmentSize ((size_t) BLCKSZ * UNDOSEG_SIZE) + +/* The width of an undo log number in bits. 24 allows for 16.7m logs. */ +#define UndoLogNumberBits 24 + +/* The maximum valid undo log number. */ +#define MaxUndoLogNumber ((1 << UndoLogNumberBits) - 1) + +/* The width of an undo log offset in bits. 40 allows for 1TB per log. */ +#define UndoLogOffsetBits (64 - UndoLogNumberBits) + +/* Special value for undo record pointer which indicates that it is invalid. */ +#define InvalidUndoRecPtr ((UndoRecPtr) 0) + +/* End-of-list value when building linked lists of undo logs. */ +#define InvalidUndoLogNumber -1 + +/* + * This undo record pointer is used in the transaction header. This special + * value indicates that we don't currently have the start point of the next + * transaction; it will be updated with a valid value in the future. + */ +#define SpecialUndoRecPtr ((UndoRecPtr) 0xFFFFFFFFFFFFFFFF) + +/* + * The maximum amount of data that can be stored in an undo log. Can be set + * artificially low to test full log behavior. + */ +#define UndoLogMaxSize ((UndoLogOffset) 1 << UndoLogOffsetBits) + +/* Type for numbering undo logs. */ +typedef int UndoLogNumber; + +/* Extract the undo log number from an UndoRecPtr. */ +#define UndoRecPtrGetLogNo(urp) \ + ((urp) >> UndoLogOffsetBits) + +/* Extract the offset from an UndoRecPtr. 
*/ +#define UndoRecPtrGetOffset(urp) \ + ((urp) & ((UINT64CONST(1) << UndoLogOffsetBits) - 1)) + +/* Make an UndoRecPtr from an log number and offset. */ +#define MakeUndoRecPtr(logno, offset) \ + (((uint64) (logno) << UndoLogOffsetBits) | (offset)) + +/* The number of unusable bytes in the header of each block. */ +#define UndoLogBlockHeaderSize SizeOfPageHeaderData + +/* The number of usable bytes we can store per block. */ +#define UndoLogUsableBytesPerPage (BLCKSZ - UndoLogBlockHeaderSize) + +/* The pseudo-database OID used for undo logs. */ +#define UndoLogDatabaseOid 9 + +/* Length of undo checkpoint filename */ +#define UNDO_CHECKPOINT_FILENAME_LENGTH 16 + +/* + * UndoRecPtrIsValid + * True iff undoRecPtr is valid. + */ +#define UndoRecPtrIsValid(undoRecPtr) \ + ((bool) ((UndoRecPtr) (undoRecPtr) != InvalidUndoRecPtr)) + +/* Extract the relnode for an undo log. */ +#define UndoRecPtrGetRelNode(urp) \ + UndoRecPtrGetLogNo(urp) + +/* The only valid fork number for undo log buffers. */ +#define UndoLogForkNum MAIN_FORKNUM + +/* Compute the block number that holds a given UndoRecPtr. */ +#define UndoRecPtrGetBlockNum(urp) \ + (UndoRecPtrGetOffset(urp) / BLCKSZ) + +/* Compute the offset of a given UndoRecPtr in the page that holds it. */ +#define UndoRecPtrGetPageOffset(urp) \ + (UndoRecPtrGetOffset(urp) % BLCKSZ) + +/* Compare two undo checkpoint files to find the oldest file. */ +#define UndoCheckPointFilenamePrecedes(file1, file2) \ + (strcmp(file1, file2) < 0) + +/* What is the offset of the i'th non-header byte? */ +#define UndoLogOffsetFromUsableByteNo(i) \ + (((i) / UndoLogUsableBytesPerPage) * BLCKSZ + \ + UndoLogBlockHeaderSize + \ + ((i) % UndoLogUsableBytesPerPage)) + +/* How many non-header bytes are there before a given offset? 
*/ +#define UndoLogOffsetToUsableByteNo(offset) \ + (((offset) % BLCKSZ - UndoLogBlockHeaderSize) + \ + ((offset) / BLCKSZ) * UndoLogUsableBytesPerPage) + +/* Add 'n' usable bytes to offset stepping over headers to find new offset. */ +#define UndoLogOffsetPlusUsableBytes(offset, n) \ + UndoLogOffsetFromUsableByteNo(UndoLogOffsetToUsableByteNo(offset) + (n)) + +/* Populate a RelFileNode from an UndoRecPtr. */ +#define UndoRecPtrAssignRelFileNode(rfn, urp) \ + do \ + { \ + (rfn).spcNode = UndoRecPtrGetTablespace(urp); \ + (rfn).dbNode = UndoLogDatabaseOid; \ + (rfn).relNode = UndoRecPtrGetRelNode(urp); \ + } while (false); + +/* + * Control metadata for an active undo log. Lives in shared memory inside an + * UndoLogControl object, but also written to disk during checkpoints. + */ +typedef struct UndoLogMetaData +{ + UndoLogNumber logno; + UndoLogStatus status; + Oid tablespace; + UndoPersistence persistence; /* permanent, unlogged, temp? */ + UndoLogOffset insert; /* next insertion point (head) */ + UndoLogOffset end; /* one past end of highest segment */ + UndoLogOffset discard; /* oldest data needed (tail) */ + UndoLogOffset last_xact_start; /* last transaction's start undo offset */ + + /* + * If the same transaction is split over two undo logs then this stores the + * previous log number, see file header comments of undorecord.c for its + * usage. + * + * Fixme: See if we can find another way to handle it instead of keeping + * previous log number. + */ + UndoLogNumber prevlogno; /* Previous undo log number */ + bool is_first_rec; + + /* + * last undo record's length. We need to save this in undo meta and WAL + * log so that the value can be preserved across restart so that the first + * undo record after the restart can get this value properly. This will be + * used when going to the previous record of the transaction during rollback. 
+ * If the transaction did some work before a checkpoint and the rest after + * it, then without the prevlen of the record written before the checkpoint + * we cannot roll back properly. The undo worker also fetches this value + * when rolling back the last transaction in the undo log, to locate the + * last undo record of the transaction. + */ + uint16 prevlen; +} UndoLogMetaData; + +#ifndef FRONTEND + +/* + * The in-memory control object for an undo log. We have a fixed-size array + * of these. + */ +typedef struct UndoLogControl +{ + /* + * Protected by UndoLogLock and 'mutex'. Both must be held to steal this + * slot for another undolog. Either may be held to prevent that from + * happening. + */ + UndoLogNumber logno; /* InvalidUndoLogNumber for unused slots */ + + /* Protected by UndoLogLock. */ + UndoLogNumber next_free; /* link for active unattached undo logs */ + + /* Protected by 'mutex'. */ + LWLock mutex; + UndoLogMetaData meta; /* current meta-data */ + XLogRecPtr lsn; + bool need_attach_wal_record; /* need to write an attach WAL record? */ + pid_t pid; /* InvalidPid for unattached */ + TransactionId xid; + + /* Protected by 'discard_lock'. State used by undo workers. */ + LWLock discard_lock; /* prevents discarding while reading */ + TransactionId oldest_xid; /* cache of oldest transaction's xid */ + uint32 oldest_xidepoch; + UndoRecPtr oldest_data; + +} UndoLogControl; + +extern UndoLogControl *UndoLogGet(UndoLogNumber logno, bool missing_ok); +extern UndoLogControl *UndoLogNext(UndoLogControl *log); +extern bool AmAttachedToUndoLog(UndoLogControl *log); +extern UndoRecPtr UndoLogGetFirstValidRecord(UndoLogControl *log, bool *full); + +/* + * Each backend maintains a small hash table mapping undo log numbers to + * UndoLogControl objects in shared memory. + * + * We also cache the tablespace here, since we need fast access to that when + * resolving UndoRecPtr to a buffer tag. 
We could also reach that via + * control->meta.tablespace, but that can't be accessed without locking (since + * the UndoLogControl object might be recycled). Since the tablespace for a + * given undo log is constant for the whole life of the undo log, there is no + * invalidation problem to worry about. + */ +typedef struct UndoLogTableEntry +{ + UndoLogNumber number; + UndoLogControl *control; + Oid tablespace; + char status; +} UndoLogTableEntry; + +/* + * Instantiate fast inline hash table access functions. We use an identity + * hash function for speed, since we already have integers and don't expect + * many collisions. + */ +#define SH_PREFIX undologtable +#define SH_ELEMENT_TYPE UndoLogTableEntry +#define SH_KEY_TYPE UndoLogNumber +#define SH_KEY number +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +extern PGDLLIMPORT undologtable_hash *undologtable_cache; + +/* + * Find the OID of the tablespace that holds a given UndoRecPtr. This is + * included in the header so it can be inlined by UndoRecPtrAssignRelFileNode. + */ +static inline Oid +UndoRecPtrGetTablespace(UndoRecPtr urp) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(urp); + UndoLogTableEntry *entry; + + /* + * Fast path, for undo logs we've seen before. This is safe because + * tablespaces are constant for the lifetime of an undo log number. + */ + entry = undologtable_lookup(undologtable_cache, logno); + if (likely(entry)) + return entry->tablespace; + + /* + * Slow path: force cache entry to be created. Raises an error if the + * undo log has been entirely discarded, or hasn't been created yet. That + * is appropriate here, because this interface is designed for accessing + * undo pages via bufmgr, and we should never be trying to access undo + * pages that have been discarded. 
+ */ + UndoLogGet(logno, false); + + /* + * We use the value from the newly created cache entry, because it's + * cheaper than acquiring log->mutex and reading log->meta.tablespace. + */ + entry = undologtable_lookup(undologtable_cache, logno); + return entry->tablespace; +} +#endif + +/* Space management. */ +extern UndoRecPtr UndoLogAllocate(size_t size, + UndoPersistence level); +extern UndoRecPtr UndoLogAllocateInRecovery(TransactionId xid, + size_t size, + UndoPersistence persistence); +extern void UndoLogAdvance(UndoRecPtr insertion_point, + size_t size, + UndoPersistence persistence); +extern void UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid); +extern bool UndoLogIsDiscarded(UndoRecPtr point); + +/* Initialization interfaces. */ +extern void StartupUndoLogs(XLogRecPtr checkPointRedo); +extern void UndoLogShmemInit(void); +extern Size UndoLogShmemSize(void); +extern void UndoLogInit(void); +extern void UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, + char *path); +extern void ResetUndoLogs(UndoPersistence persistence); + +/* Interface used by tablespace.c. */ +extern bool DropUndoLogsInTablespace(Oid tablespace); + +/* GUC interfaces. */ +extern void assign_undo_tablespaces(const char *newval, void *extra); + +/* Checkpointing interfaces. 
*/ +extern void CheckPointUndoLogs(XLogRecPtr checkPointRedo, + XLogRecPtr priorCheckPointRedo); + +extern void UndoLogSetLastXactStartPoint(UndoRecPtr point); +extern UndoRecPtr UndoLogGetLastXactStartPoint(UndoLogNumber logno); +extern UndoRecPtr UndoLogGetNextInsertPtr(UndoLogNumber logno, + TransactionId xid); +extern UndoRecPtr UndoLogGetLastRecordPtr(UndoLogNumber, + TransactionId xid); +extern void UndoLogRewind(UndoRecPtr insert_urp, uint16 prevlen); +extern bool IsTransactionFirstRec(TransactionId xid); +extern void UndoLogSetPrevLen(UndoLogNumber logno, uint16 prevlen); +extern uint16 UndoLogGetPrevLen(UndoLogNumber logno); +extern void UndoLogSetLSN(XLogRecPtr lsn); +void UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno); +/* Redo interface. */ +extern void undolog_redo(XLogReaderState *record); +/* Discard the undo logs for temp tables */ +extern void TempUndoDiscard(UndoLogNumber); +extern Oid UndoLogStateGetDatabaseId(void); + +/* Test-only interfacing. */ +extern void UndoLogDetachFull(void); + +#endif diff --git a/src/include/access/undolog_xlog.h b/src/include/access/undolog_xlog.h new file mode 100644 index 0000000..fe88ac5 --- /dev/null +++ b/src/include/access/undolog_xlog.h @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * undolog_xlog.h + * undo log access XLOG definitions. 
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_XLOG_H +#define UNDOLOG_XLOG_H + +#include "access/undolog.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +/* XLOG records */ +#define XLOG_UNDOLOG_CREATE 0x00 +#define XLOG_UNDOLOG_EXTEND 0x10 +#define XLOG_UNDOLOG_ATTACH 0x20 +#define XLOG_UNDOLOG_DISCARD 0x30 +#define XLOG_UNDOLOG_REWIND 0x40 +#define XLOG_UNDOLOG_META 0x50 + +/* Create a new undo log. */ +typedef struct xl_undolog_create +{ + UndoLogNumber logno; + Oid tablespace; + UndoPersistence persistence; +} xl_undolog_create; + +/* Extend an undo log by adding a new segment. */ +typedef struct xl_undolog_extend +{ + UndoLogNumber logno; + UndoLogOffset end; +} xl_undolog_extend; + +/* Record the undo log number used for a transaction. */ +typedef struct xl_undolog_attach +{ + TransactionId xid; + UndoLogNumber logno; + Oid dbid; +} xl_undolog_attach; + +/* Discard space, and possibly destroy or recycle undo log segments. */ +typedef struct xl_undolog_discard +{ + UndoLogNumber logno; + UndoLogOffset discard; + UndoLogOffset end; + TransactionId latestxid; /* latest xid whose undolog are discarded. */ + bool entirely_discarded; +} xl_undolog_discard; + +/* Rewind insert location of the undo log. 
*/ +typedef struct xl_undolog_rewind +{ + UndoLogNumber logno; + UndoLogOffset insert; + uint16 prevlen; +} xl_undolog_rewind; + +extern void undolog_desc(StringInfo buf,XLogReaderState *record); +extern const char *undolog_identify(uint8 info); + +#endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 4026018..b4c3ad9 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10038,4 +10038,11 @@ proargnames => '{rootrelid,relid,parentrelid,isleaf,level}', prosrc => 'pg_partition_tree' } +# undo logs +{ oid => '5032', descr => 'list undo logs', + proname => 'pg_stat_get_undo_logs', procost => '1', prorows => '10', proretset => 't', + prorettype => 'record', proargtypes => '', + proallargtypes => '{oid,text,text,text,text,text,xid,int4,oid,text}', proargmodes => '{o,o,o,o,o,o,o,o,o,o}', + proargnames => '{logno,persistence,tablespace,discard,insert,end,xid,pid,prev_logno,status}', prosrc => 'pg_stat_get_undo_logs' }, + ] diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b2dcb73..4305af6 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -219,6 +219,8 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SHARED_TUPLESTORE, LWTRANCHE_TBM, LWTRANCHE_PARALLEL_APPEND, + LWTRANCHE_UNDOLOG, + LWTRANCHE_UNDODISCARD, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index f462eab..217d80a 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -426,6 +426,8 @@ extern void GUC_check_errcode(int sqlerrcode); extern bool check_default_tablespace(char **newval, void **extra, GucSource source); extern bool check_temp_tablespaces(char **newval, void **extra, GucSource source); extern void assign_temp_tablespaces(const char *newval, void *extra); +extern bool check_undo_tablespaces(char **newval, void **extra, GucSource source); +extern void assign_undo_tablespaces(const char *newval, void 
*extra); /* in catalog/namespace.c */ extern bool check_search_path(char **newval, void **extra, GucSource source); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 735dd37..f3de192 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1918,6 +1918,17 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.autoanalyze_count FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text)); +pg_stat_undo_logs| SELECT pg_stat_get_undo_logs.logno, + pg_stat_get_undo_logs.persistence, + pg_stat_get_undo_logs.tablespace, + pg_stat_get_undo_logs.discard, + pg_stat_get_undo_logs.insert, + pg_stat_get_undo_logs."end", + pg_stat_get_undo_logs.xid, + pg_stat_get_undo_logs.pid, + pg_stat_get_undo_logs.prev_logno, + pg_stat_get_undo_logs.status + FROM pg_stat_get_undo_logs() pg_stat_get_undo_logs(logno, persistence, tablespace, discard, insert, "end", xid, pid, prev_logno, status); pg_stat_user_functions| SELECT p.oid AS funcid, n.nspname AS schemaname, p.proname AS funcname, -- 1.8.3.1