From 3531ead1788045b602f43af06fc1ba3ddf74c46b Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi Date: Wed, 15 Mar 2023 15:42:09 +0900 Subject: [PATCH v27 2/3] In-place table persistence change Currently, ALTER TABLE SET LOGGED/UNLOGGED causes a large amount of file I/O due to heap rewrite, even though ALTER TABLE SET UNLOGGED does not require any data rewrites. In addition, this patch changes ALTER TABLE SET LOGGED to emit XLOG_FPI records instead of a large number of HEAP_INSERT's when wal_level > minimal, as this option is likely to be less resource intensive. --- src/backend/access/rmgrdesc/smgrdesc.c | 12 + src/backend/catalog/storage.c | 290 ++++++++++++++++++++++- src/backend/commands/tablecmds.c | 269 ++++++++++++++++++--- src/backend/storage/buffer/bufmgr.c | 85 +++++++ src/backend/storage/file/reinit.c | 51 +++- src/bin/pg_rewind/parsexlog.c | 6 + src/bin/pg_rewind/pg_rewind.c | 1 - src/include/catalog/storage_xlog.h | 8 + src/include/storage/bufmgr.h | 2 + src/test/recovery/t/013_crash_restart.pl | 21 -- src/tools/pgindent/typedefs.list | 1 + 11 files changed, 673 insertions(+), 73 deletions(-) diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index f8187385c4..e2998a3ee4 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -71,6 +71,15 @@ smgr_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "%s %s", action, path); pfree(path); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec; + char *path = relpathperm(xlrec->rlocator, MAIN_FORKNUM); + + appendStringInfoString(buf, path); + appendStringInfo(buf, " persistence %d", xlrec->persistence); + pfree(path); + } } const char * @@ -92,6 +101,9 @@ smgr_identify(uint8 info) case XLOG_SMGR_MARK: id = "MARK"; break; + case XLOG_SMGR_BUFPERSISTENCE: + id = "BUFPERSISTENCE"; + break; } return id; diff --git a/src/backend/catalog/storage.c 
b/src/backend/catalog/storage.c index 03e06246be..97d1230ee8 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -69,11 +69,13 @@ typedef struct PendingRelDelete #define PCOP_UNLINK_FORK (1 << 0) #define PCOP_UNLINK_MARK (1 << 1) +#define PCOP_SET_PERSISTENCE (1 << 2) typedef struct PendingCleanup { RelFileLocator rlocator; /* relation that need a cleanup */ int op; /* operation mask */ + bool bufpersistence; /* buffer persistence to set */ ForkNumber unlink_forknum; /* forknum to unlink */ StorageMarks unlink_mark; /* mark to unlink */ BackendId backend; /* InvalidBackendId if not a temp rel */ @@ -223,6 +225,202 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return srel; } +/* + * RelationCreateInitFork + * Create physical storage for the init fork of a relation. + * + * Create the init fork for the relation. + * + * This function is transactional. The creation is WAL-logged, and if the + * transaction aborts later on, the init fork will be removed. + */ +void +RelationCreateInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + SMgrRelation srel; + bool create = true; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), false, false); + + /* + * If we have a pending-unlink for the init-fork of this relation, that + * means the init-fork exists since before the current transaction + * started. This function reverts that change just by removing the entry. + * See RelationDropInitFork. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + create = false; + } + else + prev = pending; + } + + if (!create) + return; + + /* create the init fork, along with the mark file */ + srel = smgropen(rlocator, InvalidBackendId); + log_smgrcreatemark(&rlocator, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED); + smgrcreatemark(srel, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED, false); + + /* We don't have existing init fork, create it. */ + smgrcreate(srel, INIT_FORKNUM, false); + + /* + * For index relations, WAL-logging and file sync are performed by + * ambuildempty. On the other hand, we manually perform these tasks here + * for heap relations. + */ + if (rel->rd_rel->relkind == RELKIND_INDEX) + rel->rd_indam->ambuildempty(rel); + else + { + log_smgrcreate(&rlocator, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + /* drop the init fork, mark file then revert persistence at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK | PCOP_UNLINK_MARK | PCOP_SET_PERSISTENCE; + pending->unlink_forknum = INIT_FORKNUM; + pending->unlink_mark = SMGR_MARK_UNCOMMITTED; + pending->bufpersistence = true; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + + /* drop mark file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_MARK; + pending->unlink_forknum = INIT_FORKNUM; + pending->unlink_mark = SMGR_MARK_UNCOMMITTED; + 
+ pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + +/* + * RelationDropInitFork + * Delete physical storage for the init fork of a relation. + */ +void +RelationDropInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + bool inxact_created = false; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), true, false); + + /* + * Search for pending-unlink associated with the init-fork of the + * relation. The presence of one indicates that the init fork was created + * within the current transaction. + */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + /* unlink list entry */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + inxact_created = true; + } + else + prev = pending; + } + + /* + * If the init-fork was created in this transaction, we immediately remove + * both the init fork and mark file. Otherwise, we register an at-commit + * pending-unlink for the existing init fork. See + * RelationCreateInitFork. + */ + if (inxact_created) + { + SMgrRelation srel = smgropen(rlocator, InvalidBackendId); + ForkNumber forknum = INIT_FORKNUM; + BlockNumber firstblock = 0; + + /* + * Some AMs initialize INIT fork via buffer manager. To properly drop + * the init fork, we need to drop all buffers for the INIT fork first, + * then unlink the INIT fork along with the mark file. 
+ */ + DropRelationBuffers(srel, &forknum, 1, &firstblock); + log_smgrunlinkmark(&rlocator, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED); + smgrunlinkmark(srel, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED, false); + log_smgrunlink(&rlocator, INIT_FORKNUM); + smgrunlink(srel, INIT_FORKNUM, false); + return; + } + + /* register drop of this init fork file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK; + pending->unlink_forknum = INIT_FORKNUM; + pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + + /* revert buffer-persistence changes at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = false; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + /* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ @@ -305,6 +503,25 @@ log_smgrunlinkmark(const RelFileLocator *rlocator, ForkNumber forkNum, XLogInsert(RM_SMGR_ID, XLOG_SMGR_MARK | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_BUFPERSISTENCE record to WAL. + */ +void +log_smgrbufpersistence(const RelFileLocator *rlocator, bool persistence) +{ + xl_smgr_bufpersistence xlrec; + + /* + * Make an XLOG entry reporting the change of buffer persistence. + */ + xlrec.rlocator = *rlocator; + xlrec.persistence = persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_BUFPERSISTENCE | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. 
@@ -858,10 +1075,28 @@ smgrDoPendingCleanups(bool isCommit) srel = smgropen(pending->rlocator, pending->backend); Assert((pending->op & - ~(PCOP_UNLINK_FORK | PCOP_UNLINK_MARK)) == 0); + ~(PCOP_UNLINK_FORK | PCOP_UNLINK_MARK | + PCOP_SET_PERSISTENCE)) == 0); + + if (pending->op & PCOP_SET_PERSISTENCE) + { + SetRelationBuffersPersistence(srel, pending->bufpersistence, + InRecovery); + } if (pending->op & PCOP_UNLINK_FORK) { + /* + * Unlink the fork file. Currently we only apply this + * operation for init forks and it is certain that the init + * fork is not loaded on shared buffers at this point. In + * the case of RelationDropInitFork, the function should + * have dropped buffers. In the case of + * RelationCreateInitFork, PCOP_SET_PERSISTENCE is set and + * the buffers were dropped just before. + */ + Assert(pending->unlink_forknum == INIT_FORKNUM); + /* Don't emit wal while recovery. */ if (!InRecovery) log_smgrunlink(&pending->rlocator, @@ -1311,6 +1546,59 @@ smgr_redo(XLogReaderState *record) } } } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = + (xl_smgr_bufpersistence *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + PendingCleanup *prev = NULL; + + reln = smgropen(xlrec->rlocator, InvalidBackendId); + SetRelationBuffersPersistence(reln, xlrec->persistence, true); + + /* + * Delete pending action for persistence change if any. We should have + * at most one entry for this action. + */ + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) && + (pending->op & PCOP_SET_PERSISTENCE) != 0) + { + Assert(pending->bufpersistence == xlrec->persistence); + + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + + /* + * At abort time, revert any changes to buffer-persistence that were + * made in this transaction. 
+ */ + if (!pending) + { + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = xlrec->rlocator; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = !xlrec->persistence; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + } else elog(PANIC, "smgr_redo: unknown op code %u", info); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 3e2c5f797c..becef96927 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -55,6 +55,7 @@ #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/policy.h" +#include "commands/progress.h" #include "commands/sequence.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" @@ -5439,6 +5440,189 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, return newcmd; } +/* + * RelationChangePersistence: perform in-place persistence change of a relation + */ +static void +RelationChangePersistence(AlteredTableInfo *tab, char persistence, + LOCKMODE lockmode) +{ + Relation rel; + Relation classRel; + HeapTuple tuple, + newtuple; + Datum new_val[Natts_pg_class]; + bool new_null[Natts_pg_class], + new_repl[Natts_pg_class]; + int i; + List *relids; + ListCell *lc_oid; + + Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE); + Assert(lockmode == AccessExclusiveLock); + + /* + * ATRewriteTable should be used instead of this function under the + * following condition. 
+ */ + Assert(tab->constraints == NULL && tab->partition_constraint == NULL && + tab->newvals == NULL && !tab->verify_new_notnull); + + rel = table_open(tab->relid, lockmode); + + Assert(rel->rd_rel->relpersistence != persistence); + + elog(DEBUG1, "perform in-place persistence change"); + + /* + * Initially we gather all relations that require persistence change. + */ + + /* Collect OIDs of indexes and toast relations */ + relids = RelationGetIndexList(rel); + relids = lcons_oid(rel->rd_id, relids); + + /* Add toast relation if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + List *toastidx; + Relation toastrel = table_open(rel->rd_rel->reltoastrelid, lockmode); + + relids = lappend_oid(relids, rel->rd_rel->reltoastrelid); + toastidx = RelationGetIndexList(toastrel); + relids = list_concat(relids, toastidx); + pfree(toastidx); + table_close(toastrel, NoLock); + } + + table_close(rel, NoLock); + + /* Make changes in storage */ + classRel = table_open(RelationRelationId, RowExclusiveLock); + + foreach(lc_oid, relids) + { + Oid reloid = lfirst_oid(lc_oid); + Relation r = relation_open(reloid, lockmode); + + /* + * XXXX: Some access methods do not support in-place persistence + * changes. GiST uses page LSNs to figure out whether a block has been + * modified. However UNLOGGED GiST indexes use fake LSNs that are + * incompatible with the real LSNs used for LOGGED indexes. + * + * Maybe if gistGetFakeLSN behaved the same way for permanent and + * unlogged indexes, we could potentially avoid index rebuilds in + * exchange for emitting some extra WAL records while the index is + * unlogged. + * + * Check relam against a positive list so that we take the hard way for + * unknown AMs. 
+ */ + if (r->rd_rel->relkind == RELKIND_INDEX && + /* GiST is excluded */ + r->rd_rel->relam != BTREE_AM_OID && + r->rd_rel->relam != HASH_AM_OID && + r->rd_rel->relam != GIN_AM_OID && + r->rd_rel->relam != SPGIST_AM_OID && + r->rd_rel->relam != BRIN_AM_OID) + { + int reindex_flags; + ReindexParams params = {0}; + + /* reindex doesn't allow concurrent use of the index */ + table_close(r, NoLock); + + reindex_flags = + REINDEX_REL_SUPPRESS_INDEX_USE | + REINDEX_REL_CHECK_CONSTRAINTS; + + /* Set the same persistence as the parent relation. */ + if (persistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + + reindex_index(reloid, reindex_flags, persistence, &params); + + continue; + } + + /* Create or drop init fork */ + if (persistence == RELPERSISTENCE_UNLOGGED) + RelationCreateInitFork(r); + else + RelationDropInitFork(r); + + /* + * If this relation is changed to WAL-logged, immediately sync all + * files except for init fork to establish the initial state on + * storage. The buffers should have already been flushed out by + * RelationCreate(Drop)InitFork called immediately above. The init fork + * should have already been synchronized as needed. 
+ */ + if (persistence == RELPERSISTENCE_PERMANENT) + { + for (i = 0; i < INIT_FORKNUM; i++) + { + if (smgrexists(RelationGetSmgr(r), i)) + smgrimmedsync(RelationGetSmgr(r), i); + } + } + + /* Update catalog */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", reloid); + + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + + new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence); + new_null[Anum_pg_class_relpersistence - 1] = false; + new_repl[Anum_pg_class_relpersistence - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + + /* + * If wal_level >= replica, switching to LOGGED requires the relation + * content to be WAL-logged for later recovery. We don't emit this if + * wal_level = minimal. + */ + if (persistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded()) + { + ForkNumber fork; + xl_smgr_truncate xlrec; + + xlrec.blkno = 0; + xlrec.rlocator = r->rd_locator; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + for (fork = 0; fork < INIT_FORKNUM; fork++) + { + if (smgrexists(RelationGetSmgr(r), fork)) + log_newpage_range(r, fork, 0, + smgrnblocks(RelationGetSmgr(r), fork), + false); + } + } + + table_close(r, NoLock); + } + + table_close(classRel, NoLock); +} + /* * ATRewriteTables: ALTER TABLE phase 3 */ @@ -5569,48 +5753,55 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, tab->relid, tab->rewrite); - /* - * Create transient table that will receive the modified data. - * - * Ensure it is marked correctly as logged or unlogged. 
We have - * to do this here so that buffers for the new relfilenumber will - * have the right persistence set, and at the same time ensure - * that the original filenumbers's buffers will get read in with - * the correct setting (i.e. the original one). Otherwise a - * rollback after the rewrite would possibly result with buffers - * for the original filenumbers having the wrong persistence - * setting. - * - * NB: This relies on swap_relation_files() also swapping the - * persistence. That wouldn't work for pg_class, but that can't be - * unlogged anyway. - */ - OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, NewAccessMethod, - persistence, lockmode); + if (tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE) + RelationChangePersistence(tab, persistence, lockmode); + else + { + /* + * Create transient table that will receive the modified data. + * + * Ensure it is marked correctly as logged or unlogged. We + * have to do this here so that buffers for the new + * relfilenumber will have the right persistence set, and at + * the same time ensure that the original filenumbers's buffers + * will get read in with the correct setting (i.e. the original + * one). Otherwise a rollback after the rewrite would possibly + * result with buffers for the original filenumbers having the + * wrong persistence setting. + * + * NB: This relies on swap_relation_files() also swapping the + * persistence. That wouldn't work for pg_class, but that + * can't be unlogged anyway. + */ + OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, + NewAccessMethod, + persistence, lockmode); - /* - * Copy the heap data into the new table with the desired - * modifications, and test the current data within the table - * against new constraints generated by ALTER TABLE commands. 
- */ - ATRewriteTable(tab, OIDNewHeap, lockmode); + /* + * Copy the heap data into the new table with the desired + * modifications, and test the current data within the table + * against new constraints generated by ALTER TABLE commands. + */ + ATRewriteTable(tab, OIDNewHeap, lockmode); - /* - * Swap the physical files of the old and new heaps, then rebuild - * indexes and discard the old heap. We can use RecentXmin for - * the table's new relfrozenxid because we rewrote all the tuples - * in ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have no - * interest in letting this code work on system catalogs. - */ - finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, - !OidIsValid(tab->newTableSpace), - RecentXmin, - ReadNextMultiXactId(), - persistence); + /* + * Swap the physical files of the old and new heaps, then + * rebuild indexes and discard the old heap. We can use + * RecentXmin for the table's new relfrozenxid because we + * rewrote all the tuples in ATRewriteTable, so no older Xid + * remains in the table. Also, we never try to swap toast + * tables by content, since we have no interest in letting + * this code work on system catalogs. 
+ */ + finish_heap_swap(tab->relid, OIDNewHeap, + false, false, true, + !OidIsValid(tab->newTableSpace), + RecentXmin, + ReadNextMultiXactId(), + persistence); - InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + } } else if (tab->rewrite > 0 && tab->relkind == RELKIND_SEQUENCE) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0a05577b68..2b00ec3eed 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3240,6 +3240,91 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, } } +/* --------------------------------------------------------------------- + * SetRelationBuffersPersistence + * + * This function changes the persistence of all buffer pages of a relation + * then writes all dirty pages of the relation out to disk when switching + * to PERMANENT. (or more precisely, to kernel disk buffers), ensuring + * that the kernel has an up-to-date view of the relation. + * + * The caller must be holding AccessExclusiveLock on the target relation + * to ensure that no other backend is busy dirtying more blocks of the + * relation. + * + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. This routine is not used in + * any performance-critical code paths, so it's not worth additional + * overhead to make it go faster; but see also DropRelationBuffers. 
+ * -------------------------------------------------------------------- + */ +void +SetRelationBuffersPersistence(SMgrRelation srel, bool permanent, bool isRedo) +{ + int i; + RelFileLocatorBackend rlocator = srel->smgr_rlocator; + + Assert(!RelFileLocatorBackendIsTemp(rlocator)); + + if (!isRedo) + log_smgrbufpersistence(&srel->smgr_rlocator.locator, permanent); + + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + { + UnlockBufHdr(bufHdr, buf_state); + continue; + } + + if (permanent) + { + /* The init fork is being dropped, drop buffers for it. */ + if (BufTagGetForkNum(&bufHdr->tag) == INIT_FORKNUM) + { + InvalidateBuffer(bufHdr); + continue; + } + + buf_state |= BM_PERMANENT; + pg_atomic_write_u32(&bufHdr->state, buf_state); + + /* flush this buffer when switching to PERMANENT */ + if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), + LW_SHARED); + FlushBuffer(bufHdr, srel); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + else + { + /* There shouldn't be an init fork */ + Assert(BufTagGetForkNum(&bufHdr->tag) != INIT_FORKNUM); + UnlockBufHdr(bufHdr, buf_state & ~BM_PERMANENT); + } + } +} + /* --------------------------------------------------------------------- * DropRelationsAllBuffers * diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 250cfe9e44..bdd1200132 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -38,6 +38,7 @@ typedef struct { 
RelFileNumber relNumber; /* hash key */ bool has_init; /* has INIT fork */ + bool dirty_init; /* needs to remove INIT fork */ bool dirty_all; /* needs to remove all forks */ } relfile_entry; @@ -45,7 +46,10 @@ typedef struct * Clean up and reset relation files from before the last restart. * * If op includes UNLOGGED_RELATION_CLEANUP, we perform different operations - * depending on the existence of mark files. + * depending on the existence of the "cleanup" forks. + * + * If SMGR_MARK_UNCOMMITTED mark file for init fork is present, we remove the + * init fork along with the mark file. * * If SMGR_MARK_UNCOMMITTED mark file for main fork is present we remove the * whole relation along with the mark file. @@ -54,7 +58,7 @@ typedef struct * with the "init" fork, except for the "init" fork itself. * * If op includes UNLOGGED_RELATION_DROP_BUFFER, we drop all buffers for all - * relations that are to be cleaned up. + * relations that have the "cleanup" and/or the "init" forks. * * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main * fork. @@ -241,7 +245,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, * Put the OID portion of the name into the hash table, * if it isn't already. If it has SMGR_MARK_UNCOMMITTED mark * files, the storage file is in dirty state, where clean up is - * needed. + * needed. 
*/ key = atooid(de->d_name); ent = hash_search(hash, &key, HASH_ENTER, &found); @@ -249,10 +253,13 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, if (!found) { ent->has_init = false; + ent->dirty_init = false; ent->dirty_all = false; } - if (forkNum == MAIN_FORKNUM && mark == SMGR_MARK_UNCOMMITTED) + if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_UNCOMMITTED) + ent->dirty_init = true; + else if (forkNum == MAIN_FORKNUM && mark == SMGR_MARK_UNCOMMITTED) ent->dirty_all = true; else { @@ -276,11 +283,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, { /* * When we come here after recovery, smgr object for this file might - * have been created. In that case we need to drop all buffers then the - * smgr object. Otherwise checkpointer wrongly tries to flush buffers - * for nonexistent relation storage. This is safe as far as no other - * backends have accessed the relation before starting archive - * recovery. + * have been created. In that case we need to drop all buffers then + * the smgr object before initializing the unlogged relation. This is + * safe as far as no other backends have accessed the relation before + * starting archive recovery. */ HASH_SEQ_STATUS status; relfile_entry *ent; @@ -296,6 +302,13 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, { RelFileLocatorBackend rel; + /* + * The relation is persistent and stays persistent. Don't drop the + * buffers for this relation. + */ + if (ent->has_init && ent->dirty_init) + continue; + if (maxrels <= nrels) { maxrels *= 2; @@ -352,8 +365,24 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, if (!ent->has_init) continue; - if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_NONE) - continue; + if (ent->dirty_init) + { + /* + * The crashed transaction did SET UNLOGGED. This relation + * is restored to a LOGGED relation. 
+ */ + if (forkNum != INIT_FORKNUM) + continue; + } + else + { + /* + * we don't remove the INIT fork of a non-dirty + * relation files. + */ + if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_NONE) + continue; + } } /* so, nuke it! */ diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index e9e4bafb01..ddc8014e55 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -434,6 +434,12 @@ extractPageInfo(XLogReaderState *record) * empty so we don't need to bother the content. */ } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_BUFPERSISTENCE) + { + /* + * We can safely ignore these. These don't make any on-disk changes. + */ + } else if (rmid == RM_XACT_ID && ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index f7f3b8227f..b3a1f255d7 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -460,7 +460,6 @@ main(int argc, char **argv) if (showprogress) pg_log_info("reading source file list"); source->traverse_files(source, &process_source_file); - if (showprogress) pg_log_info("reading target file list"); traverse_datadir(datadir_target, &process_target_file); diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index a36646c6ee..6e79c68f5b 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -62,6 +62,12 @@ typedef struct xl_smgr_mark smgr_mark_action action; } xl_smgr_mark; +typedef struct xl_smgr_bufpersistence +{ + RelFileLocator rlocator; + bool persistence; +} xl_smgr_bufpersistence; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -82,6 +88,8 @@ extern void log_smgrcreatemark(const RelFileLocator *rlocator, ForkNumber forkNum, StorageMarks mark); extern void log_smgrunlinkmark(const RelFileLocator *rlocator, ForkNumber 
forkNum, StorageMarks mark); +extern void log_smgrbufpersistence(const RelFileLocator *rlocator, + bool persistence); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b8a18b8081..fd34810dc2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -156,6 +156,8 @@ extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln, int nforks, BlockNumber *firstDelBlock); extern void DropRelationsAllBuffers(struct SMgrRelationData **smgr_reln, int nlocators); +extern void SetRelationBuffersPersistence(struct SMgrRelationData *srel, + bool permanent, bool isRedo); extern void DropDatabaseBuffers(Oid dbid); #define RelationGetNumberOfBlocks(reln) \ diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl index 9def8d2062..92e7b367df 100644 --- a/src/test/recovery/t/013_crash_restart.pl +++ b/src/test/recovery/t/013_crash_restart.pl @@ -86,24 +86,6 @@ ok( pump_until( $killme_stdout = ''; $killme_stderr = ''; -#create a table that should *not* survive, but has rows. -#the table's contents is requried to cause access to the storage file -#after a restart. -$killme_stdin .= q[ -CREATE TABLE not_alive AS SELECT 1 as a; -SELECT pg_relation_filepath('not_alive'); -]; -ok( pump_until( - $killme, $psql_timeout, - \$killme_stdout, qr/[[:alnum:]\/]+[\r\n]$/m), - 'added in-creation table'); -my $not_alive_relfile = $node->data_dir . "/" . $killme_stdout; -chomp($not_alive_relfile); -$killme_stdout = ''; -$killme_stderr = ''; - -# The relfile must be exists now -ok ( -e $not_alive_relfile, 'relfile for in-creation table'); # Start longrunning query in second session; its failure will signal that # crash-restart has occurred. 
The initial wait for the trivial select is to @@ -162,9 +144,6 @@ $killme->run(); ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', ''); $monitor->run(); -# The relfile must have been removed due to the recent restart. -ok ( ! -e $not_alive_relfile, - 'relfile for the in-creation table should be removed after restart'); # Acquire pid of new backend $killme_stdin .= q[ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 747b7557dc..8dbbb09e8c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3886,6 +3886,7 @@ xl_replorigin_set xl_restore_point xl_running_xacts xl_seq_rec +xl_smgr_bufpersistence xl_smgr_create xl_smgr_mark xl_smgr_truncate -- 2.31.1