From 29b68dfd1560b574201bb44ec7692828650de9b3 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi Date: Mon, 27 May 2019 16:06:30 +0900 Subject: [PATCH 2/3] Fix WAL skipping feature WAL-skipping operations mixed with WAL-logged operations can lead to database corruption after a crash. This patch changes the WAL-skipping feature so that no data modifcation is WAL-logged at all then sync such relations at commit. --- src/backend/access/heap/heapam.c | 4 +- src/backend/access/heap/heapam_handler.c | 22 +---------- src/backend/access/heap/rewriteheap.c | 13 ++----- src/backend/catalog/storage.c | 64 +++++++++++++++++++++++++------- src/backend/commands/cluster.c | 24 ++++++++++++ src/backend/commands/copy.c | 39 ++++--------------- src/backend/commands/createas.c | 5 +-- src/backend/commands/matview.c | 4 -- src/backend/commands/tablecmds.c | 10 ++--- src/backend/storage/buffer/bufmgr.c | 33 +++++++++++----- src/backend/utils/cache/relcache.c | 16 ++++++-- src/include/access/heapam.h | 1 - src/include/access/rewriteheap.h | 2 +- src/include/access/tableam.h | 40 ++------------------ src/include/storage/bufmgr.h | 1 + src/include/utils/rel.h | 17 ++++++++- 16 files changed, 148 insertions(+), 147 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 94309949fa..2f1d68762b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1941,7 +1941,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) + if (RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2124,7 +2124,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 09bc6fe98a..b9554f6064 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -556,18 +556,6 @@ tuple_lock_retry: return result; } -static void -heapam_finish_bulk_insert(Relation relation, int options) -{ - /* - * If we skipped writing WAL, then we need to sync the heap (but not - * indexes since those use WAL anyway / don't go through tableam) - */ - if (options & HEAP_INSERT_SKIP_WAL) - heap_sync(relation); -} - - /* ------------------------------------------------------------------------ * DDL related callbacks for heap AM. * ------------------------------------------------------------------------ @@ -699,7 +687,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, IndexScanDesc indexScan; TableScanDesc tableScan; HeapScanDesc heapScan; - bool use_wal; bool is_system_catalog; Tuplesortstate *tuplesort; TupleDesc oldTupDesc = RelationGetDescr(OldHeap); @@ -713,12 +700,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, /* Remember if it's a system catalog */ is_system_catalog = IsSystemRelation(OldHeap); - /* - * We need to log the copied data in WAL iff WAL archiving/streaming is - * enabled AND it's a WAL-logged rel. - */ - use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap); - /* use_wal off requires smgr_targblock be initially invalid */ Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); @@ -729,7 +710,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, /* Initialize the rewrite operation */ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, - *multi_cutoff, use_wal); + *multi_cutoff); /* Set up sorting if wanted */ @@ -2517,7 +2498,6 @@ static const TableAmRoutine heapam_methods = { .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, .tuple_lock = heapam_tuple_lock, - .finish_bulk_insert = heapam_finish_bulk_insert, .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 72a448ad31..992d4b9880 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -144,7 +144,6 @@ typedef struct RewriteStateData Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ - bool rs_use_wal; /* must we WAL-log inserts? */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -238,15 +237,13 @@ static void logical_end_heap_rewrite(RewriteState state); * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen * min_multi multixact before which multis will be removed - * use_wal should the inserts to the new heap be WAL-logged? * * Returns an opaque RewriteState, allocated in current memory context, * to be used in subsequent calls to the other functions. */ RewriteState begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, MultiXactId cutoff_multi, - bool use_wal) + TransactionId freeze_xid, MultiXactId cutoff_multi) { RewriteState state; MemoryContext rw_cxt; @@ -271,7 +268,6 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; - state->rs_use_wal = use_wal; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; @@ -330,7 +326,7 @@ end_heap_rewrite(RewriteState state) /* Write the last page, if any */ if (state->rs_buffer_valid) { - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, @@ -654,9 +650,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { int options = HEAP_INSERT_SKIP_FSM; - if (!state->rs_use_wal) - options |= HEAP_INSERT_SKIP_WAL; - /* * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data * for the TOAST table are not logically decoded. The main heap is @@ -695,7 +688,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 3cc886f7fe..e4bcdc390f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -57,7 +57,8 @@ typedef struct PendingRelDelete { RelFileNode relnode; /* relation that may need to be deleted */ BackendId backend; /* InvalidBackendId if not a temp rel */ - bool atCommit; /* T=delete at commit; F=delete at abort */ + bool atCommit; /* T=work at commit; F=work at abort */ + bool dosync; /* T=work is sync; F=work is delete */ int nestLevel; /* xact nesting level of request */ struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; @@ -114,10 +115,29 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) pending->relnode = rnode; pending->backend = backend; pending->atCommit = false; /* delete if abort */ + pending->dosync = false; pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; pendingDeletes = pending; + /* + * We are going to skip WAL-logging for storage of persistent relations + * created in the current transaction when wal_level = minimal. The + * relation needs to be synced at commit. + */ + if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded()) + { + pending = (PendingRelDelete *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); + pending->relnode = rnode; + pending->backend = backend; + pending->atCommit = true; + pending->dosync = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingDeletes; + pendingDeletes = pending; + } + return srel; } @@ -155,6 +175,7 @@ RelationDropStorage(Relation rel) pending->relnode = rel->rd_node; pending->backend = rel->rd_backend; pending->atCommit = true; /* delete if commit */ + pending->dosync = false; pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; pendingDeletes = pending; @@ -428,21 +449,34 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->relnode, pending->backend); - - /* allocate the initial array, or extend it, if needed */ - if (maxrels == 0) + if (pending->dosync) { - maxrels = 8; - srels = palloc(sizeof(SMgrRelation) * maxrels); + /* Perform pending sync of WAL-skipped relation */ + FlushRelationBuffersWithoutRelcache(pending->relnode, + false); + srel = smgropen(pending->relnode, pending->backend); + smgrimmedsync(srel, MAIN_FORKNUM); + smgrclose(srel); } - else if (maxrels <= nrels) + else { - maxrels *= 2; - srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); - } + /* Collect pending deletions */ + srel = smgropen(pending->relnode, pending->backend); - srels[nrels++] = srel; + /* allocate the initial array, or extend it, if needed */ + if (maxrels == 0) + { + maxrels = 8; + srels = palloc(sizeof(SMgrRelation) * maxrels); + } + else if (maxrels <= nrels) + { + maxrels *= 2; + srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + } + + srels[nrels++] = srel; + } } /* must explicitly free the list entry */ pfree(pending); @@ -489,8 +523,9 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr) nrels = 0; for (pending = pendingDeletes; pending != NULL; pending = pending->next) { + /* Pending syncs are excluded */ if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit - && pending->backend == InvalidBackendId) + && pending->backend == InvalidBackendId && !pending->dosync) nrels++; } if (nrels == 0) @@ -502,8 +537,9 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr) *ptr = rptr; for (pending = pendingDeletes; pending != NULL; pending = pending->next) { + /* Pending syncs are excluded */ if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit - && pending->backend == InvalidBackendId) + && pending->backend == InvalidBackendId && !pending->dosync) { *rptr = pending->relnode; rptr++; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index cedb4ee844..29f7bf6dbd 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1034,12 +1034,36 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2)) { + Relation rel1; + Relation rel2; + /* * Normal non-mapped relations: swap relfilenodes, reltablespaces, * relpersistence */ Assert(!target_is_pg_class); + /* Update creation subid hints of relcache */ + rel1 = relation_open(r1, ExclusiveLock); + rel2 = relation_open(r2, ExclusiveLock); + + /* + * New relation's relfilenode is created in the current transaction + * and used as old ralation's new relfilenode. So its + * newRelfilenodeSubid as new relation's createSubid. We don't fix + * rel2 since it would be deleted soon. + */ + Assert(rel2->rd_createSubid != InvalidSubTransactionId); + rel1->rd_newRelfilenodeSubid = rel2->rd_createSubid; + + /* record the first relfilenode change in the current transaction */ + if (rel1->rd_firstRelfilenodeSubid == InvalidSubTransactionId) + rel1->rd_firstRelfilenodeSubid = GetCurrentSubTransactionId(); + + relation_close(rel1, ExclusiveLock); + relation_close(rel2, ExclusiveLock); + + /* swap relfilenodes, reltablespaces, relpersistence */ swaptemp = relform1->relfilenode; relform1->relfilenode = relform2->relfilenode; relform2->relfilenode = swaptemp; diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 4f04d122c3..f02efd59fc 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2535,9 +2535,6 @@ CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo, for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++) ExecDropSingleTupleTableSlot(buffer->slots[i]); - table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc, - miinfo->ti_options); - pfree(buffer); } @@ -2726,28 +2723,9 @@ CopyFrom(CopyState cstate) * If it does commit, we'll have done the table_finish_bulk_insert() at * the bottom of this routine first. * - * As mentioned in comments in utils/rel.h, the in-same-transaction test - * is not always set correctly, since in rare cases rd_newRelfilenodeSubid - * can be cleared before the end of the transaction. The exact case is - * when a relation sets a new relfilenode twice in same transaction, yet - * the second one fails in an aborted subtransaction, e.g. - * - * BEGIN; - * TRUNCATE t; - * SAVEPOINT save; - * TRUNCATE t; - * ROLLBACK TO save; - * COPY ... - * - * Also, if the target file is new-in-transaction, we assume that checking - * FSM for free space is a waste of time, even if we must use WAL because - * of archiving. This could possibly be wrong, but it's unlikely. - * - * The comments for table_tuple_insert and RelationGetBufferForTuple - * specify that skipping WAL logging is only safe if we ensure that our - * tuples do not go into pages containing tuples from any other - * transactions --- but this must be the case if we have a new table or - * new relfilenode, so we need no additional work to enforce that. + * If the target file is new-in-transaction, we assume that checking FSM + * for free space is a waste of time, even if we must use WAL because of + * archiving. This could possibly be wrong, but it's unlikely. * * We currently don't support this optimization if the COPY target is a * partitioned table as we currently only lazily initialize partition @@ -2763,15 +2741,14 @@ CopyFrom(CopyState cstate) * are not supported as per the description above. *---------- */ - /* createSubid is creation check, newRelfilenodeSubid is truncation check */ + /* + * createSubid is creation check, firstRelfilenodeSubid is truncation and + * cluster check. Partitioned table doesn't have storage. + */ if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) && (cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) - { + cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId)) ti_options |= TABLE_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - ti_options |= TABLE_INSERT_SKIP_WAL; - } /* * Optimize if new relfilenode was created in this subxact or one of its diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index b7d220699f..8a91d946e3 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -558,8 +558,7 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) * We can skip WAL-logging the insertions, unless PITR or streaming * replication is in use. We can skip the FSM in any case. */ - myState->ti_options = TABLE_INSERT_SKIP_FSM | - (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL); + myState->ti_options = TABLE_INSERT_SKIP_FSM; myState->bistate = GetBulkInsertState(); /* Not using WAL requires smgr_targblock be initially invalid */ @@ -604,8 +603,6 @@ intorel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - table_finish_bulk_insert(myState->rel, myState->ti_options); - /* close rel, but keep lock until commit */ table_close(myState->rel, NoLock); myState->rel = NULL; diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 537d0e8cef..1c854dcebf 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -463,8 +463,6 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) * replication is in use. We can skip the FSM in any case. */ myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN; - if (!XLogIsNeeded()) - myState->ti_options |= TABLE_INSERT_SKIP_WAL; myState->bistate = GetBulkInsertState(); /* Not using WAL requires smgr_targblock be initially invalid */ @@ -509,8 +507,6 @@ transientrel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - table_finish_bulk_insert(myState->transientrel, myState->ti_options); - /* close transientrel, but keep lock until commit */ table_close(myState->transientrel, NoLock); myState->transientrel = NULL; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index fb2be10794..bdb7b53d2a 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4762,9 +4762,9 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* * Prepare a BulkInsertState and options for table_tuple_insert. Because - * we're building a new heap, we can skip WAL-logging and fsync it to disk - * at the end instead (unless WAL-logging is required for archiving or - * streaming replication). The FSM is empty too, so don't bother using it. + * we're building a new heap, the underlying table AM can skip WAL-logging + * and fsync the relation to disk at the end of the current transaction + * instead. The FSM is empty too, so don't bother using it. */ if (newrel) { @@ -4772,8 +4772,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) bistate = GetBulkInsertState(); ti_options = TABLE_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - ti_options |= TABLE_INSERT_SKIP_WAL; } else { @@ -5058,8 +5056,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) { FreeBulkInsertState(bistate); - table_finish_bulk_insert(newrel, ti_options); - table_close(newrel, NoLock); } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6f3a402854..41ff6da9d9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -171,6 +171,7 @@ static HTAB *PrivateRefCountHash = NULL; static int32 PrivateRefCountOverflowed = 0; static uint32 PrivateRefCountClock = 0; static PrivateRefCountEntry *ReservedRefCountEntry = NULL; +static void FlushRelationBuffers_common(SMgrRelation smgr, bool islocal); static void ReservePrivateRefCountEntry(void); static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); @@ -3191,20 +3192,32 @@ PrintPinnedBufs(void) void FlushRelationBuffers(Relation rel) { - int i; - BufferDesc *bufHdr; - - /* Open rel at the smgr level if not already done */ RelationOpenSmgr(rel); - if (RelationUsesLocalBuffers(rel)) + FlushRelationBuffers_common(rel->rd_smgr, RelationUsesLocalBuffers(rel)); +} + +void +FlushRelationBuffersWithoutRelcache(RelFileNode rnode, bool islocal) +{ + FlushRelationBuffers_common(smgropen(rnode, InvalidBackendId), islocal); +} + +static void +FlushRelationBuffers_common(SMgrRelation smgr, bool islocal) +{ + RelFileNode rnode = smgr->smgr_rnode.node; + int i; + BufferDesc *bufHdr; + + if (islocal) { for (i = 0; i < NLocBuffer; i++) { uint32 buf_state; bufHdr = GetLocalBufferDescriptor(i); - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) && + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && ((buf_state = pg_atomic_read_u32(&bufHdr->state)) & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { @@ -3221,7 +3234,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - smgrwrite(rel->rd_smgr, + smgrwrite(smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, @@ -3251,18 +3264,18 @@ FlushRelationBuffers(Relation rel) * As in DropRelFileNodeBuffers, an unlocked precheck should be safe * and saves some cycles. */ - if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode)) continue; ReservePrivateRefCountEntry(); buf_state = LockBufHdr(bufHdr); - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) && + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, rel->rd_smgr); + FlushBuffer(bufHdr, smgr); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); UnpinBuffer(bufHdr, true); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 7aa5d7c7fa..b6b61d0b1b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2660,7 +2660,7 @@ static void RelationFlushRelation(Relation relation) { if (relation->rd_createSubid != InvalidSubTransactionId || - relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId) { /* * New relcache entries are always rebuilt, not flushed; else we'd @@ -2800,7 +2800,7 @@ RelationCacheInvalidate(void) * pending invalidations. */ if (relation->rd_createSubid != InvalidSubTransactionId || - relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId) continue; relcacheInvalsReceived++; @@ -3057,6 +3057,7 @@ AtEOXact_cleanup(Relation relation, bool isCommit) * Likewise, reset the hint about the relfilenode being new. */ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId; } /* @@ -3148,7 +3149,7 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit, } /* - * Likewise, update or drop any new-relfilenode-in-subtransaction hint. + * Likewise, update or drop any new-relfilenode-in-subtransaction hints. */ if (relation->rd_newRelfilenodeSubid == mySubid) { @@ -3157,6 +3158,15 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit, else relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; } + + + if (relation->rd_firstRelfilenodeSubid == mySubid) + { + if (isCommit) + relation->rd_firstRelfilenodeSubid = parentSubid; + else + relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId; + } } diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 858bcb6bc9..80c2e1bafc 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -29,7 +29,6 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL TABLE_INSERT_SKIP_WAL #define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM #define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN #define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h index 8056253916..7f9736e294 100644 --- a/src/include/access/rewriteheap.h +++ b/src/include/access/rewriteheap.h @@ -23,7 +23,7 @@ typedef struct RewriteStateData *RewriteState; extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap, TransactionId OldestXmin, TransactionId FreezeXid, - MultiXactId MultiXactCutoff, bool use_wal); + MultiXactId MultiXactCutoff); extern void end_heap_rewrite(RewriteState state); extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple, HeapTuple newTuple); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 7f81703b78..b652cd6cef 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -407,22 +407,6 @@ typedef struct TableAmRoutine uint8 flags, TM_FailureData *tmfd); - /* - * Perform operations necessary to complete insertions made via - * tuple_insert and multi_insert with a BulkInsertState specified. This - * may for example be used to flush the relation, when the - * TABLE_INSERT_SKIP_WAL option was used. - * - * Typically callers of tuple_insert and multi_insert will just pass all - * the flags that apply to them, and each AM has to decide which of them - * make sense for it, and then only take actions in finish_bulk_insert for - * those flags, and ignore others. - * - * Optional callback. - */ - void (*finish_bulk_insert) (Relation rel, int options); - - /* ------------------------------------------------------------------------ * DDL related functionality. * ------------------------------------------------------------------------ @@ -1087,10 +1071,6 @@ table_compute_xid_horizon_for_tuples(Relation rel, * The options bitmask allows the caller to specify options that may change the * behaviour of the AM. The AM will ignore options that it does not support. * - * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't - * need to be logged to WAL, even for a non-temp relation. It is the AMs - * choice whether this optimization is supported. - * * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse * free space in the relation. This can save some cycles when we know the * relation is new and doesn't contain useful amounts of free space. @@ -1112,8 +1092,7 @@ table_compute_xid_horizon_for_tuples(Relation rel, * heap's TOAST table, too, if the tuple requires any out-of-line data. * * The BulkInsertState object (if any; bistate can be NULL for default - * behavior) is also just passed through to RelationGetBufferForTuple. If - * `bistate` is provided, table_finish_bulk_insert() needs to be called. + * behavior) is also just passed through to RelationGetBufferForTuple. * * On return the slot's tts_tid and tts_tableOid are updated to reflect the * insertion. But note that any toasting of fields within the slot is NOT @@ -1248,6 +1227,8 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * + * See table_insert about skipping WAL-logging feature. + * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData * for additional info. @@ -1308,21 +1289,6 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, flags, tmfd); } -/* - * Perform operations necessary to complete insertions made via - * tuple_insert and multi_insert with a BulkInsertState specified. This - * e.g. may e.g. used to flush the relation when inserting with - * TABLE_INSERT_SKIP_WAL specified. - */ -static inline void -table_finish_bulk_insert(Relation rel, int options) -{ - /* optional callback */ - if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert) - rel->rd_tableam->finish_bulk_insert(rel, options); -} - - /* ------------------------------------------------------------------------ * DDL related functionality. * ------------------------------------------------------------------------ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 509f4b7ef1..ace5f5a2ae 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -189,6 +189,7 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum); extern void FlushOneBuffer(Buffer buffer); extern void FlushRelationBuffers(Relation rel); +extern void FlushRelationBuffersWithoutRelcache(RelFileNode rnode, bool islocal); extern void FlushDatabaseBuffers(Oid dbid); extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index d35b4a5061..5cbb5a7b27 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -76,10 +76,17 @@ typedef struct RelationData * transaction, with one of them occurring in a subsequently aborted * subtransaction, e.g. BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t; * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten + * rd_firstRelfilenodeSubid is the ID of the hightest subtransaction the + * relfilenode change has took place first in the current + * transaction. This won't be forgotten as newRelfilenodeSubid is. A valid + * OID means that the currently active relfilenode is transaction-local + * and no-need for WAL-logging. */ SubTransactionId rd_createSubid; /* rel was created in current xact */ SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in * current xact */ + SubTransactionId rd_firstRelfilenodeSubid; /* new relfilenode assigned + * first in current xact */ Form_pg_class rd_rel; /* RELATION tuple */ TupleDesc rd_att; /* tuple descriptor */ @@ -512,9 +519,15 @@ typedef struct ViewOptions /* * RelationNeedsWAL * True if relation needs WAL. + * + * Returns false if wal_level = minimal and this relation is created or + * truncated in the current transaction. */ -#define RelationNeedsWAL(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) +#define RelationNeedsWAL(relation) \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \ + (XLogIsNeeded() || \ + (relation->rd_createSubid == InvalidSubTransactionId && \ + relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId))) /* * RelationUsesLocalBuffers -- 2.16.3