From 29b68dfd1560b574201bb44ec7692828650de9b3 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@oss.ntt.co.jp>
Date: Mon, 27 May 2019 16:06:30 +0900
Subject: [PATCH 2/3] Fix WAL skipping feature

WAL-skipping operations mixed with WAL-logged operations can lead to
database corruption after a crash. This patch changes the WAL-skipping
feature so that no data modifcation is WAL-logged at all then sync
such relations at commit.
---
 src/backend/access/heap/heapam.c         |  4 +-
 src/backend/access/heap/heapam_handler.c | 22 +----------
 src/backend/access/heap/rewriteheap.c    | 13 ++-----
 src/backend/catalog/storage.c            | 64 +++++++++++++++++++++++++-------
 src/backend/commands/cluster.c           | 24 ++++++++++++
 src/backend/commands/copy.c              | 39 ++++---------------
 src/backend/commands/createas.c          |  5 +--
 src/backend/commands/matview.c           |  4 --
 src/backend/commands/tablecmds.c         | 10 ++---
 src/backend/storage/buffer/bufmgr.c      | 33 +++++++++++-----
 src/backend/utils/cache/relcache.c       | 16 ++++++--
 src/include/access/heapam.h              |  1 -
 src/include/access/rewriteheap.h         |  2 +-
 src/include/access/tableam.h             | 40 ++------------------
 src/include/storage/bufmgr.h             |  1 +
 src/include/utils/rel.h                  | 17 ++++++++-
 16 files changed, 148 insertions(+), 147 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 94309949fa..2f1d68762b 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1941,7 +1941,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	MarkBufferDirty(buffer);
 
 	/* XLOG stuff */
-	if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
+	if (RelationNeedsWAL(relation))
 	{
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
@@ -2124,7 +2124,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 	/* currently not needed (thus unsupported) for heap_multi_insert() */
 	AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
 
-	needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
+	needwal = RelationNeedsWAL(relation);
 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
 												   HEAP_DEFAULT_FILLFACTOR);
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 09bc6fe98a..b9554f6064 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -556,18 +556,6 @@ tuple_lock_retry:
 	return result;
 }
 
-static void
-heapam_finish_bulk_insert(Relation relation, int options)
-{
-	/*
-	 * If we skipped writing WAL, then we need to sync the heap (but not
-	 * indexes since those use WAL anyway / don't go through tableam)
-	 */
-	if (options & HEAP_INSERT_SKIP_WAL)
-		heap_sync(relation);
-}
-
-
 /* ------------------------------------------------------------------------
  * DDL related callbacks for heap AM.
  * ------------------------------------------------------------------------
@@ -699,7 +687,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	IndexScanDesc indexScan;
 	TableScanDesc tableScan;
 	HeapScanDesc heapScan;
-	bool		use_wal;
 	bool		is_system_catalog;
 	Tuplesortstate *tuplesort;
 	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
@@ -713,12 +700,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	/* Remember if it's a system catalog */
 	is_system_catalog = IsSystemRelation(OldHeap);
 
-	/*
-	 * We need to log the copied data in WAL iff WAL archiving/streaming is
-	 * enabled AND it's a WAL-logged rel.
-	 */
-	use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
-
 	/* use_wal off requires smgr_targblock be initially invalid */
 	Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
 
@@ -729,7 +710,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
 	/* Initialize the rewrite operation */
 	rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
-								 *multi_cutoff, use_wal);
+								 *multi_cutoff);
 
 
 	/* Set up sorting if wanted */
@@ -2517,7 +2498,6 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_delete = heapam_tuple_delete,
 	.tuple_update = heapam_tuple_update,
 	.tuple_lock = heapam_tuple_lock,
-	.finish_bulk_insert = heapam_finish_bulk_insert,
 
 	.tuple_fetch_row_version = heapam_fetch_row_version,
 	.tuple_get_latest_tid = heap_get_latest_tid,
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 72a448ad31..992d4b9880 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -144,7 +144,6 @@ typedef struct RewriteStateData
 	Page		rs_buffer;		/* page currently being built */
 	BlockNumber rs_blockno;		/* block where page will go */
 	bool		rs_buffer_valid;	/* T if any tuples in buffer */
-	bool		rs_use_wal;		/* must we WAL-log inserts? */
 	bool		rs_logical_rewrite; /* do we need to do logical rewriting */
 	TransactionId rs_oldest_xmin;	/* oldest xmin used by caller to determine
 									 * tuple visibility */
@@ -238,15 +237,13 @@ static void logical_end_heap_rewrite(RewriteState state);
  * oldest_xmin	xid used by the caller to determine which tuples are dead
  * freeze_xid	xid before which tuples will be frozen
  * min_multi	multixact before which multis will be removed
- * use_wal		should the inserts to the new heap be WAL-logged?
  *
  * Returns an opaque RewriteState, allocated in current memory context,
  * to be used in subsequent calls to the other functions.
  */
 RewriteState
 begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
-				   TransactionId freeze_xid, MultiXactId cutoff_multi,
-				   bool use_wal)
+				   TransactionId freeze_xid, MultiXactId cutoff_multi)
 {
 	RewriteState state;
 	MemoryContext rw_cxt;
@@ -271,7 +268,6 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	/* new_heap needn't be empty, just locked */
 	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
 	state->rs_buffer_valid = false;
-	state->rs_use_wal = use_wal;
 	state->rs_oldest_xmin = oldest_xmin;
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
@@ -330,7 +326,7 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
-		if (state->rs_use_wal)
+		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
 						state->rs_blockno,
@@ -654,9 +650,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 	{
 		int			options = HEAP_INSERT_SKIP_FSM;
 
-		if (!state->rs_use_wal)
-			options |= HEAP_INSERT_SKIP_WAL;
-
 		/*
 		 * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
 		 * for the TOAST table are not logically decoded.  The main heap is
@@ -695,7 +688,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			/* Doesn't fit, so write out the existing page */
 
 			/* XLOG stuff */
-			if (state->rs_use_wal)
+			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
 							MAIN_FORKNUM,
 							state->rs_blockno,
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 3cc886f7fe..e4bcdc390f 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -57,7 +57,8 @@ typedef struct PendingRelDelete
 {
 	RelFileNode relnode;		/* relation that may need to be deleted */
 	BackendId	backend;		/* InvalidBackendId if not a temp rel */
-	bool		atCommit;		/* T=delete at commit; F=delete at abort */
+	bool		atCommit;		/* T=work at commit; F=work at abort */
+	bool		dosync;			/* T=work is sync; F=work is delete */
 	int			nestLevel;		/* xact nesting level of request */
 	struct PendingRelDelete *next;	/* linked-list link */
 } PendingRelDelete;
@@ -114,10 +115,29 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 	pending->relnode = rnode;
 	pending->backend = backend;
 	pending->atCommit = false;	/* delete if abort */
+	pending->dosync = false;
 	pending->nestLevel = GetCurrentTransactionNestLevel();
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
 
+	/*
+	 * We are going to skip WAL-logging for storage of persistent relations
+	 * created in the current transaction when wal_level = minimal. The
+	 * relation needs to be synced at commit.
+	 */
+	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
+	{
+		pending = (PendingRelDelete *)
+			MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+		pending->relnode = rnode;
+		pending->backend = backend;
+		pending->atCommit = true;
+		pending->dosync = true;
+		pending->nestLevel = GetCurrentTransactionNestLevel();
+		pending->next = pendingDeletes;
+		pendingDeletes = pending;
+	}
+
 	return srel;
 }
 
@@ -155,6 +175,7 @@ RelationDropStorage(Relation rel)
 	pending->relnode = rel->rd_node;
 	pending->backend = rel->rd_backend;
 	pending->atCommit = true;	/* delete if commit */
+	pending->dosync = false;
 	pending->nestLevel = GetCurrentTransactionNestLevel();
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
@@ -428,21 +449,34 @@ smgrDoPendingDeletes(bool isCommit)
 			{
 				SMgrRelation srel;
 
-				srel = smgropen(pending->relnode, pending->backend);
-
-				/* allocate the initial array, or extend it, if needed */
-				if (maxrels == 0)
+				if (pending->dosync)
 				{
-					maxrels = 8;
-					srels = palloc(sizeof(SMgrRelation) * maxrels);
+					/* Perform pending sync of WAL-skipped relation */
+					FlushRelationBuffersWithoutRelcache(pending->relnode,
+														false);
+					srel = smgropen(pending->relnode, pending->backend);
+					smgrimmedsync(srel, MAIN_FORKNUM);
+					smgrclose(srel);
 				}
-				else if (maxrels <= nrels)
+				else
 				{
-					maxrels *= 2;
-					srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
-				}
+					/* Collect pending deletions */
+					srel = smgropen(pending->relnode, pending->backend);
 
-				srels[nrels++] = srel;
+					/* allocate the initial array, or extend it, if needed */
+					if (maxrels == 0)
+					{
+						maxrels = 8;
+						srels = palloc(sizeof(SMgrRelation) * maxrels);
+					}
+					else if (maxrels <= nrels)
+					{
+						maxrels *= 2;
+						srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+					}
+
+					srels[nrels++] = srel;
+				}
 			}
 			/* must explicitly free the list entry */
 			pfree(pending);
@@ -489,8 +523,9 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 	nrels = 0;
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
+		/* Pending syncs are excluded */
 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
-			&& pending->backend == InvalidBackendId)
+			&& pending->backend == InvalidBackendId && !pending->dosync)
 			nrels++;
 	}
 	if (nrels == 0)
@@ -502,8 +537,9 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 	*ptr = rptr;
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
+		/* Pending syncs are excluded */
 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
-			&& pending->backend == InvalidBackendId)
+			&& pending->backend == InvalidBackendId && !pending->dosync)
 		{
 			*rptr = pending->relnode;
 			rptr++;
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index cedb4ee844..29f7bf6dbd 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1034,12 +1034,36 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
 
 	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
 	{
+		Relation rel1;
+		Relation rel2;
+
 		/*
 		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
 		 * relpersistence
 		 */
 		Assert(!target_is_pg_class);
 
+		/* Update creation subid hints of relcache */
+		rel1 = relation_open(r1, ExclusiveLock);
+		rel2 = relation_open(r2, ExclusiveLock);
+
+		/*
+		 * New relation's relfilenode is created in the current transaction
+		 * and used as old ralation's new relfilenode. So its
+		 * newRelfilenodeSubid as new relation's createSubid. We don't fix
+		 * rel2 since it would be deleted soon.
+		 */
+		Assert(rel2->rd_createSubid != InvalidSubTransactionId);
+		rel1->rd_newRelfilenodeSubid = rel2->rd_createSubid;
+
+		/* record the first relfilenode change in the current transaction */
+		if (rel1->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
+			rel1->rd_firstRelfilenodeSubid = GetCurrentSubTransactionId();
+
+		relation_close(rel1, ExclusiveLock);
+		relation_close(rel2, ExclusiveLock);
+
+		/* swap relfilenodes, reltablespaces, relpersistence */
 		swaptemp = relform1->relfilenode;
 		relform1->relfilenode = relform2->relfilenode;
 		relform2->relfilenode = swaptemp;
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 4f04d122c3..f02efd59fc 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2535,9 +2535,6 @@ CopyMultiInsertBufferCleanup(CopyMultiInsertInfo *miinfo,
 	for (i = 0; i < MAX_BUFFERED_TUPLES && buffer->slots[i] != NULL; i++)
 		ExecDropSingleTupleTableSlot(buffer->slots[i]);
 
-	table_finish_bulk_insert(buffer->resultRelInfo->ri_RelationDesc,
-							 miinfo->ti_options);
-
 	pfree(buffer);
 }
 
@@ -2726,28 +2723,9 @@ CopyFrom(CopyState cstate)
 	 * If it does commit, we'll have done the table_finish_bulk_insert() at
 	 * the bottom of this routine first.
 	 *
-	 * As mentioned in comments in utils/rel.h, the in-same-transaction test
-	 * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
-	 * can be cleared before the end of the transaction. The exact case is
-	 * when a relation sets a new relfilenode twice in same transaction, yet
-	 * the second one fails in an aborted subtransaction, e.g.
-	 *
-	 * BEGIN;
-	 * TRUNCATE t;
-	 * SAVEPOINT save;
-	 * TRUNCATE t;
-	 * ROLLBACK TO save;
-	 * COPY ...
-	 *
-	 * Also, if the target file is new-in-transaction, we assume that checking
-	 * FSM for free space is a waste of time, even if we must use WAL because
-	 * of archiving.  This could possibly be wrong, but it's unlikely.
-	 *
-	 * The comments for table_tuple_insert and RelationGetBufferForTuple
-	 * specify that skipping WAL logging is only safe if we ensure that our
-	 * tuples do not go into pages containing tuples from any other
-	 * transactions --- but this must be the case if we have a new table or
-	 * new relfilenode, so we need no additional work to enforce that.
+	 * If the target file is new-in-transaction, we assume that checking FSM
+	 * for free space is a waste of time, even if we must use WAL because of
+	 * archiving.  This could possibly be wrong, but it's unlikely.
 	 *
 	 * We currently don't support this optimization if the COPY target is a
 	 * partitioned table as we currently only lazily initialize partition
@@ -2763,15 +2741,14 @@ CopyFrom(CopyState cstate)
 	 * are not supported as per the description above.
 	 *----------
 	 */
-	/* createSubid is creation check, newRelfilenodeSubid is truncation check */
+	/*
+	 * createSubid is creation check, firstRelfilenodeSubid is truncation and
+	 * cluster check. Partitioned table doesn't have storage.
+	 */
 	if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) &&
 		(cstate->rel->rd_createSubid != InvalidSubTransactionId ||
-		 cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
-	{
+		 cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId))
 		ti_options |= TABLE_INSERT_SKIP_FSM;
-		if (!XLogIsNeeded())
-			ti_options |= TABLE_INSERT_SKIP_WAL;
-	}
 
 	/*
 	 * Optimize if new relfilenode was created in this subxact or one of its
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index b7d220699f..8a91d946e3 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -558,8 +558,7 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	 * We can skip WAL-logging the insertions, unless PITR or streaming
 	 * replication is in use. We can skip the FSM in any case.
 	 */
-	myState->ti_options = TABLE_INSERT_SKIP_FSM |
-		(XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL);
+	myState->ti_options = TABLE_INSERT_SKIP_FSM;
 	myState->bistate = GetBulkInsertState();
 
 	/* Not using WAL requires smgr_targblock be initially invalid */
@@ -604,8 +603,6 @@ intorel_shutdown(DestReceiver *self)
 
 	FreeBulkInsertState(myState->bistate);
 
-	table_finish_bulk_insert(myState->rel, myState->ti_options);
-
 	/* close rel, but keep lock until commit */
 	table_close(myState->rel, NoLock);
 	myState->rel = NULL;
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index 537d0e8cef..1c854dcebf 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -463,8 +463,6 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	 * replication is in use. We can skip the FSM in any case.
 	 */
 	myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
-	if (!XLogIsNeeded())
-		myState->ti_options |= TABLE_INSERT_SKIP_WAL;
 	myState->bistate = GetBulkInsertState();
 
 	/* Not using WAL requires smgr_targblock be initially invalid */
@@ -509,8 +507,6 @@ transientrel_shutdown(DestReceiver *self)
 
 	FreeBulkInsertState(myState->bistate);
 
-	table_finish_bulk_insert(myState->transientrel, myState->ti_options);
-
 	/* close transientrel, but keep lock until commit */
 	table_close(myState->transientrel, NoLock);
 	myState->transientrel = NULL;
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index fb2be10794..bdb7b53d2a 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4762,9 +4762,9 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 
 	/*
 	 * Prepare a BulkInsertState and options for table_tuple_insert. Because
-	 * we're building a new heap, we can skip WAL-logging and fsync it to disk
-	 * at the end instead (unless WAL-logging is required for archiving or
-	 * streaming replication). The FSM is empty too, so don't bother using it.
+	 * we're building a new heap, the underlying table AM can skip WAL-logging
+	 * and fsync the relation to disk at the end of the current transaction
+	 * instead. The FSM is empty too, so don't bother using it.
 	 */
 	if (newrel)
 	{
@@ -4772,8 +4772,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 		bistate = GetBulkInsertState();
 
 		ti_options = TABLE_INSERT_SKIP_FSM;
-		if (!XLogIsNeeded())
-			ti_options |= TABLE_INSERT_SKIP_WAL;
 	}
 	else
 	{
@@ -5058,8 +5056,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 	{
 		FreeBulkInsertState(bistate);
 
-		table_finish_bulk_insert(newrel, ti_options);
-
 		table_close(newrel, NoLock);
 	}
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6f3a402854..41ff6da9d9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -171,6 +171,7 @@ static HTAB *PrivateRefCountHash = NULL;
 static int32 PrivateRefCountOverflowed = 0;
 static uint32 PrivateRefCountClock = 0;
 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
+static void FlushRelationBuffers_common(SMgrRelation smgr, bool islocal);
 
 static void ReservePrivateRefCountEntry(void);
 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
@@ -3191,20 +3192,32 @@ PrintPinnedBufs(void)
 void
 FlushRelationBuffers(Relation rel)
 {
-	int			i;
-	BufferDesc *bufHdr;
-
-	/* Open rel at the smgr level if not already done */
 	RelationOpenSmgr(rel);
 
-	if (RelationUsesLocalBuffers(rel))
+	FlushRelationBuffers_common(rel->rd_smgr, RelationUsesLocalBuffers(rel));
+}
+
+void
+FlushRelationBuffersWithoutRelcache(RelFileNode rnode, bool islocal)
+{
+	FlushRelationBuffers_common(smgropen(rnode, InvalidBackendId), islocal);
+}
+
+static void
+FlushRelationBuffers_common(SMgrRelation smgr, bool islocal)
+{
+	RelFileNode rnode = smgr->smgr_rnode.node;
+	int i;
+	BufferDesc *bufHdr;
+
+	if (islocal)
 	{
 		for (i = 0; i < NLocBuffer; i++)
 		{
 			uint32		buf_state;
 
 			bufHdr = GetLocalBufferDescriptor(i);
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 				((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
 				 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 			{
@@ -3221,7 +3234,7 @@ FlushRelationBuffers(Relation rel)
 
 				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-				smgrwrite(rel->rd_smgr,
+				smgrwrite(smgr,
 						  bufHdr->tag.forkNum,
 						  bufHdr->tag.blockNum,
 						  localpage,
@@ -3251,18 +3264,18 @@ FlushRelationBuffers(Relation rel)
 		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
 		 * and saves some cycles.
 		 */
-		if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
+		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode))
 			continue;
 
 		ReservePrivateRefCountEntry();
 
 		buf_state = LockBufHdr(bufHdr);
-		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-			FlushBuffer(bufHdr, rel->rd_smgr);
+			FlushBuffer(bufHdr, smgr);
 			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 			UnpinBuffer(bufHdr, true);
 		}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 7aa5d7c7fa..b6b61d0b1b 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -2660,7 +2660,7 @@ static void
 RelationFlushRelation(Relation relation)
 {
 	if (relation->rd_createSubid != InvalidSubTransactionId ||
-		relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
+		relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
 	{
 		/*
 		 * New relcache entries are always rebuilt, not flushed; else we'd
@@ -2800,7 +2800,7 @@ RelationCacheInvalidate(void)
 		 * pending invalidations.
 		 */
 		if (relation->rd_createSubid != InvalidSubTransactionId ||
-			relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
+			relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
 			continue;
 
 		relcacheInvalsReceived++;
@@ -3057,6 +3057,7 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
 	 * Likewise, reset the hint about the relfilenode being new.
 	 */
 	relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+	relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
 }
 
 /*
@@ -3148,7 +3149,7 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 	}
 
 	/*
-	 * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
+	 * Likewise, update or drop any new-relfilenode-in-subtransaction hints.
 	 */
 	if (relation->rd_newRelfilenodeSubid == mySubid)
 	{
@@ -3157,6 +3158,15 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 		else
 			relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
 	}
+
+
+	if (relation->rd_firstRelfilenodeSubid == mySubid)
+	{
+		if (isCommit)
+			relation->rd_firstRelfilenodeSubid = parentSubid;
+		else
+			relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+	}
 }
 
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 858bcb6bc9..80c2e1bafc 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -29,7 +29,6 @@
 
 
 /* "options" flag bits for heap_insert */
-#define HEAP_INSERT_SKIP_WAL	TABLE_INSERT_SKIP_WAL
 #define HEAP_INSERT_SKIP_FSM	TABLE_INSERT_SKIP_FSM
 #define HEAP_INSERT_FROZEN		TABLE_INSERT_FROZEN
 #define HEAP_INSERT_NO_LOGICAL	TABLE_INSERT_NO_LOGICAL
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 8056253916..7f9736e294 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -23,7 +23,7 @@ typedef struct RewriteStateData *RewriteState;
 
 extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 									   TransactionId OldestXmin, TransactionId FreezeXid,
-									   MultiXactId MultiXactCutoff, bool use_wal);
+									   MultiXactId MultiXactCutoff);
 extern void end_heap_rewrite(RewriteState state);
 extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
 							   HeapTuple newTuple);
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 7f81703b78..b652cd6cef 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -407,22 +407,6 @@ typedef struct TableAmRoutine
 							   uint8 flags,
 							   TM_FailureData *tmfd);
 
-	/*
-	 * Perform operations necessary to complete insertions made via
-	 * tuple_insert and multi_insert with a BulkInsertState specified. This
-	 * may for example be used to flush the relation, when the
-	 * TABLE_INSERT_SKIP_WAL option was used.
-	 *
-	 * Typically callers of tuple_insert and multi_insert will just pass all
-	 * the flags that apply to them, and each AM has to decide which of them
-	 * make sense for it, and then only take actions in finish_bulk_insert for
-	 * those flags, and ignore others.
-	 *
-	 * Optional callback.
-	 */
-	void		(*finish_bulk_insert) (Relation rel, int options);
-
-
 	/* ------------------------------------------------------------------------
 	 * DDL related functionality.
 	 * ------------------------------------------------------------------------
@@ -1087,10 +1071,6 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  * The options bitmask allows the caller to specify options that may change the
  * behaviour of the AM. The AM will ignore options that it does not support.
  *
- * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
- * need to be logged to WAL, even for a non-temp relation. It is the AMs
- * choice whether this optimization is supported.
- *
  * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
  * free space in the relation. This can save some cycles when we know the
  * relation is new and doesn't contain useful amounts of free space.
@@ -1112,8 +1092,7 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  * heap's TOAST table, too, if the tuple requires any out-of-line data.
  *
  * The BulkInsertState object (if any; bistate can be NULL for default
- * behavior) is also just passed through to RelationGetBufferForTuple. If
- * `bistate` is provided, table_finish_bulk_insert() needs to be called.
+ * behavior) is also just passed through to RelationGetBufferForTuple.
  *
  * On return the slot's tts_tid and tts_tableOid are updated to reflect the
  * insertion. But note that any toasting of fields within the slot is NOT
@@ -1248,6 +1227,8 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
  * update was done.  However, any TOAST changes in the new tuple's
  * data are not reflected into *newtup.
  *
+ * See table_insert about skipping WAL-logging feature.
+ *
  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
  * t_xmax, and, if possible, t_cmax.  See comments for struct TM_FailureData
  * for additional info.
@@ -1308,21 +1289,6 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
 									   flags, tmfd);
 }
 
-/*
- * Perform operations necessary to complete insertions made via
- * tuple_insert and multi_insert with a BulkInsertState specified. This
- * e.g. may e.g. used to flush the relation when inserting with
- * TABLE_INSERT_SKIP_WAL specified.
- */
-static inline void
-table_finish_bulk_insert(Relation rel, int options)
-{
-	/* optional callback */
-	if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
-		rel->rd_tableam->finish_bulk_insert(rel, options);
-}
-
-
 /* ------------------------------------------------------------------------
  * DDL related functionality.
  * ------------------------------------------------------------------------
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 509f4b7ef1..ace5f5a2ae 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,7 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
 												   ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
+extern void FlushRelationBuffersWithoutRelcache(RelFileNode rnode, bool islocal);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
 								   ForkNumber forkNum, BlockNumber firstDelBlock);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index d35b4a5061..5cbb5a7b27 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -76,10 +76,17 @@ typedef struct RelationData
 	 * transaction, with one of them occurring in a subsequently aborted
 	 * subtransaction, e.g. BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;
 	 * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten
+	 * rd_firstRelfilenodeSubid is the ID of the hightest subtransaction the
+	 * relfilenode change has took place first in the current
+	 * transaction. This won't be forgotten as newRelfilenodeSubid is. A valid
+	 * OID means that the currently active relfilenode is transaction-local
+	 * and no-need for WAL-logging.
 	 */
 	SubTransactionId rd_createSubid;	/* rel was created in current xact */
 	SubTransactionId rd_newRelfilenodeSubid;	/* new relfilenode assigned in
 												 * current xact */
+	SubTransactionId rd_firstRelfilenodeSubid;	/* new relfilenode assigned
+												 * first in current xact */
 
 	Form_pg_class rd_rel;		/* RELATION tuple */
 	TupleDesc	rd_att;			/* tuple descriptor */
@@ -512,9 +519,15 @@ typedef struct ViewOptions
 /*
  * RelationNeedsWAL
  *		True if relation needs WAL.
+ *
+ * Returns false if wal_level = minimal and this relation is created or
+ * truncated in the current transaction.
  */
-#define RelationNeedsWAL(relation) \
-	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+#define RelationNeedsWAL(relation)										\
+	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
+	 (XLogIsNeeded() ||													\
+	  (relation->rd_createSubid == InvalidSubTransactionId &&			\
+	   relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
 
 /*
  * RelationUsesLocalBuffers
-- 
2.16.3