Re: [HACKERS] WAL logging problem in 9.4.3? - Mailing list pgsql-hackers
| From | Kyotaro HORIGUCHI |
|---|---|
| Subject | Re: [HACKERS] WAL logging problem in 9.4.3? |
| Date | |
| Msg-id | 20170914.153459.94374240.horiguchi.kyotaro@lab.ntt.co.jp |
| In response to | Re: [HACKERS] WAL logging problem in 9.4.3? (Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp>) |
| Responses | Re: [HACKERS] WAL logging problem in 9.4.3? |
| List | pgsql-hackers |
At Wed, 13 Sep 2017 17:42:39 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp> wrote in
<20170913.174239.25978735.horiguchi.kyotaro@lab.ntt.co.jp>
> filterdiff seems to did something wrong..
# to did...
The previous patch was mangled by filterdiff, so here is a new one generated
directly with git format-patch. I confirmed that the tree builds with this
patch applied.
regards,
--
Kyotaro Horiguchi
NTT Open Source Software Center
From 7086b5855080065f73de4d099cbaab09511f01fc Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Tue, 12 Sep 2017 13:01:33 +0900
Subject: [PATCH] Fix WAL logging problem
---
 src/backend/access/heap/heapam.c        | 113 ++++++++++---
 src/backend/access/heap/pruneheap.c     |   3 +-
 src/backend/access/heap/rewriteheap.c   |   4 +-
 src/backend/access/heap/visibilitymap.c |   3 +-
 src/backend/access/transam/xact.c       |   7 +
 src/backend/catalog/storage.c           | 318 ++++++++++++++++++++++++++++---
 src/backend/commands/copy.c             |  13 +-
 src/backend/commands/createas.c         |   9 +-
 src/backend/commands/matview.c          |   6 +-
 src/backend/commands/tablecmds.c        |   8 +-
 src/backend/commands/vacuumlazy.c       |   6 +-
 src/backend/storage/buffer/bufmgr.c     |  40 ++++-
 src/backend/utils/cache/relcache.c      |  13 ++
 src/include/access/heapam.h             |   8 +-
 src/include/catalog/storage.h           |   5 +-
 src/include/storage/bufmgr.h            |   2 +
 src/include/utils/rel.h                 |   8 +
 17 files changed, 476 insertions(+), 90 deletions(-)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d20f038..e40254d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -34,6 +34,28 @@
  * the POSTGRES heap access method used for all POSTGRES
  * relations.
  *
+ * WAL CONSIDERATIONS
+ * All heap operations are normally WAL-logged, but there are a few
+ * exceptions. Temporary and unlogged relations never need to be
+ * WAL-logged, but we can also skip WAL-logging for a table that was
+ * created in the same transaction, if we don't need WAL for PITR or
+ * WAL archival purposes (i.e. if wal_level=minimal), and we fsync()
+ * the file to disk at COMMIT instead.
+ *
+ * The same-relation optimization is not employed automatically on all
+ * updates to a table that was created in the same transaction, because
+ * for a small number of changes, it's cheaper to just create the WAL
+ * records than to fsync() the whole relation at COMMIT.  It is only
+ * worthwhile for (presumably) large operations like COPY, CLUSTER,
+ * or VACUUM FULL. Use heap_register_sync() to initiate such an
+ * operation; it will cause any subsequent updates to the table to skip
+ * WAL-logging, if possible, and cause the heap to be synced to disk at
+ * COMMIT.
+ *
+ * To make that work, all modifications to heap must use
+ * BufferNeedsWAL() to check if WAL-logging is needed in this transaction
+ * for the given block.
+ *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -56,6 +78,7 @@
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/namespace.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/atomics.h"
@@ -2373,12 +2396,6 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate)
  * The new tuple is stamped with current transaction ID and the specified
  * command ID.
  *
- * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
- * logged in WAL, even for a non-temp relation. Safe usage of this behavior
- * requires that we arrange that all new tuples go into new pages not
- * containing any tuples from other transactions, and that the relation gets
- * fsync'd before commit. (See also heap_sync() comments)
- *
  * The HEAP_INSERT_SKIP_FSM option is passed directly to
  * RelationGetBufferForTuple, which see for more info.
  *
@@ -2409,6 +2426,7 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate)
  * TID where the tuple was stored.  But note that any toasting of fields
  * within the tuple data is NOT reflected into *tup.
  */
+extern HTAB *pendingSyncs;
 Oid
 heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options,
             BulkInsertState bistate)
@@ -2482,7 +2500,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
- if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_insert xlrec;
         xl_heap_header xlhdr;
@@ -2681,12 +2699,10 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
     int         ndone;
     char       *scratch = NULL;
     Page        page;
-    bool        needwal;
     Size        saveFreeSpace;
     bool        need_tuple_data = RelationIsLogicallyLogged(relation);
     bool        need_cids = RelationIsAccessibleInLogicalDecoding(relation);
 
-    needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                    HEAP_DEFAULT_FILLFACTOR);
@@ -2701,7 +2717,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
      * palloc() within a critical section is not safe, so we allocate this
      * beforehand.
      */
-    if (needwal)
+    if (RelationNeedsWAL(relation))
         scratch = palloc(BLCKSZ);
 
     /*
@@ -2736,6 +2752,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
         Buffer      vmbuffer = InvalidBuffer;
         bool        all_visible_cleared = false;
         int         nthispage;
+        bool        needwal;
 
         CHECK_FOR_INTERRUPTS();
 
@@ -2747,6 +2764,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                                            InvalidBuffer, options, bistate,
                                            &vmbuffer, NULL);
         page = BufferGetPage(buffer);
+        needwal = BufferNeedsWAL(relation, buffer);
 
         /* NO EREPORT(ERROR) from here till changes are logged */
         START_CRIT_SECTION();
@@ -3303,7 +3321,7 @@ l1:
      * NB: heap_abort_speculative() uses the same xlog record and replay
      * routines.
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_delete xlrec;
         XLogRecPtr  recptr;
@@ -4269,7 +4287,8 @@ l2:
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer) ||
+        BufferNeedsWAL(relation, newbuf))
     {
         XLogRecPtr  recptr;
@@ -5160,7 +5179,7 @@ failed:
      * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
      * entries for everything anyway.)
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, *buffer))
     {
         xl_heap_lock xlrec;
         XLogRecPtr  recptr;
@@ -5894,7 +5913,7 @@ l4:
         MarkBufferDirty(buf);
 
         /* XLOG stuff */
-        if (RelationNeedsWAL(rel))
+        if (BufferNeedsWAL(rel, buf))
         {
             xl_heap_lock_updated xlrec;
             XLogRecPtr  recptr;
@@ -6050,7 +6069,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple)
     htup->t_ctid = tuple->t_self;
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_confirm xlrec;
         XLogRecPtr  recptr;
@@ -6183,7 +6202,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple)
      * The WAL records generated here match heap_delete().  The same recovery
      * routines are used.
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_delete xlrec;
         XLogRecPtr  recptr;
@@ -6292,7 +6311,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_inplace xlrec;
         XLogRecPtr  recptr;
@@ -7406,7 +7425,7 @@ log_heap_clean(Relation reln, Buffer buffer,
     XLogRecPtr  recptr;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, buffer));
 
     xlrec.latestRemovedXid = latestRemovedXid;
     xlrec.nredirected = nredirected;
@@ -7454,7 +7473,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
     XLogRecPtr  recptr;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, buffer));
     /* nor when there are no tuples to freeze */
     Assert(ntuples > 0);
@@ -7539,7 +7558,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
     int         bufflags;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, newbuf) || BufferNeedsWAL(reln, oldbuf));
 
     XLogBeginInsert();
@@ -8630,8 +8649,13 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
      */
 
     /* Deal with old tuple version */
-    oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
-                                      &obuffer);
+    if (oldblk == newblk)
+        oldaction = XLogReadBufferForRedo(record, 0, &obuffer);
+    else if (XLogRecHasBlockRef(record, 1))
+        oldaction = XLogReadBufferForRedo(record, 1, &obuffer);
+    else
+        oldaction = BLK_DONE;
+
     if (oldaction == BLK_NEEDS_REDO)
     {
         page = BufferGetPage(obuffer);
@@ -8685,6 +8709,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
         PageInit(page, BufferGetPageSize(nbuffer), 0);
         newaction = BLK_NEEDS_REDO;
     }
+    else if (!XLogRecHasBlockRef(record, 0))
+        newaction = BLK_DONE;
     else
         newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
@@ -9121,9 +9147,16 @@ heap2_redo(XLogReaderState *record)
  * heap_sync - sync a heap, for use when no WAL has been written
  *
  * This forces the heap contents (including TOAST heap if any) down to disk.
- * If we skipped using WAL, and WAL is otherwise needed, we must force the
- * relation down to disk before it's safe to commit the transaction. This
- * requires writing out any dirty buffers and then doing a forced fsync.
+ * If we did any changes to the heap bypassing the buffer manager, we must
+ * force the relation down to disk before it's safe to commit the
+ * transaction, because the direct modifications will not be flushed by
+ * the next checkpoint.
+ *
+ * We used to also use this after batch operations like COPY and CLUSTER,
+ * if we skipped using WAL and WAL is otherwise needed, but there were
+ * corner-cases involving other WAL-logged operations to the same
+ * relation, where that was not enough. heap_register_sync() should be
+ * used for that purpose instead.
  *
  * Indexes are not touched.  (Currently, index operations associated with
  * the commands that use this are WAL-logged and so do not need fsync.
@@ -9233,3 +9266,33 @@ heap_mask(char *pagedata, BlockNumber blkno)
         }
     }
 }
+
+/*
+ * heap_register_sync - register a heap to be synced to disk at commit
+ *
+ * This can be used to skip WAL-logging changes on a relation file that has
+ * been created in the same transaction. This makes note of the current size of
+ * the relation, and ensures that when the relation is extended, any changes
+ * to the new blocks in the heap, in the same transaction, will not be
+ * WAL-logged. Instead, the heap contents are flushed to disk at commit,
+ * like heap_sync() does.
+ *
+ * This does the same for the TOAST heap, if any. Indexes are not affected.
+ */
+void
+heap_register_sync(Relation rel)
+{
+ /* non-WAL-logged tables never need fsync */
+ if (!RelationNeedsWAL(rel))
+ return;
+
+ RecordPendingSync(rel);
+ if (OidIsValid(rel->rd_rel->reltoastrelid))
+ {
+ Relation toastrel;
+
+ toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
+ RecordPendingSync(toastrel);
+ heap_close(toastrel, AccessShareLock);
+ }
+}
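
For reference, the intended calling pattern for the new interface looks like
this (a minimal sketch; rel, tuples, mycid and bistate are hypothetical, and
only heap_register_sync(), heap_insert() and the commit-time flush come from
the patch -- CopyFrom() below is the real usage):

    /* bulk-load into a relation created in this transaction */
    if (!XLogIsNeeded())
        heap_register_sync(rel);        /* skip WAL for new blocks */

    for (i = 0; i < ntuples; i++)
        heap_insert(rel, tuples[i], mycid, HEAP_INSERT_SKIP_FSM, bistate);

    /*
     * No explicit heap_sync() at the end: smgrDoPendingSyncs(true) flushes
     * the relation during CommitTransaction().
     */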
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 52231ac..97edb99 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -20,6 +20,7 @@
 #include "access/htup_details.h"
 #include "access/xlog.h"
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
@@ -259,7 +260,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
     /*
      * Emit a WAL HEAP_CLEAN record showing what we did
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         XLogRecPtr  recptr;
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index bd560e4..3c457db 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -649,9 +649,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
     }
     else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
         heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
-                                         HEAP_INSERT_SKIP_FSM |
-                                         (state->rs_use_wal ?
-                                          0 : HEAP_INSERT_SKIP_WAL));
+                                         HEAP_INSERT_SKIP_FSM);
     else
         heaptup = tup;
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 4c2a13a..971d469 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -88,6 +88,7 @@
 #include "access/heapam_xlog.h"
 #include "access/visibilitymap.h"
 #include "access/xlog.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -307,7 +308,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
     map[mapByte] |= (flags << mapOffset);
     MarkBufferDirty(vmBuf);
 
-    if (RelationNeedsWAL(rel))
+    if (BufferNeedsWAL(rel, heapBuf))
     {
         if (XLogRecPtrIsInvalid(recptr))
         {
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 93dca7a..7fba3df 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2008,6 +2008,9 @@ CommitTransaction(void)
     /* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
+    /* Flush updates to relations that we didn't WAL-log */
+ smgrDoPendingSyncs(true);
+
     /*
      * Mark serializable transaction as complete for predicate locking
      * purposes.  This should be done as late as we can put it and still allow
@@ -2236,6 +2239,9 @@ PrepareTransaction(void)
     /* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
+    /* Flush updates to relations that we didn't WAL-log */
+ smgrDoPendingSyncs(true);
+
     /*
      * Mark serializable transaction as complete for predicate locking
      * purposes.  This should be done as late as we can put it and still allow
@@ -2549,6 +2555,7 @@ AbortTransaction(void)
     AtAbort_Notify();
     AtEOXact_RelationMap(false);
     AtAbort_Twophase();
+    smgrDoPendingSyncs(false);  /* abandon pending syncs */
 
     /*
      * Advertise the fact that we aborted in pg_xact (assuming that we got as
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 9a5fde0..6bc1088 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -29,6 +29,7 @@
 #include "catalog/storage_xlog.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -64,6 +65,49 @@ typedef struct PendingRelDelete
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 
 /*
+ * We also track relation files (RelFileNode values) that have been created
+ * in the same transaction, and that have been modified without WAL-logging
+ * the action (an optimization possible with wal_level=minimal). When we are
+ * about to skip WAL-logging, a PendingRelSync entry is created, and
+ * 'sync_above' is set to the current size of the relation. Any operations
+ * on blocks < sync_above need to be WAL-logged as usual, but for operations
+ * on higher blocks, WAL-logging is skipped.
+ *
+ * NB: after WAL-logging has been skipped for a block, we must not WAL-log
+ * any subsequent actions on the same block either.  Otherwise, replaying the
+ * WAL record of the subsequent action might fail, because the "before" state
+ * of the block would not match, the earlier actions never having been logged.
+ * Likewise, after we have WAL-logged an operation for a block, we must
+ * WAL-log any subsequent operations on the same page as well. Replaying
+ * a possible full-page-image from the earlier WAL record would otherwise
+ * revert the page to the old state, even if we sync the relation at end
+ * of transaction.
+ *
+ * If a relation is truncated (without creating a new relfilenode), and we
+ * emit a WAL record of the truncation, we can't skip WAL-logging for any
+ * of the truncated blocks anymore, as replaying the truncation record will
+ * destroy all the data inserted after that. But if we have already decided
+ * to skip WAL-logging changes to a relation, and the relation is truncated,
+ * we don't need to WAL-log the truncation either.
+ *
+ * This mechanism is currently only used by heaps. Indexes are always
+ * WAL-logged. Also, this only applies for wal_level=minimal; with higher
+ * WAL levels we need the WAL for PITR/replication anyway.
+ */
+typedef struct PendingRelSync
+{
+ RelFileNode relnode; /* relation created in same xact */
+ BlockNumber sync_above; /* WAL-logging skipped for blocks >=
+ * sync_above */
+ BlockNumber truncated_to; /* truncation WAL record was written */
+} PendingRelSync;
+
+/* Relations that need to be fsync'd at commit */
+static HTAB *pendingSyncs = NULL;
+
+static void createPendingSyncsHash(void);
+
+/*
  * RelationCreateStorage
  *      Create physical storage for a relation.
  *
@@ -226,6 +270,8 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit)
 void
 RelationTruncate(Relation rel, BlockNumber nblocks)
 {
+ PendingRelSync *pending = NULL;
+    bool        found;
     bool        fsm;
     bool        vm;
@@ -260,37 +306,81 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
      */
     if (RelationNeedsWAL(rel))
     {
- /*
- * Make an XLOG entry reporting the file truncation.
- */
- XLogRecPtr lsn;
- xl_smgr_truncate xlrec;
-
- xlrec.blkno = nblocks;
- xlrec.rnode = rel->rd_node;
- xlrec.flags = SMGR_TRUNCATE_ALL;
-
- XLogBeginInsert();
- XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ /* no_pending_sync is ignored since new entry is created here */
+ if (!rel->pending_sync)
+ {
+ if (!pendingSyncs)
+ createPendingSyncsHash();
+ elog(DEBUG2, "RelationTruncate: accessing hash");
+ pending = (PendingRelSync *) hash_search(pendingSyncs,
+ (void *) &rel->rd_node,
+ HASH_ENTER, &found);
+ if (!found)
+ {
+ pending->sync_above = InvalidBlockNumber;
+ pending->truncated_to = InvalidBlockNumber;
+ }
- lsn = XLogInsert(RM_SMGR_ID,
- XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+        rel->no_pending_sync = false;
+        rel->pending_sync = pending;
+    }
- /*
- * Flush, because otherwise the truncation of the main relation might
- * hit the disk before the WAL record, and the truncation of the FSM
- * or visibility map. If we crashed during that window, we'd be left
- * with a truncated heap, but the FSM or visibility map would still
- * contain entries for the non-existent heap pages.
- */
- if (fsm || vm)
- XLogFlush(lsn);
+ if (rel->pending_sync->sync_above == InvalidBlockNumber ||
+ rel->pending_sync->sync_above < nblocks)
+ {
+ /*
+ * Make an XLOG entry reporting the file truncation.
+ */
+ XLogRecPtr lsn;
+ xl_smgr_truncate xlrec;
+
+ xlrec.blkno = nblocks;
+ xlrec.rnode = rel->rd_node;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+ lsn = XLogInsert(RM_SMGR_ID,
+ XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+
+ elog(DEBUG2, "WAL-logged truncation of rel %u/%u/%u to %u blocks",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ nblocks);
+
+ /*
+ * Flush, because otherwise the truncation of the main relation
+ * might hit the disk before the WAL record, and the truncation of
+ * the FSM or visibility map. If we crashed during that window,
+ * we'd be left with a truncated heap, but the FSM or visibility
+ * map would still contain entries for the non-existent heap
+ * pages.
+ */
+ if (fsm || vm)
+ XLogFlush(lsn);
+
+ rel->pending_sync->truncated_to = nblocks;
+        }
     }
 
     /* Do the real work */
     smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
 }
+
+/* create the hash table to track pending at-commit fsyncs */
+static void
+createPendingSyncsHash(void)
+{
+ /* First time through: initialize the hash table */
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(ctl));
+ ctl.keysize = sizeof(RelFileNode);
+ ctl.entrysize = sizeof(PendingRelSync);
+ ctl.hash = tag_hash;
+ pendingSyncs = hash_create("pending relation sync table", 5,
+ &ctl, HASH_ELEM | HASH_FUNCTION);
+}
+
 /*
  * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
  *
@@ -369,6 +459,24 @@ smgrDoPendingDeletes(bool isCommit)
 }
 
 /*
+ * RelationRemovePendingSync() -- remove pendingSync entry for a relation
+ */
+void
+RelationRemovePendingSync(Relation rel)
+{
+ bool found;
+
+ rel->pending_sync = NULL;
+ rel->no_pending_sync = true;
+ if (pendingSyncs)
+ {
+ elog(DEBUG2, "RelationRemovePendingSync: accessing hash");
+ hash_search(pendingSyncs, (void *) &rel->rd_node, HASH_REMOVE, &found);
+ }
+}
+
+
 /*
  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
  *
  * The return value is the number of relations scheduled for termination.
@@ -419,6 +527,170 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 
     return nrels;
 }
+
+/*
+ * Remember that the given relation needs to be sync'd at commit, because we
+ * are going to skip WAL-logging subsequent actions to it.
+ */
+void
+RecordPendingSync(Relation rel)
+{
+ bool found = true;
+ BlockNumber nblocks;
+
+ Assert(RelationNeedsWAL(rel));
+
+ /* ignore no_pending_sync since new entry is created here */
+ if (!rel->pending_sync)
+ {
+ if (!pendingSyncs)
+ createPendingSyncsHash();
+
+ /* Look up or create an entry */
+ rel->no_pending_sync = false;
+ elog(DEBUG2, "RecordPendingSync: accessing hash");
+ rel->pending_sync =
+ (PendingRelSync *) hash_search(pendingSyncs,
+ (void *) &rel->rd_node,
+ HASH_ENTER, &found);
+ }
+
+ nblocks = RelationGetNumberOfBlocks(rel);
+ if (!found)
+ {
+ rel->pending_sync->truncated_to = InvalidBlockNumber;
+ rel->pending_sync->sync_above = nblocks;
+
+ elog(DEBUG2,
+ "registering new pending sync for rel %u/%u/%u at block %u",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ nblocks);
+
+ }
+ else if (rel->pending_sync->sync_above == InvalidBlockNumber)
+ {
+ elog(DEBUG2, "registering pending sync for rel %u/%u/%u at block %u",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ nblocks);
+ rel->pending_sync->sync_above = nblocks;
+ }
+ else
+ elog(DEBUG2,
+ "pending sync for rel %u/%u/%u was already registered at block %u (new %u)",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ rel->pending_sync->sync_above, nblocks);
+}
+
+/*
+ * Do changes to given heap page need to be WAL-logged?
+ *
+ * This takes into account any previous RecordPendingSync() requests.
+ *
+ * Note that it is required to check this before creating any WAL records for
+ * heap pages - it is not merely an optimization! WAL-logging a record, when
+ * we have already skipped a previous WAL record for the same page could lead
+ * to failure at WAL replay, as the "before" state expected by the record
+ * might not match what's on disk. Also, if the heap was truncated earlier, we
+ * must WAL-log any changes to the once-truncated blocks, because replaying
+ * the truncation record will destroy them.
+ */
+bool
+BufferNeedsWAL(Relation rel, Buffer buf)
+{
+ BlockNumber blkno = InvalidBlockNumber;
+
+ if (!RelationNeedsWAL(rel))
+ return false;
+
+ elog(DEBUG2, "BufferNeedsWAL(r %d, b %d): hash = %p, ent=%p, neg = %d", rel->rd_id, BufferGetBlockNumber(buf),
pendingSyncs,rel->pending_sync, rel->no_pending_sync);
+ /* no further work if we know that we don't have pending sync */
+ if (!pendingSyncs || rel->no_pending_sync)
+ return true;
+
+ /* do the real work */
+ if (!rel->pending_sync)
+ {
+ bool found = false;
+
+ /*
+ * Hold the entry in rel. This relies on the fact that hash entry
+ * never moves.
+ */
+ rel->pending_sync =
+ (PendingRelSync *) hash_search(pendingSyncs,
+ (void *) &rel->rd_node,
+ HASH_FIND, &found);
+ elog(DEBUG2, "BufferNeedsWAL: accessing hash : %s", found ? "found" : "not found");
+ if (!found)
+ {
+            /* no entry for this relation; skip the hash lookup from now on */
+ rel->no_pending_sync = true;
+ return true;
+ }
+ }
+
+ blkno = BufferGetBlockNumber(buf);
+ if (rel->pending_sync->sync_above == InvalidBlockNumber ||
+ rel->pending_sync->sync_above > blkno)
+ {
+ elog(DEBUG2, "not skipping WAL-logging for rel %u/%u/%u block %u, because sync_above is %u",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ blkno, rel->pending_sync->sync_above);
+ return true;
+ }
+
+ /*
+ * We have emitted a truncation record for this block.
+ */
+ if (rel->pending_sync->truncated_to != InvalidBlockNumber &&
+ rel->pending_sync->truncated_to <= blkno)
+ {
+ elog(DEBUG2, "not skipping WAL-logging for rel %u/%u/%u block %u, because it was truncated earlier in the same
xact",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ blkno);
+ return true;
+ }
+
+ elog(DEBUG2, "skipping WAL-logging for rel %u/%u/%u block %u",
+ rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+ blkno);
+
+ return false;
+}
+
+/*
+ * Sync to disk any relations that we skipped WAL-logging for earlier.
+ */
+void
+smgrDoPendingSyncs(bool isCommit)
+{
+ if (!pendingSyncs)
+ return;
+
+ if (isCommit)
+ {
+ HASH_SEQ_STATUS status;
+ PendingRelSync *pending;
+
+ hash_seq_init(&status, pendingSyncs);
+
+ while ((pending = hash_seq_search(&status)) != NULL)
+ {
+ if (pending->sync_above != InvalidBlockNumber)
+ {
+ FlushRelationBuffersWithoutRelCache(pending->relnode, false);
+ smgrimmedsync(smgropen(pending->relnode, InvalidBackendId), MAIN_FORKNUM);
+
+ elog(DEBUG2, "syncing rel %u/%u/%u", pending->relnode.spcNode,
+ pending->relnode.dbNode, pending->relnode.relNode);
+ }
+ }
+ }
+
+ hash_destroy(pendingSyncs);
+ pendingSyncs = NULL;
+}
+
 /*
  * PostPrepare_smgr -- Clean up after a successful PREPARE
  *
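
To illustrate the sync_above/truncated_to bookkeeping above with concrete
numbers (a hypothetical trace, not part of the patch): suppose a table t is
created in the current transaction under wal_level=minimal and has 10 blocks
when the bulk operation starts.

    heap_register_sync(t);               /* sync_above = 10 */

    BufferNeedsWAL(t, buf_of_block_3);   /* true:  3 < sync_above */
    BufferNeedsWAL(t, buf_of_block_15);  /* false: 15 >= sync_above;
                                          * the block is fsync'd at commit */

    /* a WAL-logged truncation to 12 blocks sets truncated_to = 12 */
    BufferNeedsWAL(t, buf_of_block_13);  /* true again: replaying the
                                          * truncation record would destroy
                                          * block 13, so it must be logged */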
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index cfa3f05..6c0ffae 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2347,8 +2347,7 @@ CopyFrom(CopyState cstate)
      * - data is being written to relfilenode created in this transaction
      * then we can skip writing WAL.  It's safe because if the transaction
      * doesn't commit, we'll discard the table (or the new relfilenode file).
-     * If it does commit, we'll have done the heap_sync at the bottom of this
-     * routine first.
+     * If it does commit, commit will do heap_sync().
      *
      * As mentioned in comments in utils/rel.h, the in-same-transaction test
      * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
@@ -2380,7 +2379,7 @@ CopyFrom(CopyState cstate)
     {
         hi_options |= HEAP_INSERT_SKIP_FSM;
         if (!XLogIsNeeded())
-            hi_options |= HEAP_INSERT_SKIP_WAL;
+            heap_register_sync(cstate->rel);
     }
 
     /*
@@ -2862,11 +2861,11 @@ CopyFrom(CopyState cstate)
     FreeExecutorState(estate);
 
     /*
-     * If we skipped writing WAL, then we need to sync the heap (but not
-     * indexes since those use WAL anyway)
+     * If we skipped writing WAL, the heap will be synced at the end of the
+     * transaction.  (We used to do it here, but it turned out that to be
+     * safe, we must also avoid WAL-logging any subsequent actions on the
+     * pages we skipped WAL for.)  Indexes always use WAL.
      */
-    if (hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(cstate->rel);
 
     return processed;
 }
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index e60210c..dbc2028 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -567,8 +567,9 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      * We can skip WAL-logging the insertions, unless PITR or streaming
      * replication is in use.  We can skip the FSM in any case.
      */
-    myState->hi_options = HEAP_INSERT_SKIP_FSM |
-        (XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL);
+    if (!XLogIsNeeded())
+        heap_register_sync(intoRelationDesc);
+    myState->hi_options = HEAP_INSERT_SKIP_FSM;
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -617,9 +618,7 @@ intorel_shutdown(DestReceiver *self)
     FreeBulkInsertState(myState->bistate);
 
-    /* If we skipped using WAL, must heap_sync before commit */
-    if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(myState->rel);
+    /* If we skipped using WAL, we will sync the relation at commit */
 
     /* close rel, but keep lock until commit */
     heap_close(myState->rel, NoLock);
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index d2e0376..5645a6e 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -477,7 +477,7 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      */
     myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
     if (!XLogIsNeeded())
-        myState->hi_options |= HEAP_INSERT_SKIP_WAL;
+        heap_register_sync(transientrel);
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -520,9 +520,7 @@ transientrel_shutdown(DestReceiver *self)
     FreeBulkInsertState(myState->bistate);
 
-    /* If we skipped using WAL, must heap_sync before commit */
-    if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(myState->transientrel);
+    /* If we skipped using WAL, we will sync the relation at commit */
 
     /* close transientrel, but keep lock until commit */
     heap_close(myState->transientrel, NoLock);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 96354bd..3fdb99d 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4401,8 +4401,9 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         bistate = GetBulkInsertState();
 
         hi_options = HEAP_INSERT_SKIP_FSM;
+
         if (!XLogIsNeeded())
-            hi_options |= HEAP_INSERT_SKIP_WAL;
+            heap_register_sync(newrel);
     }
     else
     {
@@ -4675,8 +4676,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         FreeBulkInsertState(bistate);
 
         /* If we skipped writing WAL, then we need to sync the heap. */
-        if (hi_options & HEAP_INSERT_SKIP_WAL)
-            heap_sync(newrel);
 
         heap_close(newrel, NoLock);
     }
@@ -10656,11 +10655,12 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
     /*
      * Create and copy all forks of the relation, and schedule unlinking of
-     * old physical files.
+     * old physical files.  Pending syncs for the old node are no longer needed.
      *
      * NOTE: any conflict in relfilenode value will be caught in
      * RelationCreateStorage().
      */
+    RelationRemovePendingSync(rel);
     RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
 
     /* copy main fork */
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 45b1859..757ed7f 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -891,7 +891,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
                  * page has been previously WAL-logged, and if not, do that
                  * now.
                  */
-                if (RelationNeedsWAL(onerel) &&
+                if (BufferNeedsWAL(onerel, buf) &&
                     PageGetLSN(page) == InvalidXLogRecPtr)
                     log_newpage_buffer(buf, true);
@@ -1118,7 +1118,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
             }
 
             /* Now WAL-log freezing if necessary */
-            if (RelationNeedsWAL(onerel))
+            if (BufferNeedsWAL(onerel, buf))
             {
                 XLogRecPtr  recptr;
@@ -1476,7 +1476,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(onerel))
+    if (BufferNeedsWAL(onerel, buffer))
     {
         XLogRecPtr  recptr;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 15795b0..be57547 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -451,6 +451,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
                                BufferAccessStrategy strategy,
                                bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FlushRelationBuffers_common(SMgrRelation smgr, bool islocal);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int  rnode_comparator(const void *p1, const void *p2);
@@ -3147,20 +3148,41 @@ PrintPinnedBufs(void)
 void
 FlushRelationBuffers(Relation rel)
 {
- int i;
- BufferDesc *bufHdr;
-
     /* Open rel at the smgr level if not already done */
     RelationOpenSmgr(rel);
 
- if (RelationUsesLocalBuffers(rel))
+ FlushRelationBuffers_common(rel->rd_smgr, RelationUsesLocalBuffers(rel));
+}
+
+/*
+ * Like FlushRelationBuffers(), but the relation is specified by a
+ * RelFileNode
+ */
+void
+FlushRelationBuffersWithoutRelCache(RelFileNode rnode, bool islocal)
+{
+ FlushRelationBuffers_common(smgropen(rnode, InvalidBackendId), islocal);
+}
+
+/*
+ * Code shared between functions FlushRelationBuffers() and
+ * FlushRelationBuffersWithoutRelCache().
+ */
+static void
+FlushRelationBuffers_common(SMgrRelation smgr, bool islocal)
+{
+ RelFileNode rnode = smgr->smgr_rnode.node;
+ int i;
+ BufferDesc *bufHdr;
+
+    if (islocal)
     {
         for (i = 0; i < NLocBuffer; i++)
         {
             uint32      buf_state;
 
             bufHdr = GetLocalBufferDescriptor(i);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+            if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
                 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
             {
@@ -3177,7 +3199,7 @@ FlushRelationBuffers(Relation rel)
                 PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-                smgrwrite(rel->rd_smgr,
+                smgrwrite(smgr,
                           bufHdr->tag.forkNum,
                           bufHdr->tag.blockNum,
                           localpage,
@@ -3207,18 +3229,18 @@ FlushRelationBuffers(Relation rel)
          * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
          * and saves some cycles.
          */
-        if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
+        if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode))
             continue;
 
         ReservePrivateRefCountEntry();
 
         buf_state = LockBufHdr(bufHdr);
-        if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+        if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, rel->rd_smgr);
+            FlushBuffer(bufHdr, smgr);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index b8e3780..3dff4ed 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -72,6 +72,7 @@
 #include "optimizer/var.h"
 #include "rewrite/rewriteDefine.h"
 #include "rewrite/rowsecurity.h"
+#include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
 #include "utils/array.h"
@@ -418,6 +419,10 @@ AllocateRelationDesc(Form_pg_class relp)
     /* which we mark as a reference-counted tupdesc */
     relation->rd_att->tdrefcount = 1;
+ /* We don't know if pending sync for this relation exists so far */
+ relation->pending_sync = NULL;
+ relation->no_pending_sync = false;
+
     MemoryContextSwitchTo(oldcxt);
 
     return relation;
@@ -2040,6 +2045,10 @@ formrdesc(const char *relationName, Oid relationReltype,
         relation->rd_rel->relhasindex = true;
     }
+ /* We don't know if pending sync for this relation exists so far */
+ relation->pending_sync = NULL;
+ relation->no_pending_sync = false;
+
     /*
      * add new reldesc to relcache
      */
@@ -3364,6 +3373,10 @@ RelationBuildLocalRelation(const char *relname,
     else
         rel->rd_rel->relfilenode = relfilenode;
+ /* newly built relation has no pending sync */
+ rel->no_pending_sync = true;
+ rel->pending_sync = NULL;
+
     RelationInitLockInfo(rel);  /* see lmgr.c */
 
     RelationInitPhysicalAddr(rel);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 4e41024..79b964f 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -25,10 +25,9 @@
 
 /* "options" flag bits for heap_insert */
-#define HEAP_INSERT_SKIP_WAL 0x0001
-#define HEAP_INSERT_SKIP_FSM 0x0002
-#define HEAP_INSERT_FROZEN 0x0004
-#define HEAP_INSERT_SPECULATIVE 0x0008
+#define HEAP_INSERT_SKIP_FSM 0x0001
+#define HEAP_INSERT_FROZEN 0x0002
+#define HEAP_INSERT_SPECULATIVE 0x0004
 
 typedef struct BulkInsertStateData *BulkInsertState;
@@ -179,6 +178,7 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
 extern void simple_heap_update(Relation relation, ItemPointer otid,
                    HeapTuple tup);
 
+extern void heap_register_sync(Relation relation);
 extern void heap_sync(Relation relation);
 extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot);
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index a3a97db..03964e2 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -22,13 +22,16 @@ extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
-
+extern void RelationRemovePendingSync(Relation rel);
 
 /*
  * These functions used to be in storage/smgr/smgr.c, which explains the
  * naming
  */
 extern void smgrDoPendingDeletes(bool isCommit);
 extern int  smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
+extern void smgrDoPendingSyncs(bool isCommit);
+extern void RecordPendingSync(Relation rel);
+extern bool BufferNeedsWAL(Relation rel, Buffer buf);
 extern void AtSubCommit_smgr(void);
 extern void AtSubAbort_smgr(void);
 extern void PostPrepare_smgr(void);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 98b63fc..598d1a0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -190,6 +190,8 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
                                                    ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
+extern void FlushRelationBuffersWithoutRelCache(RelFileNode rnode,
+                                                bool islocal);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
                                    ForkNumber forkNum, BlockNumber firstDelBlock);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 4bc61e5..c7610bd 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -216,6 +216,14 @@ typedef struct RelationData
     /* use "struct" here to avoid needing to include pgstat.h: */
     struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
+
+ /*
+     * no_pending_sync is true if this relation is known not to have a
+     * pending sync registered.  Otherwise, if pending_sync is NULL, the
+     * pendingSyncs hash table has to be searched.
+ */
+ bool no_pending_sync;
+    struct PendingRelSync *pending_sync;
 } RelationData;
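
Taken together, the two new fields act as a relcache-side cache of the
pendingSyncs hash, so most callers avoid the hash lookup entirely.  Condensed
from BufferNeedsWAL() in storage.c above (a recap of the patch's logic, not
new code):

    if (rel->no_pending_sync)           /* known: no sync registered */
        return true;                    /* WAL-log as usual */
    if (rel->pending_sync == NULL)      /* unknown: search pendingSyncs */
        ...                             /* ... and cache the result in rel */
    /* known entry: decide by sync_above / truncated_to */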
--
2.9.2
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers