From 41c5bbfcb86dc15b20ce210e727ff4296edfafbd Mon Sep 17 00:00:00 2001 From: Movead Date: Thu, 15 Jan 2026 23:01:16 +0800 Subject: [PATCH] Patch enable pg_rewind work without wal_log_hints and data_checksums --- src/bin/pg_rewind/.gitignore | 3 + src/bin/pg_rewind/Makefile | 15 ++- src/bin/pg_rewind/parsexlog.c | 166 +++++++++++++++++++++++++++++++++- src/bin/pg_rewind/pg_rewind.c | 34 ++++--- src/bin/pg_rewind/pg_rewind.h | 5 +- 5 files changed, 204 insertions(+), 19 deletions(-) diff --git a/src/bin/pg_rewind/.gitignore b/src/bin/pg_rewind/.gitignore index 79ddca3eec9..8f8ab3bcfde 100644 --- a/src/bin/pg_rewind/.gitignore +++ b/src/bin/pg_rewind/.gitignore @@ -1,6 +1,9 @@ # Files generated during build /xlogreader.c /pg_rewind +/compat.c +/xactdesc.c +/standbydesc.c # Generated by test suite /tmp_check/ diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 32a35c57612..c4608494c6c 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -19,7 +19,9 @@ override CPPFLAGS := -I$(libpq_srcdir) -DFRONTEND $(CPPFLAGS) LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) OBJS = \ + $(RMGRDESCOBJS) \ $(WIN32RES) \ + compat.o \ datapagemap.o \ file_ops.o \ filemap.o \ @@ -30,7 +32,10 @@ OBJS = \ timeline.o \ xlogreader.o -EXTRA_CLEAN = xlogreader.c +EXTRA_CLEAN = xlogreader.c compat.c + +RMGRDESCSOURCES = xactdesc.c standbydesc.c +RMGRDESCOBJS = $(patsubst %.c,%.o,$(RMGRDESCSOURCES)) all: pg_rewind @@ -40,6 +45,12 @@ pg_rewind: $(OBJS) | submake-libpq submake-libpgport xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . +compat.c: % : $(top_srcdir)/src/bin/pg_waldump/% + rm -f $@ && $(LN_S) $< . + +$(RMGRDESCSOURCES): % : $(top_srcdir)/src/backend/access/rmgrdesc/% + rm -f $@ && $(LN_S) $< . + install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' @@ -50,7 +61,7 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' clean distclean: - rm -f pg_rewind$(X) $(OBJS) xlogreader.c + rm -f pg_rewind$(X) $(OBJS) $(RMGRDESCSOURCES) xlogreader.c compat.c rm -rf tmp_check check: diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index db7a7e73042..9c36fa44950 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -23,6 +23,8 @@ #include "fe_utils/archive.h" #include "filemap.h" #include "pg_rewind.h" +#include "storage/standbydefs.h" +#include "access/transam.h" /* * RmgrNames is an array of the built-in resource manager names, to make error @@ -38,11 +40,14 @@ static const char *const RmgrNames[RM_MAX_ID + 1] = { #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \ RmgrNames[rmid] : "custom") -static void extractPageInfo(XLogReaderState *record); +static void extractPageInfo(XLogReaderState *record, bool backward); +static bool RewindTransactionIdPrecedes(TransactionId id1, TransactionId id2); static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = 0; static char xlogfpath[MAXPGPATH]; +static TransactionId min_commited_xid = InvalidTransactionId; +static TransactionId base_xid = InvalidTransactionId; typedef struct XLogPageReadPrivate { @@ -63,7 +68,7 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, * 'endpoint' is the first one that is not read. */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, +extractPageMapForward(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, const char *restoreCommand) { XLogRecord *record; @@ -97,7 +102,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, LSN_FORMAT_ARGS(errptr)); } - extractPageInfo(xlogreader); + extractPageInfo(xlogreader, false); } while (xlogreader->EndRecPtr < endpoint); /* @@ -116,6 +121,72 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } } +void +extractPageMapBackward(const char *datadir, XLogRecPtr startpoint, int tliIndex, + const char *restoreCommand) +{ + /* Walk backwards, starting from the given record */ + XLogRecord *record; + XLogRecPtr searchptr; + XLogReaderState *xlogreader; + char *errormsg; + XLogPageReadPrivate private; + + /* + * The given fork pointer points to the end of the last common record, + * which is not necessarily the beginning of the next record, if the + * previous record happens to end at a page boundary. Skip over the page + * header in that case to find the next record. + */ + if (startpoint % XLOG_BLCKSZ == 0) + { + if (XLogSegmentOffset(startpoint, WalSegSz) == 0) + startpoint += SizeOfXLogLongPHD; + else + startpoint += SizeOfXLogShortPHD; + } + + private.tliIndex = tliIndex; + private.restoreCommand = restoreCommand; + xlogreader = XLogReaderAllocate(WalSegSz, datadir, + XL_ROUTINE(.page_read = &SimpleXLogPageRead), + &private); + if (xlogreader == NULL) + pg_fatal("out of memory while allocating a WAL reading processor"); + + searchptr = startpoint; + for (;;) + { + XLogBeginRead(xlogreader, searchptr); + record = XLogReadRecord(xlogreader, &errormsg); + + if (record == NULL) + { + if (errormsg) + pg_fatal("could not find previous WAL record at %X/%X: %s", + LSN_FORMAT_ARGS(searchptr), + errormsg); + else + pg_fatal("could not find previous WAL record at %X/%X", + LSN_FORMAT_ARGS(searchptr)); + } + + extractPageInfo(xlogreader, true); + /* We can break if met a safety snapshot */ + if (base_xid <= min_commited_xid) + break; + /* Walk backwards to previous record. */ + searchptr = record->xl_prev; + } + + XLogReaderFree(xlogreader); + if (xlogreadfd != -1) + { + close(xlogreadfd); + xlogreadfd = -1; + } +} + /* * Reads one WAL record. Returns the end position of the record, without * doing anything with the record itself. @@ -270,6 +341,17 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, } } +bool check_rewind_safety(void) +{ + /* If no committed transactions, it's safe to rewind */ + if (min_commited_xid == InvalidTransactionId) + return true; + /* If base_xid is less than or equal to min_commited_xid, it's safe */ + if (base_xid <= min_commited_xid) + return true; + return false; +} + /* XLogReader callback function, to read a WAL page */ static int SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, @@ -382,11 +464,66 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, return XLOG_BLCKSZ; } +static void +track_rewind_snapshot(XLogReaderState *record, bool backward) +{ + uint8 info = 0; + RmgrId rmid = XLogRecGetRmid(record); + + if (rmid != RM_XACT_ID && rmid != RM_STANDBY_ID) + return; + + if (rmid == RM_XACT_ID) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + TransactionId current_xid = InvalidTransactionId; + + /* We finished tracking during forward read phase */ + if(backward) + return; + + info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + if (info == XLOG_XACT_COMMIT) + { + current_xid = XLogRecGetXid(record); + } + else if (info == XLOG_XACT_COMMIT_PREPARED) + { + current_xid = parsed.twophase_xid; + } + if(min_commited_xid == InvalidTransactionId || + RewindTransactionIdPrecedes(current_xid, min_commited_xid)) + { + min_commited_xid = current_xid; + if (showprogress) + pg_log_info("Get min committed xid: %u", min_commited_xid); + } + } + else if (rmid == RM_STANDBY_ID) + { + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); + + if(base_xid == InvalidTransactionId || RewindTransactionIdPrecedes(xlrec->nextXid, base_xid)) + { + base_xid = xlrec->nextXid; + if (showprogress) + pg_log_info("Get base xid: %u at %X/%X", base_xid, LSN_FORMAT_ARGS(record->ReadRecPtr)); + } + } + } +} + /* * Extract information on which blocks the current record modifies. */ static void -extractPageInfo(XLogReaderState *record) +extractPageInfo(XLogReaderState *record, bool backward) { int block_id; RmgrId rmid = XLogRecGetRmid(record); @@ -464,6 +601,8 @@ extractPageInfo(XLogReaderState *record) rmid, RmgrName(rmid), info); } + track_rewind_snapshot(record, backward); + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) { RelFileLocator rlocator; @@ -481,3 +620,22 @@ extractPageInfo(XLogReaderState *record) process_target_wal_block_change(forknum, rlocator, blkno); } } + +/* + * RewindTransactionIdPrecedes --- is id1 logically < id2? + */ +static bool +RewindTransactionIdPrecedes(TransactionId id1, TransactionId id2) +{ + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^32 comparison. + */ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 < id2); + + diff = (int32) (id1 - id2); + return (diff < 0); +} diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 31693843b3c..18266d688b1 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -74,6 +74,7 @@ static bool debug = false; bool showprogress = false; bool dry_run = false; bool do_sync = true; +bool deep_dig = false; static bool restore_wal = false; DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; @@ -103,6 +104,7 @@ usage(const char *progname) printf(_(" -N, --no-sync do not wait for changes to be written\n" " safely to disk\n")); printf(_(" -P, --progress write progress messages\n")); + printf(_(" -d, --deep-dig perform a deep dig for more wal\n")); printf(_(" -R, --write-recovery-conf write configuration for replication\n" " (requires --source-server)\n")); printf(_(" --config-file=FILENAME use specified main server configuration\n" @@ -135,6 +137,7 @@ main(int argc, char **argv) {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 6}, + {"deep-dig", no_argument, NULL, 'd'}, {NULL, 0, NULL, 0} }; int option_index; @@ -153,6 +156,7 @@ main(int argc, char **argv) bool no_ensure_shutdown = false; bool rewind_needed; bool writerecoveryconf = false; + bool page_consistence = false; filemap_t *filemap; pg_logging_init(argv[0]); @@ -174,7 +178,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "cD:nNPRd", long_options, &option_index)) != -1) { switch (c) { @@ -227,6 +231,9 @@ main(int argc, char **argv) if (!parse_sync_method(optarg, &sync_method)) exit(1); break; + case 'd': + deep_dig = true; + break; default: /* getopt_long already emitted a complaint */ @@ -351,6 +358,8 @@ main(int argc, char **argv) pg_free(buffer); sanityChecks(); + page_consistence = ControlFile_target.data_checksum_version == PG_DATA_CHECKSUM_VERSION || + ControlFile_target.wal_log_hints; /* * Usually, the TLI can be found in the latest checkpoint record. But if @@ -494,8 +503,19 @@ main(int argc, char **argv) */ if (showprogress) pg_log_info("reading WAL in target"); - extractPageMap(datadir_target, chkptrec, lastcommontliIndex, + extractPageMapForward(datadir_target, chkptrec, lastcommontliIndex, target_wal_endrec, restore_command); + if(!page_consistence && !check_rewind_safety()) + { + /* + * Next time we will search more wal which will spend more time, and + * if user prefer a deep dig we will do it. Otherwise, we just stop here. + */ + if(!deep_dig) + pg_fatal("Can not safety rewind the target cluster, use --deep-dig to have a try."); + extractPageMapBackward(datadir_target, chkptrec, lastcommontliIndex, + restore_command); + } /* * We have collected all information we need from both systems. Decide @@ -758,16 +778,6 @@ sanityChecks(void) pg_fatal("clusters are not compatible with this version of pg_rewind"); } - /* - * Target cluster need to use checksums or hint bit wal-logging, this to - * prevent from data corruption that could occur because of hint bits. - */ - if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION && - !ControlFile_target.wal_log_hints) - { - pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\""); - } - /* * Target cluster better not be running. This doesn't guard against * someone starting the cluster concurrently. Also, this is probably more diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index 9a981f7f246..01dfc47fcfa 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -32,9 +32,11 @@ extern uint64 fetch_size; extern uint64 fetch_done; /* in parsexlog.c */ -extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, +extern void extractPageMapForward(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, const char *restoreCommand); +extern void extractPageMapBackward(const char *datadir, XLogRecPtr startpoint, + int tliIndex, const char *restoreCommand); extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, @@ -42,6 +44,7 @@ extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, const char *restoreCommand); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand); +extern bool check_rewind_safety(void); /* in pg_rewind.c */ extern void progress_report(bool finished); -- 2.47.2