From a189e0d922c19a23739a4508b0e722c7459e872d Mon Sep 17 00:00:00 2001 From: Alena Vinter Date: Wed, 10 Sep 2025 14:06:09 +0700 Subject: [PATCH 3/3] Handle rewind failure when a timeline ends with an overwritten contrecord. When a common timeline ends with an overwritten contrecord, the divergence point may not point to the start of a valid WAL record on the target, causing errors and making rewind impossible. To handle this case, when the target timeline is unfinished, we look for a checkpoint preceding the divergence point starting from the last checkpoint on the target rather than from the divergence point itself. This ensures we always begin from a known-valid position in WAL. --- src/bin/pg_rewind/parsexlog.c | 25 +++++++++++++------------ src/bin/pg_rewind/pg_rewind.c | 18 ++++++++++++++++-- src/bin/pg_rewind/pg_rewind.h | 7 ++++--- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 8f4b282c6b1..7d01fefb1d5 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -165,9 +165,10 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, * Find the previous checkpoint preceding given WAL location. 
*/ void -findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, - XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, - XLogRecPtr *lastchkptredo, const char *restoreCommand) +findLastCheckpoint(const char *datadir, XLogRecPtr startptr, XLogRecPtr forkptr, + int tliIndex, XLogRecPtr *lastchkptrec, + TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo, + const char *restoreCommand) { /* Walk backwards, starting from the given record */ XLogRecord *record; @@ -179,17 +180,17 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, TimeLineID current_tli = 0; /* - * The given fork pointer points to the end of the last common record, - * which is not necessarily the beginning of the next record, if the - * previous record happens to end at a page boundary. Skip over the page - * header in that case to find the next record. + * The given start pointer may be the end of the last common record, which + * is not necessarily the beginning of the next record: if that record ends + * at a page boundary, the next record begins after the page header. + * Skip over the page header in that case to find the next record. 
*/ - if (forkptr % XLOG_BLCKSZ == 0) + if (startptr % XLOG_BLCKSZ == 0) { - if (XLogSegmentOffset(forkptr, WalSegSz) == 0) - forkptr += SizeOfXLogLongPHD; + if (XLogSegmentOffset(startptr, WalSegSz) == 0) + startptr += SizeOfXLogLongPHD; else - forkptr += SizeOfXLogShortPHD; + startptr += SizeOfXLogShortPHD; } private.tliIndex = tliIndex; @@ -200,7 +201,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, if (xlogreader == NULL) pg_fatal("out of memory while allocating a WAL reading processor"); - searchptr = forkptr; + searchptr = startptr; for (;;) { uint8 info; diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 0c68dd4235e..c2ecf72cacf 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -140,6 +140,7 @@ main(int argc, char **argv) int option_index; int c; XLogRecPtr divergerec; + XLogRecPtr checkpoint_search_startrec; int lastcommontliIndex; XLogRecPtr chkptrec; TimeLineID chkpttli; @@ -459,8 +460,21 @@ main(int argc, char **argv) /* Initialize hashtable that tracks WAL files protected from removal */ keepwal_init(); - findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex, - &chkptrec, &chkpttli, &chkptredo, restore_command); + /* + * If the last common timeline is incomplete on the target, a divergence + * point from the source's finished timeline may not exist in the target's + * WAL. Therefore, start searching for a checkpoint preceding the divergence + * point from the last checkpoint on the target server to find a safe common + * point. 
+ */ + if (targetHistory[lastcommontliIndex].end == InvalidXLogRecPtr) + checkpoint_search_startrec = ControlFile_target.checkPoint; + else + checkpoint_search_startrec = divergerec; + + findLastCheckpoint(datadir_target, checkpoint_search_startrec, divergerec, + lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo, + restore_command); pg_log_info("rewinding from last common checkpoint at %X/%08X on timeline %u", LSN_FORMAT_ARGS(chkptrec), chkpttli); diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index 9cea144d2b2..4879be1d1d4 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -35,9 +35,10 @@ extern uint64 fetch_done; extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, const char *restoreCommand); -extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, - int tliIndex, - XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, +extern void findLastCheckpoint(const char *datadir, XLogRecPtr startptr, + XLogRecPtr forkptr, int tliIndex, + XLogRecPtr *lastchkptrec, + TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo, const char *restoreCommand); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, -- 2.51.0