From c2811b147d50a7e2e8f6566344e9f1f85614560d Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Thu, 6 Nov 2025 13:48:33 +0530 Subject: [PATCH v10 5/8] pg_waldump: Remove the restriction on the order of archived WAL files. With previous patch, pg_waldump would stop decoding if WAL files were not in the required sequence. With this patch, decoding will now continue. Any WAL file that is out of order will be written to a temporary location, from which it will be read later. Once a temporary file has been read, it will be removed. --- doc/src/sgml/ref/pg_waldump.sgml | 8 +- src/bin/pg_waldump/archive_waldump.c | 230 ++++++++++++++++++++++++--- src/bin/pg_waldump/pg_waldump.c | 41 ++++- src/bin/pg_waldump/pg_waldump.h | 4 + src/bin/pg_waldump/t/001_basic.pl | 3 +- 5 files changed, 264 insertions(+), 22 deletions(-) diff --git a/doc/src/sgml/ref/pg_waldump.sgml b/doc/src/sgml/ref/pg_waldump.sgml index d004bb0f67e..27adf77755c 100644 --- a/doc/src/sgml/ref/pg_waldump.sgml +++ b/doc/src/sgml/ref/pg_waldump.sgml @@ -149,8 +149,12 @@ PostgreSQL documentation of PGDATA. - If a tar archive is provided, its WAL segment files must be in - sequential order; otherwise, an error will be reported. + If a tar archive is provided and its WAL segment files are not in + sequential order, those files will be written to a temporary directory + named starting with waldump_tmp. This directory will be + created inside the directory specified by the TMPDIR + environment variable if it is set; otherwise, it will be created within + the same directory as the tar archive. diff --git a/src/bin/pg_waldump/archive_waldump.c b/src/bin/pg_waldump/archive_waldump.c index 6ba067163d5..aa64da9a57d 100644 --- a/src/bin/pg_waldump/archive_waldump.c +++ b/src/bin/pg_waldump/archive_waldump.c @@ -17,6 +17,7 @@ #include #include "access/xlog_internal.h" +#include "common/file_perm.h" #include "common/hashfn.h" #include "common/logging.h" #include "fe_utils/simple_list.h" @@ -27,6 +28,9 @@ */ #define READ_CHUNK_SIZE (128 * 1024) +/* Temporary exported WAL file directory */ +static char *TmpWalSegDir = NULL; + /* Hash entry structure for holding WAL segment data read from the archive */ typedef struct ArchivedWALEntry { @@ -64,6 +68,11 @@ typedef struct astreamer_waldump static int read_archive_file(XLogDumpPrivate *privateInfo, Size count); static ArchivedWALEntry *get_archive_wal_entry(XLogSegNo segno, XLogDumpPrivate *privateInfo); +static void setup_tmpseg_dir(const char *waldir); +static void cleanup_tmpseg_dir_atexit(void); + +static FILE *prepare_tmp_write(XLogSegNo segno); +static void perform_tmp_write(XLogSegNo segno, StringInfo buf, FILE *file); static astreamer *astreamer_waldump_new(XLogDumpPrivate *privateInfo); static void astreamer_waldump_content(astreamer *streamer, @@ -121,7 +130,9 @@ is_archive_file(const char *fname, pg_compress_algorithm *compression) /* * Initializes the tar archive reader, creates a hash table for WAL entries, * checks for existing valid WAL segments in the archive file and retrieves the - * segment size, and sets up filters for relevant entries. + * segment size, and sets up filters for relevant entries. It also configures a + * temporary directory for out-of-order WAL data and registers an exit callback + * to clean up temporary files. */ void init_archive_reader(XLogDumpPrivate *privateInfo, const char *waldir, @@ -197,6 +208,13 @@ init_archive_reader(XLogDumpPrivate *privateInfo, const char *waldir, privateInfo->endSegNo = UINT64_MAX; else XLByteToSeg(privateInfo->endptr, privateInfo->endSegNo, WalSegSz); + + /* + * Setup temporary directory to store WAL segments and set up an exit + * callback to remove it upon completion. + */ + setup_tmpseg_dir(waldir); + atexit(cleanup_tmpseg_dir_atexit); } /* @@ -370,15 +388,18 @@ read_archive_file(XLogDumpPrivate *privateInfo, Size count) } /* - * Returns the archived WAL entry from the hash table if it exists. Otherwise, + * Returns the archived WAL entry from the hash table if it exists. Otherwise, * it invokes the routine to read the archived file, which then populates the - * entry in the hash table. + * entry in the hash table. If the archive streamer happens to be reading a + * WAL from archive file that is not currently needed, that WAL data is written + * to a temporary file. */ static ArchivedWALEntry * get_archive_wal_entry(XLogSegNo segno, XLogDumpPrivate *privateInfo) { ArchivedWALEntry *entry = NULL; char fname[MAXFNAMELEN]; + FILE *write_fp = NULL; /* Search hash table */ entry = ArchivedWAL_lookup(ArchivedWAL_HTAB, segno); @@ -392,8 +413,11 @@ get_archive_wal_entry(XLogSegNo segno, XLogDumpPrivate *privateInfo) entry = privateInfo->cur_wal; /* Fetch more data */ - if (read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) - break; /* archive file ended */ + if (entry == NULL || entry->buf.len == 0) + { + if (read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) + break; /* archive file ended */ + } /* * Either, here for the first time, or the archived streamer is @@ -419,20 +443,31 @@ get_archive_wal_entry(XLogSegNo segno, XLogDumpPrivate *privateInfo) } /* - * XXX: If the segment being read not the requested one, the data must - * be buffered, as we currently lack the mechanism to write it to a - * temporary file. This is a known limitation that will be fixed in the - * next patch, as the buffer could grow up to the full WAL segment - * size. + * Archive streamer is currently reading a file that isn't the one + * asked for, but it's required in the future. It should be written to + * a temporary location for retrieval when needed. */ - if (segno > entry->segno) - continue; - /* WAL segments must be archived in order */ - pg_log_error("WAL files are not archived in sequential order"); - pg_log_error_detail("Expecting segment number " UINT64_FORMAT " but found " UINT64_FORMAT ".", - segno, entry->segno); - exit(1); + /* Create a temporary file if one does not already exist */ + if (!entry->tmpseg_exists) + { + write_fp = prepare_tmp_write(entry->segno); + entry->tmpseg_exists = true; + } + + /* Flush data from the buffer to the file */ + perform_tmp_write(entry->segno, &entry->buf, write_fp); + resetStringInfo(&entry->buf); + + /* + * The change in the current segment entry indicates that the reading + * of this file has ended. + */ + if (entry != privateInfo->cur_wal && write_fp != NULL) + { + fclose(write_fp); + write_fp = NULL; + } } /* Requested WAL segment not found */ @@ -441,7 +476,166 @@ get_archive_wal_entry(XLogSegNo segno, XLogDumpPrivate *privateInfo) } /* - * Create an astreamer that can read WAL from a tar file. + * Set up a temporary directory to temporarily store WAL segments. + */ +static void +setup_tmpseg_dir(const char *waldir) +{ + char *template; + + /* + * Use the directory specified by the TMPDIR environment variable. If it's + * not set, use the provided WAL directory to extract WAL file + * temporarily. + */ + template = psprintf("%s/waldump_tmp-XXXXXX", + getenv("TMPDIR") ? getenv("TMPDIR") : waldir); + TmpWalSegDir = mkdtemp(template); + + if (TmpWalSegDir == NULL) + pg_fatal("could not create directory \"%s\": %m", template); + + canonicalize_path(TmpWalSegDir); + + pg_log_debug("created directory \"%s\"", TmpWalSegDir); +} + +/* + * Removes the temporarily store WAL segments, if any, at exiting. + */ +static void +cleanup_tmpseg_dir_atexit(void) +{ + ArchivedWAL_iterator it; + ArchivedWALEntry *entry; + + /* Remove temporary segments */ + ArchivedWAL_start_iterate(ArchivedWAL_HTAB, &it); + while ((entry = ArchivedWAL_iterate(ArchivedWAL_HTAB, &it)) != NULL) + { + if (entry->tmpseg_exists) + { + remove_tmp_walseg(entry->segno, false); + entry->tmpseg_exists = false; + } + } + + /* Remove temporary directory */ + if (rmdir(TmpWalSegDir) == 0) + pg_log_debug("removed directory \"%s\"", TmpWalSegDir); +} + +/* + * Generate the temporary WAL file path. + * + * Note that the caller is responsible to pfree it. + */ +char * +get_tmp_walseg_path(XLogSegNo segno) +{ + char *fpath = (char *) palloc(MAXPGPATH); + + Assert(TmpWalSegDir); + + snprintf(fpath, MAXPGPATH, "%s/%08X%08X", + TmpWalSegDir, + (uint32) (segno / XLogSegmentsPerXLogId(WalSegSz)), + (uint32) (segno % XLogSegmentsPerXLogId(WalSegSz))); + + return fpath; +} + +/* + * Routine to check whether a temporary file exists for the corresponding WAL + * segment number. + */ +bool +tmp_walseg_exists(XLogSegNo segno) +{ + ArchivedWALEntry *entry; + + entry = ArchivedWAL_lookup(ArchivedWAL_HTAB, segno); + + if (entry == NULL) + return false; + + return entry->tmpseg_exists; +} + +/* + * Create an empty placeholder file and return its handle. + */ +static FILE * +prepare_tmp_write(XLogSegNo segno) +{ + FILE *file; + char *fpath; + + fpath = get_tmp_walseg_path(segno); + + /* Create an empty placeholder */ + file = fopen(fpath, PG_BINARY_W); + if (file == NULL) + pg_fatal("could not create file \"%s\": %m", fpath); + +#ifndef WIN32 + if (chmod(fpath, pg_file_create_mode)) + pg_fatal("could not set permissions on file \"%s\": %m", + fpath); +#endif + + pg_log_debug("temporarily exporting file \"%s\"", fpath); + pfree(fpath); + + return file; +} + +/* + * Write buffer data to the given file handle. + */ +static void +perform_tmp_write(XLogSegNo segno, StringInfo buf, FILE *file) +{ + Assert(file); + + errno = 0; + if (buf->len > 0 && fwrite(buf->data, buf->len, 1, file) != 1) + { + /* + * If write didn't set errno, assume problem is no disk space + */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("could not write to file \"%s\": %m", + get_tmp_walseg_path(segno)); + } +} + +/* + * Remove temporary file + */ +void +remove_tmp_walseg(XLogSegNo segno, bool update_entry) +{ + char *fpath = get_tmp_walseg_path(segno); + + if (unlink(fpath) == 0) + pg_log_debug("removed file \"%s\"", fpath); + pfree(fpath); + + /* Update entry if requested */ + if (update_entry) + { + ArchivedWALEntry *entry; + + entry = ArchivedWAL_lookup(ArchivedWAL_HTAB, segno); + Assert(entry != NULL); + entry->tmpseg_exists = false; + } +} + +/* + * Create an astreamer that can read WAL from tar file. */ static astreamer * astreamer_waldump_new(XLogDumpPrivate *privateInfo) diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index e2c96c3f4ca..96d17e99690 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -474,12 +474,51 @@ TarWALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { XLogDumpPrivate *private = state->private_data; int count = required_read_len(private, targetPagePtr, reqLen); + XLogSegNo nextSegNo; /* Bail out if the count to be read is not valid */ if (count < 0) return -1; - /* Read the WAL page from the archive streamer */ + /* + * If the target page is in a different segment, first check for the WAL + * segment's physical existence in the temporary directory. + */ + nextSegNo = state->seg.ws_segno; + if (!XLByteInSeg(targetPagePtr, nextSegNo, WalSegSz)) + { + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + state->seg.ws_file = -1; + + /* Remove this file, as it is no longer needed. */ + remove_tmp_walseg(nextSegNo, true); + } + + XLByteToSeg(targetPagePtr, nextSegNo, WalSegSz); + state->seg.ws_tli = private->timeline; + state->seg.ws_segno = nextSegNo; + + /* + * If the next segment exists, open it and continue reading from there + */ + if (tmp_walseg_exists(nextSegNo)) + { + char *fpath; + + fpath = get_tmp_walseg_path(nextSegNo); + state->seg.ws_file = open(fpath, O_RDONLY | PG_BINARY, 0); + pfree(fpath); + } + } + + /* Continue reading from the open WAL segment, if any */ + if (state->seg.ws_file >= 0) + return WALDumpReadPage(state, targetPagePtr, count, targetPtr, + readBuff); + + /* Otherwise, read the WAL page from the archive streamer */ return read_archive_wal_page(private, targetPagePtr, count, readBuff); } diff --git a/src/bin/pg_waldump/pg_waldump.h b/src/bin/pg_waldump/pg_waldump.h index ec7a33d40e0..03e02625ba1 100644 --- a/src/bin/pg_waldump/pg_waldump.h +++ b/src/bin/pg_waldump/pg_waldump.h @@ -58,4 +58,8 @@ extern int read_archive_wal_page(XLogDumpPrivate *privateInfo, XLogRecPtr targetPagePtr, Size count, char *readBuff); +extern char *get_tmp_walseg_path(XLogSegNo segno); +extern bool tmp_walseg_exists(XLogSegNo segno); +extern void remove_tmp_walseg(XLogSegNo segno, bool update_entry); + #endif /* end of PG_WALDUMP_H */ diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index 13567fbdba1..68b0cdd29e5 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -7,6 +7,7 @@ use Cwd; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +use List::Util qw(shuffle); my $tar = $ENV{TAR}; @@ -272,7 +273,7 @@ sub generate_archive } closedir $dh; - @files = sort @files; + @files = shuffle @files; # move into the WAL directory before archiving files my $cwd = getcwd; -- 2.47.1