From 52eb52c4f9d6e1316b88a8a4c0e591d15ced8b3f Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 18 Feb 2021 23:21:49 +0000 Subject: [PATCH v1 2/2] Keep track of notified-ready-for-archive position through crashes. If a WAL record that spans across multiple segments is fully flushed to disk but the server crashes before the segment can be marked as ready-for-archive, we will fail to mark it as ready-for- archive when the server starts again. To fix, we add a last_notified file in the archive_status directory that is used to restore the corresponding value in shared memory. This file is only used for primary servers, as standbys have their own code path for marking segments as ready-for-archive that isn't aware of WAL record boundaries. If the server starts up in point- in-time recovery, standby mode, or with archive_mode turned off, we delete this file. In those cases, the file will get initialized normally if and when it begins writing its own WAL in the future. --- src/backend/access/transam/xlog.c | 224 +++++++++++++++++++++++++++++++++++++- src/bin/pg_resetwal/pg_resetwal.c | 5 +- 2 files changed, 226 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 950840d584..19ecc98f25 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -537,6 +537,9 @@ typedef struct RecordBoundaryEntry XLogRecPtr pos; } RecordBoundaryEntry; +/* File for persisting the last notified segment through crashes. */ +#define LAST_NOTIFIED_FILE (XLOGDIR "/archive_status/last_notified") + /* * Shared state data for WAL insertion. */ @@ -991,12 +994,22 @@ static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); static void checkXLogConsistency(XLogReaderState *record); + +typedef struct LastNotifiedSegment +{ + XLogSegNo segno; + bool crash_handling; +} LastNotifiedSegment; + static void RegisterRecordBoundaryEntry(XLogSegNo seg, XLogRecPtr pos); static void NotifySegmentsReadyForArchive(void); static XLogSegNo GetLastNotifiedSegment(void); static void SetLastNotifiedSegment(XLogSegNo seg); static XLogSegNo GetLatestRecordBoundarySegment(void); static void RemoveRecordBoundariesUpTo(XLogSegNo seg); +static void InitializeLastNotifiedSegment(void); +static LastNotifiedSegment ReadLastNotifiedSegmentFile(void); +static void WriteLastNotifiedSegmentFile(XLogSegNo segno, bool crash_handling); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -1408,9 +1421,27 @@ NotifySegmentsReadyForArchive(void) { XLogSegNo i; - /* create the archive status files */ + /* create the archive_status files and update last_notified */ for (i = GetLastNotifiedSegment() + 1; i < latest_boundary_seg; i++) + { + /* + * We must update the last_notified file _before_ creating the + * archive status file for the segment. This ensures that we can + * recover in the event of a crash. RemoveXlogFile() + * recycles/removes segments prior to clearing the archive status + * file, so if after a crash the segment exists but its + * archive status file does not, we know we must create its archive + * status file. + * + * It's also possible that we crash in the process of notifying + * several segments in this loop, but we will not know to create + * archive status files for those after a crash. Instead, we will + * attempt to notify them again after a later segment becomes + * eligible for archival. + */ + WriteLastNotifiedSegmentFile(i, true); XLogArchiveNotifySeg(i); + } /* update shared memory */ SetLastNotifiedSegment(latest_boundary_seg - 1); @@ -1513,6 +1544,191 @@ SetLastNotifiedSegment(XLogSegNo seg) SpinLockRelease(&XLogCtl->info_lck); } +/* + * InitializeLastNotifiedSegment + * + * If the last_notified file does not exist, no action is taken. + * + * If archiving is turned off, the server is a standby, or the server is + * perfoming point-in-time recovery, the last_notified file is deleted. (Once + * the standby is promoted or recovery completes, a new last_notified file will + * be created in XLogWrite().) + * + * Otherwise, initialize lastNotifiedSeg with the value in the file and make + * sure that segment's archive status is created. + */ +static void +InitializeLastNotifiedSegment(void) +{ + char fname[MAXPGPATH]; + char path[MAXPGPATH]; + struct stat filestats; + LastNotifiedSegment seg; + int rc; + + rc = stat(LAST_NOTIFIED_FILE, &filestats); + if (rc != 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + LAST_NOTIFIED_FILE))); + + /* + * If the last_notified file does not exist, there's nothing to do. If this + * file is supposed to exist, it will be created in the future by + * XLogWrite(). + */ + if (rc != 0) + return; + + /* + * If the server is a standby, is performing point-in-time recovery, or + * archiving is turned off, delete the last_notified file and move on. Once + * the standby is promoted or recovery completes, a new last_notified file + * will be created in XLogWrite(). + */ + if (StandbyModeRequested || + ArchiveRecoveryRequested || + !XLogArchivingActive()) + { + durable_unlink(LAST_NOTIFIED_FILE, PANIC); + return; + } + + /* + * Retrieve the latest notified segment from the file. + */ + seg = ReadLastNotifiedSegmentFile(); + XLogCtl->lastNotifiedSeg = seg.segno; + + /* + * If crash_handling was disabled for the segment (e.g., the value was + * purely for initialization purposes), there is no need to ensure that an + * archive status file was created. + */ + if (!seg.crash_handling) + return; + + /* + * If the segment still exists but there is no corresponding archive status + * file, the server must have crashed between updating last_notified and + * creating the archive status file. We must create an archive status file + * for the segment now. + */ + XLogFilePath(path, ThisTimeLineID, seg.segno, wal_segment_size); + rc = stat(path, &filestats); + if (rc != 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + path))); + + XLogFileName(fname, ThisTimeLineID, seg.segno, wal_segment_size); + if (rc == 0 && !XLogArchiveIsReadyOrDone(fname)) + XLogArchiveNotify(fname); +} + +/* + * ReadLastNotifiedSegmentFile + * + * Reads the segment number from the last_notified file. The caller is + * responsible for ensuring that the file exists. + */ +static LastNotifiedSegment +ReadLastNotifiedSegmentFile(void) +{ + LastNotifiedSegment seg; + int readBytes; + int fd; + + fd = OpenTransientFile(LAST_NOTIFIED_FILE, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + LAST_NOTIFIED_FILE))); + + readBytes = read(fd, &seg, sizeof(seg)); + if (readBytes != sizeof(seg)) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + LAST_NOTIFIED_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + LAST_NOTIFIED_FILE, readBytes, sizeof(seg)))); + } + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + LAST_NOTIFIED_FILE))); + + return seg; +} + +/* + * WriteLastNotifiedSegmentFile + * + * Writes the given segment number to the last_notified file. The caller is + * responsible for ensuring that no other backends can perform this at the same + * time. + */ +static void +WriteLastNotifiedSegmentFile(XLogSegNo segno, bool crash_handling) +{ + LastNotifiedSegment seg; + int fd; + char path[MAXPGPATH]; + + memset(&seg, 0, sizeof(seg)); + seg.segno = segno; + seg.crash_handling = crash_handling; + + snprintf(path, MAXPGPATH, "%s.tmp", LAST_NOTIFIED_FILE); + + /* make sure no old temp file is remaining */ + if (unlink(path) < 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + path))); + + fd = OpenTransientFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + path))); + + errno = 0; + if ((write(fd, &seg, sizeof(seg))) != sizeof(seg)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + path))); + } + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + path))); + + /* fsync, rename to permanent file, fsync file and directory */ + durable_rename(path, LAST_NOTIFIED_FILE, PANIC); +} + /* * Reserves the right amount of space for a record of given size from the WAL. * *StartPos is set to the beginning of the reserved section, *EndPos to @@ -2836,7 +3052,10 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) XLogSegNoIsInvalid(GetLastNotifiedSegment())) { LWLockAcquire(ArchNotifyLock, LW_EXCLUSIVE); + SetLastNotifiedSegment(openLogSegNo - 1); + WriteLastNotifiedSegmentFile(openLogSegNo - 1, false); + LWLockRelease(ArchNotifyLock); } @@ -7210,6 +7429,9 @@ StartupXLOG(void) InRecovery = true; } + /* If necessary, read the last_notified file. */ + InitializeLastNotifiedSegment(); + /* REDO */ if (InRecovery) { diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 805dafef07..fe58a00854 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -1074,11 +1074,12 @@ KillExistingArchiveStatus(void) while (errno = 0, (xlde = readdir(xldir)) != NULL) { - if (strspn(xlde->d_name, "0123456789ABCDEF") == XLOG_FNAME_LEN && + if ((strcmp(xlde->d_name, "last_notified") == 0) || + (strspn(xlde->d_name, "0123456789ABCDEF") == XLOG_FNAME_LEN && (strcmp(xlde->d_name + XLOG_FNAME_LEN, ".ready") == 0 || strcmp(xlde->d_name + XLOG_FNAME_LEN, ".done") == 0 || strcmp(xlde->d_name + XLOG_FNAME_LEN, ".partial.ready") == 0 || - strcmp(xlde->d_name + XLOG_FNAME_LEN, ".partial.done") == 0)) + strcmp(xlde->d_name + XLOG_FNAME_LEN, ".partial.done") == 0))) { snprintf(path, sizeof(path), "%s/%s", ARCHSTATDIR, xlde->d_name); if (unlink(path) < 0) -- 2.16.6