From 4240fd71ced2fca770b802cf08b03cab7a2ee89f Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 28 Jun 2021 12:03:02 +1200 Subject: [PATCH] Remove control file dependency on 512 byte sectors. Instead of making requirements about sector-level atomicity on power loss, use a double-write strategy. We write out two copies of the control file data with a synchronization barrier in between. Only one of them can be torn by power loss, on an overwrite file system. XXX WIP XXX pg_filenode.map has the same problem Discussion: https://postgr.es/m/17064-bb0d7904ef72add3%40postgresql.org --- src/backend/access/transam/xlog.c | 118 +++++++++++++++--- src/common/controldata_utils.c | 81 ++++++------ src/include/catalog/pg_control.h | 8 -- .../recovery/t/026_corrupted_control_file.pl | 94 ++++++++++++++ 4 files changed, 236 insertions(+), 65 deletions(-) create mode 100644 src/test/recovery/t/026_corrupted_control_file.pl diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1b3a3d9bea..3442d5d95a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4660,10 +4660,8 @@ WriteControlFile(void) * Ensure that the size of the pg_control data structure is sane. See the * comments for these symbols in pg_control.h. */ - StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE, - "pg_control is too large for atomic disk writes"); - StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE, - "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE"); + StaticAssertStmt(sizeof(ControlFileData) * 2 <= PG_CONTROL_FILE_SIZE, + "sizeof(ControlFileData) * 2 exceeds PG_CONTROL_FILE_SIZE"); /* * Initialize version and compatibility-check fields @@ -4704,6 +4702,14 @@ WriteControlFile(void) memset(buffer, 0, PG_CONTROL_FILE_SIZE); memcpy(buffer, ControlFile, sizeof(ControlFileData)); + /* + * Make second copy of the control file data. No need to synchronize the + * file in between copies (as UpdateControlFile() does), because this path + * is used during bootstrapping only. + */ + memcpy(buffer + sizeof(ControlFileData), + ControlFile, sizeof(ControlFileData)); + fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); if (fd < 0) @@ -4744,6 +4750,8 @@ WriteControlFile(void) static void ReadControlFile(void) { + ControlFileData control_file_array[2]; + bool crc_good[2]; pg_crc32c crc; int fd; static char wal_segsz_str[20]; @@ -4761,8 +4769,8 @@ ReadControlFile(void) XLOG_CONTROL_FILE))); pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ); - r = read(fd, ControlFile, sizeof(ControlFileData)); - if (r != sizeof(ControlFileData)) + r = read(fd, control_file_array, sizeof(control_file_array)); + if (r != sizeof(control_file_array)) { if (r < 0) ereport(PANIC, @@ -4773,12 +4781,98 @@ ReadControlFile(void) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read file \"%s\": read %d of %zu", - XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); + XLOG_CONTROL_FILE, r, sizeof(control_file_array)))); } pgstat_report_wait_end(); close(fd); + /* + * We read two copies. Usually they are identical, but they might not be + * if there was a crash while UpdateControlFile() was writing them out. + * They were written with a synchronization barrier between them, so we can + * assume that at least one of them was written out fully, without having + * to make any assumptions about things like sectors and atomicity on + * overwrite filesystems. + */ + for (int i = 0; i < 2; ++i) + { + INIT_CRC32C(crc); + COMP_CRC32C(crc, &control_file_array[i], + offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + crc_good[i] = EQ_CRC32C(crc, control_file_array[i].crc); + } + + if (crc_good[0]) + { + /* The first copy is good, so that's the one we'll use to start up. */ + *ControlFile = control_file_array[0]; + + /* + * There is a hazard if the second copy is corrupted, though: + * UpdateControlFile() writes first then second. If we leave the + * second copy corrupted now, another power loss event could corrupt + * the first copy, leaving no good copies on the disk. So, if the + * second copy is corrupted, fix it now. + */ + if (!crc_good[1]) + { + elog(WARNING, "incorrect checksum for second copy of control data, correcting"); + fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + XLOG_CONTROL_FILE))); + r = pg_pwrite(fd, ControlFile, sizeof(*ControlFile), + sizeof(*ControlFile)); + if (r != sizeof(*ControlFile)) + { + if (r < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + XLOG_CONTROL_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not write file \"%s\": wrote %d of %zu", + XLOG_CONTROL_FILE, r, sizeof(*ControlFile)))); + } + if (pg_fsync(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + XLOG_CONTROL_FILE))); + close(fd); + } + } + else if (crc_good[1]) + { + /* The second copy is good, so that's the one we'll use to start up. */ + *ControlFile = control_file_array[1]; + elog(WARNING, "incorrect checksum for first copy of control data, using second copy"); + + /* + * We don't have to worry about correcting the corrupted first copy. + * The next call to UpdateControlFile() will overwrite the first copy + * first, so there won't be a risk of both copies being corrupted on + * overwrite filesystems. + */ + } + else + { + /* + * Both copies failed CRC checks, so we can't start up. We'll use the + * first one for the checks on file format version that we perform + * next, because only its version is expected to be at the same offset + * in all versions. + */ + *ControlFile = control_file_array[0]; + elog(WARNING, "incorrect checksum for both copies of control data"); + } + /* * Check for expected pg_control format version. If this is wrong, the * CRC check will likely fail because we'll be checking the wrong number @@ -4803,14 +4897,8 @@ ReadControlFile(void) ControlFile->pg_control_version, PG_CONTROL_VERSION), errhint("It looks like you need to initdb."))); - /* Now check the CRC. */ - INIT_CRC32C(crc); - COMP_CRC32C(crc, - (char *) ControlFile, - offsetof(ControlFileData, crc)); - FIN_CRC32C(crc); - - if (!EQ_CRC32C(crc, ControlFile->crc)) + /* Now report CRC failure. */ + if (!crc_good[0] && !crc_good[1]) ereport(FATAL, (errmsg("incorrect checksum in control file"))); diff --git a/src/common/controldata_utils.c b/src/common/controldata_utils.c index 7d4af7881e..844f1639fa 100644 --- a/src/common/controldata_utils.c +++ b/src/common/controldata_utils.c @@ -44,6 +44,10 @@ * Get controlfile values. The result is returned as a palloc'd copy of the * control file data. * + * XXX Unlike the backend's ReadControlFile(), this function does not currently + * consider the second copy of ControlFileData and will simply report CRC check + * failure if the first copy is corrupted. + * * crc_ok_p can be used by the caller to see whether the CRC of the control * file data is correct. */ @@ -157,16 +161,13 @@ update_controlfile(const char *DataDir, ControlFileData *ControlFile, bool do_sync) { int fd; - char buffer[PG_CONTROL_FILE_SIZE]; char ControlFilePath[MAXPGPATH]; /* * Apply the same static assertions as in backend's WriteControlFile(). */ - StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE, - "pg_control is too large for atomic disk writes"); - StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE, - "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE"); + StaticAssertStmt(sizeof(ControlFileData) * 2 <= PG_CONTROL_FILE_SIZE, + "sizeof(ControlFileData) * 2 exceeds PG_CONTROL_FILE_SIZE"); /* Recalculate CRC of control file */ INIT_CRC32C(ControlFile->crc); @@ -175,14 +176,6 @@ update_controlfile(const char *DataDir, offsetof(ControlFileData, crc)); FIN_CRC32C(ControlFile->crc); - /* - * Write out PG_CONTROL_FILE_SIZE bytes into pg_control by zero-padding - * the excess over sizeof(ControlFileData), to avoid premature EOF related - * errors when reading it. - */ - memset(buffer, 0, PG_CONTROL_FILE_SIZE); - memcpy(buffer, ControlFile, sizeof(ControlFileData)); - snprintf(ControlFilePath, sizeof(ControlFilePath), "%s/%s", DataDir, XLOG_CONTROL_FILE); #ifndef FRONTEND @@ -205,47 +198,51 @@ update_controlfile(const char *DataDir, } #endif - errno = 0; -#ifndef FRONTEND - pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE); -#endif - if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE) + /* Write out two copies with a sychronization barrier in between. */ + for (int i = 0; i < 2; ++i) { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - -#ifndef FRONTEND - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write file \"%s\": %m", - ControlFilePath))); -#else - pg_log_fatal("could not write file \"%s\": %m", ControlFilePath); - exit(EXIT_FAILURE); -#endif - } + errno = 0; #ifndef FRONTEND - pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE); #endif + if (write(fd, ControlFile, sizeof(*ControlFile)) != sizeof(*ControlFile)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; - if (do_sync) - { #ifndef FRONTEND - pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE); - if (pg_fsync(fd) != 0) ereport(PANIC, (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", + errmsg("could not write file \"%s\": %m", ControlFilePath))); - pgstat_report_wait_end(); #else - if (fsync(fd) != 0) - { - pg_log_fatal("could not fsync file \"%s\": %m", ControlFilePath); + pg_log_fatal("could not write file \"%s\": %m", ControlFilePath); exit(EXIT_FAILURE); +#endif } +#ifndef FRONTEND + pgstat_report_wait_end(); #endif + + if (do_sync) + { +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + ControlFilePath))); + pgstat_report_wait_end(); +#else + if (fsync(fd) != 0) + { + pg_log_fatal("could not fsync file \"%s\": %m", ControlFilePath); + exit(EXIT_FAILURE); + } +#endif + } } if (close(fd) != 0) diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index e3f48158ce..24d98441d1 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -230,14 +230,6 @@ typedef struct ControlFileData pg_crc32c crc; } ControlFileData; -/* - * Maximum safe value of sizeof(ControlFileData). For reliability's sake, - * it's critical that pg_control updates be atomic writes. That generally - * means the active data can't be more than one disk sector, which is 512 - * bytes on common hardware. Be very careful about raising this limit. - */ -#define PG_CONTROL_MAX_SAFE_SIZE 512 - /* * Physical size of the pg_control file. Note that this is considerably * bigger than the actually used size (ie, sizeof(ControlFileData)). diff --git a/src/test/recovery/t/026_corrupted_control_file.pl b/src/test/recovery/t/026_corrupted_control_file.pl new file mode 100644 index 0000000000..03a42c79d0 --- /dev/null +++ b/src/test/recovery/t/026_corrupted_control_file.pl @@ -0,0 +1,94 @@ + +# Copyright (c) 2021, PostgreSQL Global Development Group + +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use Config; + +plan tests => 8; + +# find $pat in logfile of $node after $off-th byte +# XXX: function copied from t/019_repslot_limit.pl +sub find_in_log +{ + my ($node, $pat, $off) = @_; + + $off = 0 unless defined $off; + my $log = TestLib::slurp_file($node->logfile); + return 0 if (length($log) <= $off); + + $log = substr($log, $off); + + return $log =~ m/$pat/; +} + +my $psql_timeout = IPC::Run::timer(60); + +my $node = get_new_node('test'); +my $control_file = $node->data_dir . "/global/pg_control"; +$node->init(); + +# We start up without warnings if there is no corruption. +my $logstart = -s $node->logfile; +$node->start(); +$node->stop(); +ok(!find_in_log($node, "incorrect checksum"), + "both copies ok #1"); + +# Corrupt the first copy of the control data... we can still start up, using +# the second copy. +open(my $file, '+<', $control_file); +print $file "................"; +close($file); +$logstart = -s $node->logfile; +$node->start(); +$node->stop(); +ok(find_in_log($node, "incorrect checksum for first copy of control data", $logstart), + "first copy bad"); + +# There should be no corruption now. +$logstart = -s $node->logfile; +$node->start(); +$node->stop(); +ok(!find_in_log($node, "incorrect checksum", $logstart), + "both copies ok #2"); + +# Corrupt the second copy of the control data. We don't want to hard-code its +# location in this test, so we'll just grab a chunk of bytes from the front of +# the control file, and then search for the second instance of that string and +# corrupt it. +my $control_file_contents = slurp_file($control_file); +my $header = substr $control_file_contents, 0, 64; +my $index = rindex $control_file_contents, $header; +ok($index > 0, "second copy exists"); +open($file, '+<', $control_file); +seek($file, $index, 0); +print $file "................"; +close($file); +$logstart = -s $node->logfile; +$node->start(); +$node->stop(); +ok(find_in_log($node, "incorrect checksum for second copy of control data, correcting", $logstart), + "second copy bad"); + +# There should be no corruption now. +$logstart = -s $node->logfile; +$node->start(); +$node->stop(); +ok(!find_in_log($node, "incorrect checksum", $logstart), + "both copies ok #3"); + +# Corrupt both copies. Now we can't start up! +my $control_file_size = -s $control_file; +open($file, '+<', $control_file); +print $file ("." x $control_file_size); +close($file); +$logstart = -s $node->logfile; +$node->start(fail_ok => 1); +ok(find_in_log($node, "incorrect checksum for both copies of control data", $logstart), + "both copies bad 1/2"); +ok(find_in_log($node, "FATAL: database files are incompatible with server", $logstart), + "both copies bad 2/2"); -- 2.30.2