diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index 1f4d4ab8b4..5027809a59 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -37,12 +37,10 @@ PostgreSQL documentation Description pg_checksums checks, enables or disables data - checksums in a PostgreSQL cluster. The server - must be shut down cleanly before running - pg_checksums. The exit status is zero if there - are no checksum errors when checking them, and nonzero if at least one - checksum failure is detected. If enabling or disabling checksums, the - exit status is nonzero if the operation failed. + checksums in a PostgreSQL cluster. The exit + status is zero if there are no checksum errors when checking them, and + nonzero if at least one checksum failure is detected. If enabling or + disabling checksums, the exit status is nonzero if the operation failed. @@ -50,6 +48,12 @@ PostgreSQL documentation the cluster, disabling checksums will only update the file pg_control. + + + If the cluster is online, pages that have been (re-)written since the last + checkpoint will not count as checksum failures if they cannot be verified + correctly. 
+ diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 5c185ebed8..f62fdc90a4 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -1,8 +1,7 @@ /*------------------------------------------------------------------------- * * pg_checksums.c - * Checks, enables or disables page level checksums for an offline - * cluster + * Checks, enables or disables page level checksums for a cluster * * Copyright (c) 2010-2019, PostgreSQL Global Development Group * @@ -30,13 +29,20 @@ static int64 files = 0; +static int64 skippedfiles = 0; static int64 blocks = 0; static int64 badblocks = 0; +static int64 skippedblocks = 0; static ControlFileData *ControlFile; +static XLogRecPtr checkpointLSN; +static int64 blocks_last_lsn_update = 0; static char *only_relfilenode = NULL; static bool do_sync = true; static bool verbose = false; +static bool online = false; + +static char *DataDir = NULL; typedef enum { @@ -97,6 +103,23 @@ static const char *const skip[] = { NULL, }; +static void +update_checkpoint_lsn(void) +{ + bool crc_ok; + + ControlFile = get_controlfile(DataDir, progname, &crc_ok); + if (!crc_ok) + { + fprintf(stderr, _("%s: pg_control CRC value is incorrect\n"), progname); + exit(1); + } + + /* Update checkpointLSN and blocks_last_lsn_update with the current value */ + checkpointLSN = ControlFile->checkPoint; + blocks_last_lsn_update = blocks; +} + static bool skipfile(const char *fn) { @@ -116,6 +139,10 @@ scan_file(const char *fn, BlockNumber segmentno) PageHeader header = (PageHeader) buf.data; int f; BlockNumber blockno; + int block_loc = 0; + bool block_retry = false; + bool all_zeroes; + size_t *pagebytes; int flags; Assert(mode == PG_MODE_ENABLE || @@ -126,6 +153,12 @@ scan_file(const char *fn, BlockNumber segmentno) if (f < 0) { + if (online && errno == ENOENT) + { + /* File was removed in the meantime */ + return; + } + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), progname, 
fn, strerror(errno)); exit(1); @@ -136,27 +169,129 @@ scan_file(const char *fn, BlockNumber segmentno) for (blockno = 0;; blockno++) { uint16 csum; - int r = read(f, buf.data, BLCKSZ); + int r = read(f, buf.data + block_loc, BLCKSZ - block_loc); if (r == 0) + { + /* + * We had a short read and got an EOF before we could read the + * whole block, so count this as a skipped block. + */ + if (online && block_loc != 0) + skippedblocks++; break; - if (r != BLCKSZ) + } + if (r < 0) { - fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"), - progname, blockno, fn, r, BLCKSZ); - exit(1); + skippedfiles++; + fprintf(stderr, _("%s: could not read block %u in file \"%s\": %s\n"), + progname, blockno, fn, strerror(errno)); + return; + } + if (r < (BLCKSZ - block_loc)) + { + if (online) + { + /* + * We read only parts of the block, possibly due to a + * concurrent extension of the relation file. Increment + * block_loc by the amount that we read and try again. + */ + block_loc += r; + continue; + } + else + { + skippedfiles++; + fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"), + progname, blockno, fn, r, BLCKSZ); + return; + } } + blocks++; /* New pages have no checksum yet */ if (PageIsNew(header)) + { + /* Check for an all-zeroes page */ + all_zeroes = true; + pagebytes = (size_t *) buf.data; + for (int i = 0; i < (BLCKSZ / sizeof(size_t)); i++) + { + if (pagebytes[i] != 0) + { + all_zeroes = false; + break; + } + } + if (!all_zeroes) + { + fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"), + progname, fn, blockno); + badblocks++; + } + else + { + skippedblocks++; + } continue; + } csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE); if (mode == PG_MODE_CHECK) { if (csum != header->pd_checksum) { + if (online) + { + /* + * Retry the block on the first failure if online. 
If the + * verification is done while the instance is online, it is + * possible that we read the first 4K page of the block just + * before postgres updated the entire block so it ends up + * looking torn to us. We only need to retry once because the + * LSN should be updated to something we can ignore on the next + * pass. If the error happens again then it is a true + * validation failure. + */ + if (!block_retry) + { + /* Seek to the beginning of the failed block */ + if (lseek(f, -BLCKSZ, SEEK_CUR) == -1) + { + skippedfiles++; + fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"), + progname, fn); + return; + } + + /* Set flag so we know a retry was attempted */ + block_retry = true; + + /* Reset loop to validate the block again */ + blockno--; + blocks--; + + continue; + } + + /* + * The checksum verification failed on retry as well. Check if + * the page has been modified since the checkpoint and skip it + * in this case. As a sanity check, demand that the upper + * 32 bits of the LSN are identical in order to skip as a + * guard against a corrupted LSN in the pageheader. 
+ */ + if ((PageGetLSN(buf.data) > checkpointLSN) && + (PageGetLSN(buf.data) >> 32 == checkpointLSN >> 32)) + { + block_retry = false; + skippedblocks++; + continue; + } + } + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X\n"), progname, fn, blockno, csum, header->pd_checksum); @@ -183,6 +318,9 @@ scan_file(const char *fn, BlockNumber segmentno) exit(1); } } + + block_retry = false; + block_loc = 0; } if (verbose) @@ -237,6 +375,12 @@ scan_directory(const char *basedir, const char *subdir) snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); if (lstat(fn, &st) < 0) { + if (online && errno == ENOENT) + { + /* File was removed in the meantime */ + continue; + } + fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), progname, fn, strerror(errno)); exit(1); @@ -280,6 +424,10 @@ scan_directory(const char *basedir, const char *subdir) continue; scan_file(fn, segmentno); + + /* Update checkpointLSN every 1024 * 1024 blocks */ + if (online && ((blocks - blocks_last_lsn_update) > (1024 * 1024))) + update_checkpoint_lsn(); } #ifndef WIN32 else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) @@ -304,7 +452,6 @@ main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; - char *DataDir = NULL; int c; int option_index; bool crc_ok; @@ -398,7 +545,7 @@ main(int argc, char *argv[]) exit(1); } - /* Check if cluster is running */ + /* Check if checksums are enabled */ ControlFile = get_controlfile(DataDir, progname, &crc_ok); if (!crc_ok) { @@ -422,12 +569,10 @@ main(int argc, char *argv[]) exit(1); } + /* Check if cluster is running */ if (ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) - { - fprintf(stderr, _("%s: cluster must be shut down\n"), progname); - exit(1); - } + online = true; if (ControlFile->data_checksum_version == 0 && mode == PG_MODE_CHECK) @@ -450,6 +595,9 @@ main(int argc, char 
*argv[]) exit(1); } + /* Get checkpoint LSN */ + checkpointLSN = ControlFile->checkPoint; + /* Operate on all files if checking or enabling checksums */ if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE) { @@ -459,7 +607,11 @@ main(int argc, char *argv[]) printf(_("Checksum operation completed\n")); printf(_("Files scanned: %s\n"), psprintf(INT64_FORMAT, files)); + if (skippedfiles > 0) + printf(_("Files skipped: %s\n"), psprintf(INT64_FORMAT, skippedfiles)); printf(_("Blocks scanned: %s\n"), psprintf(INT64_FORMAT, blocks)); + if (skippedblocks > 0) + printf(_("Blocks skipped: %s\n"), psprintf(INT64_FORMAT, skippedblocks)); if (mode == PG_MODE_CHECK) { printf(_("Bad checksums: %s\n"), psprintf(INT64_FORMAT, badblocks)); @@ -497,5 +649,10 @@ main(int argc, char *argv[]) printf(_("Checksums disabled in cluster\n")); } + /* skipped blocks or files are considered an error if offline */ + if (!online) + if (skippedblocks > 0 || skippedfiles > 0) + return 1; + return 0; } diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl index 41575c5245..b6b6092da2 100644 --- a/src/bin/pg_checksums/t/002_actions.pl +++ b/src/bin/pg_checksums/t/002_actions.pl @@ -5,8 +5,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 62; - +use Test::More tests => 126; # Utility routine to create and check a table with corrupted checksums # on a wanted tablespace. 
Note that this stops and starts the node @@ -17,6 +16,9 @@ sub check_relation_corruption my $node = shift; my $table = shift; my $tablespace = shift; + my $offset = shift; + my $corrupted_data = shift; + my $description = shift; my $pgdata = $node->data_dir; $node->safe_psql('postgres', @@ -31,21 +33,18 @@ sub check_relation_corruption my $relfilenode_corrupted = $node->safe_psql('postgres', "SELECT relfilenode FROM pg_class WHERE relname = '$table';"); - # Set page header and block size - my $pageheader_size = 24; - my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); $node->stop; # Checksums are correct for single relfilenode as the table is not # corrupted yet. command_ok(['pg_checksums', '--check', '-D', $pgdata, '-r', $relfilenode_corrupted], - "succeeds for single relfilenode on tablespace $tablespace with offline cluster"); + "succeeds for single relfilenode $description with offline cluster"); # Time to create some corruption open my $file, '+<', "$pgdata/$file_corrupted"; - seek($file, $pageheader_size, 0); - syswrite($file, "\0\0\0\0\0\0\0\0\0"); + seek($file, $offset, 0); + syswrite($file, $corrupted_data); close $file; # Checksum checks on single relfilenode fail @@ -54,23 +53,38 @@ sub check_relation_corruption 1, [qr/Bad checksums:.*1/], [qr/checksum verification failed/], - "fails with corrupted data for single relfilenode on tablespace $tablespace"); + "fails with corrupted data for single relfilenode $description"); # Global checksum checks fail as well $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata], 1, [qr/Bad checksums:.*1/], [qr/checksum verification failed/], - "fails with corrupted data on tablespace $tablespace"); + "fails with corrupted data $description"); - # Drop corrupted table again and make sure there is no more corruption. 
+ # Now check online as well $node->start; + + # Checksum checks on single relfilenode fail + $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata, + '-r', $relfilenode_corrupted], + 1, + [qr/Bad checksums:.*1/], + [qr/checksum verification failed/], + "fails with corrupted data for single relfilenode $description with online cluster"); + + # Global checksum checks fail as well + $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata], + 1, + [qr/Bad checksums:.*1/], + [qr/checksum verification failed/], + "fails with corrupted data $description with online cluster"); + + + # Drop corrupted table again and make sure there is no more corruption. $node->safe_psql('postgres', "DROP TABLE $table;"); - $node->stop; $node->command_ok(['pg_checksums', '--check', '-D', $pgdata], "succeeds again after table drop on tablespace $tablespace"); - - $node->start; return; } @@ -143,6 +157,11 @@ command_ok(['pg_checksums', '--check', '-D', $pgdata], command_ok(['pg_checksums', '-D', $pgdata], "verifies checksums as default action"); +# Checksums pass on an online cluster +$node->start; +command_ok(['pg_checksums', '--check', '-D', $pgdata], + "succeeds with online cluster"); + # Specific relation files cannot be requested when action is --disable # or --enable. command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata], @@ -150,13 +169,12 @@ command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata], command_fails(['pg_checksums', '--enable', '-r', '1234', '-D', $pgdata], "fails when relfilenodes are requested and action is --enable"); -# Checks cannot happen with an online cluster -$node->start; -command_fails(['pg_checksums', '--check', '-D', $pgdata], - "fails with online cluster"); +# Set page header and block size +my $pageheader_size = 24; +my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); # Check corruption of table on default tablespace. 
-check_relation_corruption($node, 'corrupt1', 'pg_default'); +check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default"); # Create tablespace to check corruptions in a non-default tablespace. my $basedir = $node->basedir; @@ -165,7 +183,15 @@ mkdir ($tablespace_dir); $tablespace_dir = TestLib::real_dir($tablespace_dir); $node->safe_psql('postgres', "CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';"); -check_relation_corruption($node, 'corrupt2', 'ts_corrupt'); +check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt"); + +# Check corruption in the pageheader with random data in it +my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size; +check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data in pageheader"); + +# Check corruption when the pageheader has been zeroed-out completely +my $zero_data = "\0"x$pageheader_size; +check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader"); # Utility routine to check that pg_checksums is able to detect # correctly-named relation files filled with some corrupted data. @@ -179,23 +205,32 @@ sub fail_corrupt my $file_name = "$pgdata/global/$file"; append_to_file $file_name, "foo"; + $node->stop; + # If the instance is offline, the whole file is skipped and this is + # considered to be an error. 
 $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata], 1, - [qr/^$/], + [qr/Files skipped:.*1/], [qr/could not read block 0 in file.*$file\":/], - "fails for corrupted data in $file"); + "skips corrupted data in $file"); + + $node->start; + # If the instance is online, the block is skipped and this is not + # considered to be an error + $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata], 0, + [qr/Blocks skipped:.*1/], + [qr/^$/], + "skips corrupted data in $file with online cluster"); # Remove file to prevent future lookup errors on conflicts. unlink $file_name; return; } -# Stop instance for the follow-up checks. -$node->stop; - -# Authorized relation files filled with corrupted data cause the -# checksum checks to fail. Make sure to use file names different -# than the previous ones. +# Authorized relation files filled with corrupted data cause the files to be +# skipped and, if the instance is offline, a non-zero exit status. Make sure +# to use file names different than the previous ones. fail_corrupt($node, "99990"); fail_corrupt($node, "99990.123"); fail_corrupt($node, "99990_fsm"); @@ -204,3 +239,6 @@ fail_corrupt($node, "99990_vm"); fail_corrupt($node, "99990_init.123"); fail_corrupt($node, "99990_fsm.123"); fail_corrupt($node, "99990_vm.123"); + +# Stop node again at the end of tests +$node->stop;