From 835abbfb2c30367e76fa68d63b9eb6a817b4eae3 Mon Sep 17 00:00:00 2001 From: Zhao Rui <875941708@qq.com> Date: Tue, 26 Apr 2022 15:08:23 +0800 Subject: [PATCH] Fix primary crash continually with invalid checkpoint after promote --- src/backend/access/transam/xlog.c | 8 +- .../t/027_invalid_checkpoint_after_promote.pl | 113 ++++++++++++++++++ 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 src/test/recovery/t/027_invalid_checkpoint_after_promote.pl diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6208e123e5..7c07a19d91 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9586,6 +9586,9 @@ CreateRestartPoint(int flags) XLogRecPtr endptr; XLogSegNo _logSegNo; TimestampTz xtime; + bool isControlFileUpdated; + + isControlFileUpdated = false; /* Get a local copy of the last safe checkpoint record. */ SpinLockAcquire(&XLogCtl->info_lck); @@ -9717,6 +9720,7 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; UpdateControlFile(); + isControlFileUpdated = true; } LWLockRelease(ControlFileLock); @@ -9768,7 +9772,9 @@ CreateRestartPoint(int flags) if (RecoveryInProgress()) ThisTimeLineID = replayTLI; - RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr); + /* We can only remove old xlog files after control file is updated. */ + if (isControlFileUpdated) + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr); /* * Make more log segments if needed. (Do this after recycling old log diff --git a/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl new file mode 100644 index 0000000000..9cf6db5c4c --- /dev/null +++ b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl @@ -0,0 +1,113 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Time::HiRes qw(usleep nanosleep); +use Test::More tests => 5; + +# initialize primary node +my $node_primary = get_new_node('master'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf( + 'postgresql.conf', q[ +checkpoint_timeout = 30s +max_wal_size = 16GB +log_checkpoints = on +restart_after_crash = on +]); +$node_primary->start; +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# setup a standby +my $node_standby = get_new_node('standby1'); +$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1); +$node_standby->start; + +# dummy table for the upcoming tests. +$node_primary->safe_psql('postgres', 'checkpoint'); +$node_primary->safe_psql('postgres', 'create table test (a int, b varchar(255))'); + +# use background psql to insert batch, just to make checkpoint a little slow. +my $psql_timeout = IPC::Run::timer(3600); +my ($stdin, $stdout, $stderr) = ('', '', ''); +my $psql_primary = IPC::Run::start( + [ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_primary->connstr('postgres') ], + '<', + \$stdin, + '>', + \$stdout, + '2>', + \$stderr, + $psql_timeout); +$stdin .= q[ +INSERT INTO test SELECT i, 'aaaaaaaaaaaaaaaaaaaaaaa' from generate_series(1, 100000000) as i; +]; +$psql_primary->pump_nb(); + +# wait until restartpoint on standy +my $logstart = -s $node_standby->logfile; +my $checkpoint_start = 0; +for (my $i = 0; $i < 3000; $i++) +{ + my $log = TestLib::slurp_file($node_standby->logfile, $logstart); + if ($log =~ m/restartpoint starting: time/) + { + $checkpoint_start = 1; + last; + } + usleep(100_000); +} +is($checkpoint_start, 1, 'restartpoint has started'); + +# promote during restartpoint +$node_primary->stop; +$node_standby->promote; + +# wait until checkpoint on new primary +$logstart = -s $node_standby->logfile; +$checkpoint_start = 0; +for (my $i = 0; $i < 3000; $i++) +{ + my $log = TestLib::slurp_file($node_standby->logfile, $logstart); + if ($log =~ m/restartpoint complete/) + { + $checkpoint_start = 1; + last; + } + usleep(100_000); +} +is($checkpoint_start, 1, 'checkpoint has started'); + +# kill SIGKILL a backend, and all backend will restart. Note that previous checkpoint has not completed. +my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); +my $killme = IPC::Run::start( + [ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_standby->connstr('postgres') ], + '<', + \$killme_stdin, + '>', + \$killme_stdout, + '2>', + \$killme_stderr, + $psql_timeout); +$killme_stdin .= q[ +SELECT pg_backend_pid(); +]; +$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/; +my $pid = $killme_stdout; +chomp($pid); +my $ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid); +is($ret, 0, 'killed process with KILL'); + +# after recovery, the server will not start, and log PANIC: could not locate a valid checkpoint record +for (my $i = 0; $i < 30; $i++) +{ + ($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1'); + last if $ret == 0; + sleep(1); +} +is($ret, 0, "psql connect success"); +is($stdout, 1, "psql select 1"); + +$psql_primary->finish; +$killme->finish; \ No newline at end of file -- 2.32.0.3.g01195cf9f