From 67797944f91441e0cde1259409eeffcc1bd62e39 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 19 Feb 2024 14:38:08 +0900 Subject: [PATCH 2/2] Add regression test for restart points during promotion This test fails when 7863ee4def65 is reverted, checking that a node is able to properly restart following a crash when a restart point was finishing across a promotion. This is an old bug that had no coverage, and injection points make that cheap to achieve. --- src/backend/access/transam/xlog.c | 7 + src/test/recovery/Makefile | 7 +- src/test/recovery/meson.build | 4 + .../t/041_invalid_checkpoint_after_promote.pl | 159 ++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 src/test/recovery/t/041_invalid_checkpoint_after_promote.pl diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 50c347a679..50b045ff08 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -100,6 +100,7 @@ #include "storage/sync.h" #include "utils/guc_hooks.h" #include "utils/guc_tables.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/relmapper.h" @@ -7536,6 +7537,12 @@ CreateRestartPoint(int flags) CheckPointGuts(lastCheckPoint.redo, flags); + /* + * This location needs to be after CheckPointGuts() to ensure that some + * work has already happened during this checkpoint. + */ + INJECTION_POINT("CreateRestartPoint"); + /* * Remember the prior checkpoint's redo ptr for * UpdateCheckPointDistanceEstimate() diff --git a/src/test/recovery/Makefile b/src/test/recovery/Makefile index 17ee353735..f57baba5e8 100644 --- a/src/test/recovery/Makefile +++ b/src/test/recovery/Makefile @@ -9,12 +9,17 @@ # #------------------------------------------------------------------------- -EXTRA_INSTALL=contrib/pg_prewarm contrib/pg_stat_statements contrib/test_decoding +EXTRA_INSTALL=contrib/pg_prewarm \ + contrib/pg_stat_statements \ + contrib/test_decoding \ + src/test/modules/injection_points subdir = src/test/recovery top_builddir = ../../.. include $(top_builddir)/src/Makefile.global +export enable_injection_points enable_injection_points + # required for 017_shm.pl and 027_stream_regress.pl REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX) export REGRESS_SHLIB diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index bf087ac2a9..e4e0e2b4cc 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -6,6 +6,9 @@ tests += { 'bd': meson.current_build_dir(), 'tap': { 'test_kwargs': {'priority': 40}, # recovery tests are slow, start early + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, 'tests': [ 't/001_stream_rep.pl', 't/002_archiving.pl', @@ -46,6 +49,7 @@ tests += { 't/038_save_logical_slots_shutdown.pl', 't/039_end_of_wal.pl', 't/040_standby_failover_slots_sync.pl', + 't/041_invalid_checkpoint_after_promote.pl', ], }, } diff --git a/src/test/recovery/t/041_invalid_checkpoint_after_promote.pl b/src/test/recovery/t/041_invalid_checkpoint_after_promote.pl new file mode 100644 index 0000000000..b2e9b526ee --- /dev/null +++ b/src/test/recovery/t/041_invalid_checkpoint_after_promote.pl @@ -0,0 +1,159 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Time::HiRes qw(usleep); +use Test::More; + +################################################## +# Test race condition when a restart point is running during a promotion, +# checking that WAL segments are correctly removed in the restart point +# while the promotion finishes. +# +# This test relies on an injection point that causes the checkpointer to +# wait in the middle of a restart point on a standby. The checkpointer +# is awaken to finish its restart point only once the promotion of the +# standby is completed, and the node should be able to restart properly. +################################################## + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('master'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf( + 'postgresql.conf', q[ +checkpoint_timeout = 30s +log_checkpoints = on +restart_after_crash = on +]); +$node_primary->start; + +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Setup a standby +my $node_standby = PostgreSQL::Test::Cluster->new('standby1'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +# Dummy table for the upcoming tests. +$node_primary->safe_psql('postgres', 'checkpoint'); +$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);'); + +# Register a injection point on the standby so as the follow-up +# restart point will wait on it. +$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); +# Wait until the extension has been created on the standby +$node_primary->wait_for_replay_catchup($node_standby); + +# Note that from this point the checkpointer will wait in the middle of +# a restart point on the standby. +$node_standby->safe_psql('postgres', + "SELECT injection_points_attach('CreateRestartPoint', 'wait');"); + +# Execute a restart point on the standby, that we will now be waiting on. +# This needs to be in the background. +my $logstart = -s $node_standby->logfile; +my $psql_session = + $node_standby->background_psql('postgres', on_error_stop => 0); +$psql_session->query_until( + qr/starting_checkpoint/, q( + \echo starting_checkpoint + CHECKPOINT; +)); + +# Switch one WAL segment to make the previous restart point remove the +# segment once the restart point completes. +$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);'); +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();'); +$node_primary->wait_for_replay_catchup($node_standby); + +# Wait until the checkpointer is in the middle of the restart point +# processing, relying on the custom wait event generated in the +# wait callback used in the injection point previously attached. +ok( $node_standby->poll_query_until( + 'postgres', + qq[SELECT count(*) FROM pg_stat_activity + WHERE backend_type = 'checkpointer' AND wait_event = 'injection_wait' ;], + '1'), + 'checkpointer is waiting in restart point' +) or die "Timed out while waiting for checkpointer to run restart point"; + +# Check the logs that the restart point has started on standby. This is +# optional, but let's be sure. +my $log = slurp_file($node_standby->logfile, $logstart); +my $checkpoint_start = 0; +if ($log =~ m/restartpoint starting: immediate wait/) +{ + $checkpoint_start = 1; +} +is($checkpoint_start, 1, 'restartpoint has started'); + +# Trigger promotion during the restart point. +$node_primary->stop; +$node_standby->promote; + +# Update the start position before waking up the checkpointer! +$logstart = -s $node_standby->logfile; + +# Now wake up the checkpointer. +$node_standby->safe_psql('postgres', "SELECT injection_points_wake();"); + +# Wait until the previous restart point completes on the newly-promoted +# standby, checking the logs for that. +my $checkpoint_complete = 0; +for (my $i = 0; $i < 3000; $i++) +{ + my $log = slurp_file($node_standby->logfile, $logstart); + if ($log =~ m/restartpoint complete/) + { + $checkpoint_complete = 1; + last; + } + usleep(100_000); +} +is($checkpoint_complete, 1, 'restart point has completed'); + +# Kill with SIGKILL, forcing all the backends to restart. +my $psql_timeout = IPC::Run::timer(3600); +my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); +my $killme = IPC::Run::start( + [ + 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', + $node_standby->connstr('postgres') + ], + '<', + \$killme_stdin, + '>', + \$killme_stdout, + '2>', + \$killme_stderr, + $psql_timeout); +$killme_stdin .= q[ +SELECT pg_backend_pid(); +]; +$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/; +my $pid = $killme_stdout; +chomp($pid); +my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid); +is($ret, 0, 'killed process with KILL'); +my $stdout; +my $stderr; + +# Wait till server finishes restarting +$node_standby->poll_query_until('postgres', undef, ''); + +# After recovery, the server should be able to start. +($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1'); +is($ret, 0, "psql connect success"); +is($stdout, 1, "psql select 1"); + +done_testing(); -- 2.43.0