From ea258ce0f4d2f416f0d33af974abe87219458511 Mon Sep 17 00:00:00 2001 From: Hayato Kuroda Date: Wed, 26 Mar 2025 19:03:50 +0900 Subject: [PATCH v5-PG17-2] Stabilize 035_standby_logical_decoding.pl This test tries to invalidate slots on standby server, by running VACUUM on primary and discarding needed tuples for slots. The problem is that xl_running_xacts records are sotimetimes generated while testing, it advances the catalog_xmin so that the invalidation might not happen in some cases. The fix is to skip using the active slots for some testcases. --- .../t/035_standby_logical_decoding.pl | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index aeb79f51e71..2b7ae4b74e9 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -205,9 +205,6 @@ sub reactive_slots_change_hfs_and_wait_for_xmins change_hot_standby_feedback_and_wait_for_xmins($hsf, $invalidated); - $handle = - make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr); - # reset stat: easier to check for confl_active_logicalslot in pg_stat_database_conflicts $node_standby->psql('testdb', q[select pg_stat_reset();]); } @@ -215,9 +212,8 @@ sub reactive_slots_change_hfs_and_wait_for_xmins # Check invalidation in the logfile and in pg_stat_database_conflicts sub check_for_invalidation { - my ($slot_prefix, $log_start, $test_name) = @_; + my ($slot_prefix, $log_start, $test_name, $checks_active_slot) = @_; - my $active_slot = $slot_prefix . 'activeslot'; my $inactive_slot = $slot_prefix . 'inactiveslot'; # message should be issued @@ -226,18 +222,24 @@ sub check_for_invalidation $log_start), "inactiveslot slot invalidation is logged $test_name"); - ok( $node_standby->log_contains( - "invalidating obsolete replication slot \"$active_slot\"", - $log_start), - "activeslot slot invalidation is logged $test_name"); - - # Verify that pg_stat_database_conflicts.confl_active_logicalslot has been updated - ok( $node_standby->poll_query_until( - 'postgres', - "select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'", - 't'), - 'confl_active_logicalslot updated' - ) or die "Timed out waiting confl_active_logicalslot to be updated"; + if ($checks_active_slot) + { + my $active_slot = $slot_prefix . 'activeslot'; + + ok( $node_standby->log_contains( + "invalidating obsolete replication slot \"$active_slot\"", + $log_start), + "activeslot slot invalidation is logged $test_name"); + + # Verify that pg_stat_database_conflicts.confl_active_logicalslot has + # been updated + ok( $node_standby->poll_query_until( + 'postgres', + "select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'", + 't'), + 'confl_active_logicalslot updated' + ) or die "Timed out waiting confl_active_logicalslot to be updated"; + } } # Launch $sql query, wait for a new snapshot that has a newer horizon and @@ -250,7 +252,8 @@ sub check_for_invalidation # seeing a xl_running_xacts that would advance an active replication slot's # catalog_xmin. Advancing the active replication slot's catalog_xmin # would break some tests that expect the active slot to conflict with -# the catalog xmin horizon. +# the catalog xmin horizon. We ensure that replication slots are not activated +# for tests that might produce this race condition though. sub wait_until_vacuum_can_remove { my ($vac_option, $sql, $to_vac) = @_; @@ -550,10 +553,6 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_', $node_primary->safe_psql('testdb', qq[INSERT INTO decoding_test(x,y) SELECT 100,'100';]); -$node_standby->poll_query_until('testdb', - qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot'] -) or die "replication slot stats of vacuum_full_activeslot not updated"; - # This should trigger the conflict wait_until_vacuum_can_remove( 'full', 'CREATE TABLE conflict_test(x integer, y text); @@ -562,19 +561,11 @@ wait_until_vacuum_can_remove( $node_primary->wait_for_replay_catchup($node_standby); # Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class'); +check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class', 0); # Verify reason for conflict is 'rows_removed' in pg_replication_slots check_slots_conflict_reason('vacuum_full_', 'rows_removed'); -# Ensure that replication slot stats are not removed after invalidation. -is( $node_standby->safe_psql( - 'testdb', - qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot'] - ), - 't', - 'replication slot stats not removed after invalidation'); - $handle = make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr); @@ -651,7 +642,7 @@ wait_until_vacuum_can_remove( $node_primary->wait_for_replay_catchup($node_standby); # Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class'); +check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class', 0); # Verify reason for conflict is 'rows_removed' in pg_replication_slots check_slots_conflict_reason('row_removal_', 'rows_removed'); @@ -687,7 +678,7 @@ $node_primary->wait_for_replay_catchup($node_standby); # Check invalidation in the logfile and in pg_stat_database_conflicts check_for_invalidation('shared_row_removal_', $logstart, - 'with vacuum on pg_authid'); + 'with vacuum on pg_authid', 0); # Verify reason for conflict is 'rows_removed' in pg_replication_slots check_slots_conflict_reason('shared_row_removal_', 'rows_removed'); @@ -711,6 +702,11 @@ $logstart = -s $node_standby->logfile; reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', 'no_conflict_', 0, 1); +# This scenario won't produce the race condition by a xl_running_xacts, so +# activate the slot. See comments atop wait_until_vacuum_can_remove(). +make_slot_active($node_standby, 'no_conflict_', 1, \$stdout, + \$stderr); + # This should not trigger a conflict wait_until_vacuum_can_remove( '', 'CREATE TABLE conflict_test(x integer, y text); @@ -779,7 +775,7 @@ $node_primary->safe_psql('testdb', qq[UPDATE prun SET s = 'E';]); $node_primary->wait_for_replay_catchup($node_standby); # Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('pruning_', $logstart, 'with on-access pruning'); +check_for_invalidation('pruning_', $logstart, 'with on-access pruning', 0); # Verify reason for conflict is 'rows_removed' in pg_replication_slots check_slots_conflict_reason('pruning_', 'rows_removed'); @@ -823,7 +819,7 @@ $node_primary->restart; $node_primary->wait_for_replay_catchup($node_standby); # Check invalidation in the logfile and in pg_stat_database_conflicts -check_for_invalidation('wal_level_', $logstart, 'due to wal_level'); +check_for_invalidation('wal_level_', $logstart, 'due to wal_level', 1); # Verify reason for conflict is 'wal_level_insufficient' in pg_replication_slots check_slots_conflict_reason('wal_level_', 'wal_level_insufficient'); -- 2.43.5