From c6ebfaf3a967d97af3aa575df8c4190fad2117ee Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 20 Jun 2024 16:38:30 -0400
Subject: [PATCH v2 1/2] Test that vacuum removes tuples older than OldestXmin

If vacuum fails to prune a tuple killed before OldestXmin, it will
decide to freeze its xmax and later error out in pre-freeze checks.
---
 src/test/recovery/meson.build                 |   1 +
 .../recovery/t/043_vacuum_horizon_floor.pl    | 236 ++++++++++++++++++
 2 files changed, 237 insertions(+)
 create mode 100644 src/test/recovery/t/043_vacuum_horizon_floor.pl

diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index b1eb77b1ec1..1d55d6bf560 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -51,6 +51,7 @@ tests += {
       't/040_standby_failover_slots_sync.pl',
       't/041_checkpoint_at_promote.pl',
       't/042_low_level_backup.pl',
+      't/043_vacuum_horizon_floor.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/043_vacuum_horizon_floor.pl b/src/test/recovery/t/043_vacuum_horizon_floor.pl
new file mode 100644
index 00000000000..1b82d935be5
--- /dev/null
+++ b/src/test/recovery/t/043_vacuum_horizon_floor.pl
@@ -0,0 +1,236 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use Test::More;
+
+# Test that vacuum prunes away all dead tuples killed before OldestXmin
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 'physical');
+
+$node_primary->append_conf(
+	'postgresql.conf', qq[
+hot_standby_feedback = on
+log_recovery_conflict_waits = true
+autovacuum = off
+maintenance_work_mem = 1024
+]);
+$node_primary->start;
+
+my $node_replica = PostgreSQL::Test::Cluster->new('standby');
+
+$node_primary->backup('my_backup');
+$node_replica->init_from_backup($node_primary, 'my_backup',
+	has_streaming => 1);
+
+$node_replica->start;
+
+my $test_db = "test_db";
+$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
+
+# Save the original connection info for later use
+my $orig_conninfo = $node_primary->connstr();
+
+my $table1 = "vac_horizon_floor_table";
+
+# Long-running Primary Session A
+my $psql_primaryA =
+  $node_primary->background_psql($test_db, on_error_stop => 1);
+
+# Long-running Primary Session B
+my $psql_primaryB  =
+  $node_primary->background_psql($test_db, on_error_stop => 1);
+
+# Insert and update enough rows that we force at least one round of index
+# vacuuming before getting to a dead tuple which was killed after the standby
+# is disconnected.
+#
+# We need multiple index vacuuming passes to repro because after the standby
+# reconnects to the primary, our backend's GlobalVisStates will not have been
+# updated with the new horizon until something forces them to be updated.
+#
+# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the
+# end of a round of index vacuuming, updating the backend's GlobalVisState
+# and, in our case, moving maybe_needed backwards.
+#
+# Then vacuum's first pass will continue and pruning will find our later
+# inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to
+# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin.
+$node_primary->safe_psql($test_db, qq[
+	CREATE TABLE ${table1}(col1 int) with (autovacuum_enabled=false, fillfactor=10);
+	INSERT INTO $table1 VALUES(7);
+	INSERT INTO $table1 SELECT generate_series(1, 400000) % 3;
+	CREATE INDEX on ${table1}(col1);
+	UPDATE $table1 SET col1 = 3 WHERE col1 = 0;
+	INSERT INTO $table1 VALUES(7);
+]);
+
+# We will later move the primary forward while the standby is disconnected. For
+# now, however, there is no reason not to wait for the standby to catch up.
+my $primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+# Test that the WAL receiver is up and running.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 't');
+
+# Set primary_conninfo to something invalid on the replica and reload the
+# config. Once the config is reloaded, the startup process will force the WAL
+# receiver to restart and it will be unable to reconnect because of the invalid
+# connection information.
+$node_replica->safe_psql($test_db, qq[
+		ALTER SYSTEM SET primary_conninfo = '';
+		SELECT pg_reload_conf();
+	]);
+
+# Wait until the WAL receiver has shut down and been unable to start up again.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 'f');
+
+# Now insert and update a tuple which will be visible to the vacuum on the
+# primary but which will have xmax newer than the oldest xmin on the standby
+# that was recently disconnected.
+my $res = $psql_primaryA->query_safe(
+	qq[
+		INSERT INTO $table1 VALUES (99);
+		UPDATE $table1 SET col1 = 100 WHERE col1 = 99;
+		SELECT 'after_update';
+        ]
+	);
+
+# Make sure the UPDATE finished
+like($res, qr/^after_update$/m, "UPDATE occurred on primary session A");
+
+# Open a cursor on the primary whose pin will keep VACUUM from getting a
+# cleanup lock on the first page of the relation. We want VACUUM to be able to
+# start, calculate initial values for OldestXmin and GlobalVisState and then be
+# unable to proceed with pruning our dead tuples. This will allow us to
+# reconnect the standby and push the horizon back before we start actual
+# pruning and vacuuming.
+my $primary_cursor1 = "vac_horizon_floor_cursor1";
+
+# The first value inserted into the table was a 7, so FETCH FORWARD should
+# return a 7. That's how we know the cursor has a pin.
+$res = $psql_primaryB->query_safe(
+	qq[
+        BEGIN;
+			DECLARE $primary_cursor1 CURSOR FOR SELECT col1 FROM $table1 WHERE col1 = 7;
+			FETCH $primary_cursor1;
+        ]
+	);
+
+is($res, 7, qq[Cursor query returned $res. Expected value 7.]);
+
+
+# Get the PID of the session which will run the VACUUM FREEZE so that we can
+# use it to filter pg_stat_activity later.
+my $vacuum_pid = $psql_primaryA->query_safe("SELECT pg_backend_pid();");
+
+# Now start a VACUUM FREEZE on the primary. It will call vacuum_get_cutoffs()
+# and establish values of OldestXmin and GlobalVisState which are newer than
+# all of our dead tuples. Then it will be unable to get a cleanup lock to start
+# pruning, so it will hang.
+$psql_primaryA->{stdin} .= qq[
+		VACUUM FREEZE $table1;
+		\\echo VACUUM
+        ];
+
+# Make sure the VACUUM command makes it to the server.
+$psql_primaryA->{run}->pump_nb();
+
+# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is just
+# waiting on the lock to start vacuuming. We don't want the standby to
+# re-establish a connection to the primary and push the horizon back until
+# we've saved initial values in GlobalVisState and calculated OldestXmin.
+$node_primary->poll_query_until($test_db,
+	qq[
+	SELECT count(*) >= 1 FROM pg_stat_activity
+		WHERE pid = $vacuum_pid
+		AND wait_event = 'BufferPin';
+	],
+	't');
+
+# Ensure the WAL receiver is still not active on the replica.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 'f');
+
+# Allow the WAL receiver connection to re-establish.
+$node_replica->safe_psql(
+	$test_db, qq[
+		ALTER SYSTEM SET primary_conninfo = '$orig_conninfo';
+		SELECT pg_reload_conf();
+	]);
+
+# Ensure the new WAL receiver has connected.
+$node_replica->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_wal_receiver);] , 't');
+
+# Once the WAL sender is shown on the primary, the replica should have
+# connected with the primary and pushed the horizon backward. Primary Session A
+# won't see that until the VACUUM FREEZE proceeds and does its first round of
+# index vacuuming.
+$node_primary->poll_query_until($test_db, qq[
+	select exists (select * from pg_stat_replication);] , 't');
+
+# Move the cursor forward to the next 7. We inserted the 7 much later, so
+# advancing the cursor should allow vacuum to proceed vacuuming most pages of
+# the relation. Because we set maintanence_work_mem sufficiently low, we expect
+# that a round of index vacuuming has happened and that the vacuum is now
+# waiting for the cursor to release its pin on the last page of the relation.
+$res = $psql_primaryB->query_safe("FETCH $primary_cursor1");
+is($res, 7, qq[Cursor query returned $res from second fetch. Expected value 7.]);
+
+# Prevent the test from incorrectly passing by confirming that we did indeed do
+# a pass of index vacuuming.
+$node_primary->poll_query_until($test_db, qq[
+	SELECT index_vacuum_count > 0
+	FROM pg_stat_progress_vacuum
+	WHERE datname='$test_db' AND relid::regclass = '$table1'::regclass;
+	] , 't');
+
+# Commit the cursor so that the VACUUM can finish.
+$psql_primaryB->query_until(
+		qr/^commit$/m,
+		qq[
+			COMMIT;
+			\\echo commit
+        ]
+	);
+
+# VACUUM proceeds with pruning and does a visibility check on each tuple. In
+# older versions of Postgres, pruning found our final dead tuple
+# non-removable (HEAPTUPLE_RECENTLY_DEAD) since its xmax is after the new
+# value of maybe_needed. Then heap_prepare_freeze_tuple() would decide the
+# tuple xmax should be frozen because it precedes OldestXmin. Vacuum would
+# then error out in heap_pre_freeze_checks() with "cannot freeze committed
+# xmax". This was fixed by changing pruning to find all
+# HEAPTUPLE_RECENTLY_DEAD tuples with xmaxes preceding OldestXmin
+# HEAPTUPLE_DEAD and removing them.
+
+# With the fix, VACUUM should finish successfully, incrementing the table
+# vacuum_count.
+$node_primary->poll_query_until($test_db,
+	qq[
+	SELECT vacuum_count > 0
+	FROM pg_stat_all_tables WHERE relname = '${table1}';
+	]
+	, 't');
+
+$primary_lsn = $node_primary->lsn('flush');
+
+# Make sure something causes us to flush
+$node_primary->safe_psql($test_db, "INSERT INTO $table1 VALUES (1);");
+
+# Nothing on the replica should cause a recovery conflict, so this should
+# finish successfully.
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+## Shut down psqls
+$psql_primaryA->quit;
+$psql_primaryB->quit;
+
+$node_replica->stop();
+$node_primary->stop();
+
+done_testing();
-- 
2.34.1