From 42f2103ebc11c4fa01c3efcbefec6c45cdd67093 Mon Sep 17 00:00:00 2001
From: Craig Ringer
Date: Thu, 10 Mar 2016 10:50:59 +0800
Subject: [PATCH 7/7] Introduce TAP recovery tests for failover slots

---
 src/test/recovery/t/007_failover_slots.pl | 644 ++++++++++++++++++++++++++++++
 1 file changed, 644 insertions(+)
 create mode 100644 src/test/recovery/t/007_failover_slots.pl

diff --git a/src/test/recovery/t/007_failover_slots.pl b/src/test/recovery/t/007_failover_slots.pl
new file mode 100644
index 0000000..b8ebda0
--- /dev/null
+++ b/src/test/recovery/t/007_failover_slots.pl
@@ -0,0 +1,644 @@
#
# Test failover slots
#
use strict;
use warnings;
use bigint;
use PostgresNode;
use TestLib;
use Test::More;
use RecursiveCopy;
use File::Copy;
use File::Basename qw(basename);
use List::Util qw();
use Data::Dumper;
use IPC::Run qw();

use Carp 'verbose';
$SIG{__DIE__} = sub { Carp::confess(@_) };

my $verbose = 0;

# Convert a text LSN like '1/12345678' into a single number so LSNs can be
# compared arithmetically, e.g. lsn_to_bigint('1/0') == 2**32.
sub lsn_to_bigint
{
	my ($lsn) = @_;
	my ($high, $low) = split("/", $lsn);
	return hex($high) * 2**32 + hex($low);
}

# Return a hashref of the pg_replication_slots row for the named slot on
# the given node, dying if the row doesn't have the expected shape.
sub get_slot_info
{
	my ($node, $slot_name) = @_;

	my $esc_slot_name = $slot_name;
	$esc_slot_name =~ s/'/''/g;
	my @selectlist = ('slot_name', 'plugin', 'slot_type', 'database', 'active_pid', 'xmin', 'catalog_xmin', 'restart_lsn', 'confirmed_flush_lsn', 'failover');
	my $row = $node->safe_psql('postgres', "SELECT " . join(', ', @selectlist) . " FROM pg_replication_slots WHERE slot_name = '$esc_slot_name';");
	chomp $row;
	my @fields = split('\|', $row, -1);
	if (scalar @fields != scalar @selectlist)
	{
		diag "Invalid row is: '$row'";
		die "Select-list '@selectlist'(" . scalar(@selectlist) . ") didn't match length of result-list '@fields'(" . scalar(@fields) . ")";
	}
	my %slotinfo;
	for (my $i = 0; $i < scalar @selectlist; $i++)
	{
		$slotinfo{$selectlist[$i]} = $fields[$i];
	}
	return \%slotinfo;
}

sub diag_slotinfo
{
	my ($info, $msg) = @_;
	return unless $verbose;
	diag "slot " . $info->{slot_name} . " ($msg): " . Dumper($info);
}

sub wait_for_catchup
{
	my ($node_master, $node_replica) = @_;

	my $master_lsn = $node_master->safe_psql('postgres', 'SELECT pg_current_xlog_insert_location()');
	diag "waiting for " . $node_replica->name . " to catch up to $master_lsn on " . $node_master->name if $verbose;
	my $ret = $node_replica->poll_query_until('postgres',
		"SELECT pg_last_xlog_replay_location() >= '$master_lsn'::pg_lsn;");
	BAIL_OUT('replica failed to catch up') unless $ret;
	my $replica_lsn = $node_replica->safe_psql('postgres', 'SELECT pg_last_xlog_replay_location()');
	diag "Replica is caught up to $replica_lsn, past required LSN $master_lsn" if $verbose;
}

# Decode the ReplicationSlot records from the given timeline's WAL on a
# (stopped) node using pg_xlogdump and return the parsed slot updates.
sub read_slot_updates_from_xlog
{
	my ($node, $timeline) = @_;
	my ($stdout, $stderr) = ('', '');
	# Look at master xlogs and examine sequence advances
	my $wal_pattern = sprintf("%s/pg_xlog/%08X" . ("?" x 16), $node->data_dir, $timeline);
	my @wal = glob $wal_pattern;
	my $firstwal = List::Util::minstr(@wal);
	# pg_xlogdump finds the end segment relative to the first one's
	# directory, so only the first needs a full path.
	my $lastwal = basename(List::Util::maxstr(@wal));
	diag "decoding xlog on " . $node->name . " from $firstwal to $lastwal" if $verbose;
	IPC::Run::run ['pg_xlogdump', $firstwal, $lastwal], '>', \$stdout, '2>', \$stderr;
	like($stderr, qr/invalid record length at [0-9A-F]+\/[0-9A-F]+: wanted 24, got 0/,
		'pg_xlogdump exits with expected error');
	my @slots = grep(/ReplicationSlot/, split(/\n/, $stdout));

	# Parse the dumped xlog data. Each line of interest looks roughly like
	# (values here are illustrative only):
	#   ... lsn: 0/016A2638, prev 0/016A2600, desc: UPDATE of slot
	#   bb_failover with restart 0/016A25D0 and xid 0 confirmed to 0/016A2638
	my @slot_updates = ();
	for my $slot (@slots)
	{
		if ($slot =~ /lsn: ([[:xdigit:]]{1,8}\/[[:xdigit:]]{1,8}), prev [[:xdigit:]]{1,8}\/[[:xdigit:]]{1,8}, desc: UPDATE of slot (\w+) with restart ([[:xdigit:]]{1,8}\/[[:xdigit:]]{1,8}) and xid ([[:digit:]]+) confirmed to ([[:xdigit:]]{1,8}\/[[:xdigit:]]{1,8})/)
		{
			my %slot_update = (
				action => 'UPDATE',
				log_lsn => $1, slot_name => $2, restart_lsn => $3,
				xid => $4, confirm_lsn => $5
			);
			diag "Replication slot create/advance: $slot_update{slot_name} advanced to $slot_update{confirm_lsn} with restart $slot_update{restart_lsn} and $slot_update{xid} in xlog entry $slot_update{log_lsn}" if $verbose;
			push @slot_updates, \%slot_update;
		}
		elsif ($slot =~ /DELETE/)
		{
			diag "Replication slot delete: $slot" if $verbose;
		}
		else
		{
			die "Slot xlog entry didn't match pattern: $slot";
		}
	}
	return \@slot_updates;
}

# Check one parsed slot-update WAL entry against the expected slot name
# and optional LSN bounds.
sub check_slot_wal_update
{
	my ($entry, $slotname, %params) = @_;

	ok(defined($entry), "xlog entry exists for slot $slotname");
	SKIP: {
		skip 'Expected xlog entry was undef' unless defined($entry);
		my %entry = %{$entry}; undef $entry;
		diag "Examining decoded slot update xlog entry: " . Dumper(\%entry) if $verbose;
		is($entry{action}, 'UPDATE', "$slotname: action is an update");
		is($entry{slot_name}, $slotname, "$slotname: action affects slot $slotname");

		cmp_ok(lsn_to_bigint($entry{restart_lsn}), "<=",
			lsn_to_bigint($entry{log_lsn}),
			"$slotname: restart_lsn is no greater than LSN when logged");

		cmp_ok(lsn_to_bigint($entry{confirm_lsn}), "<=",
			lsn_to_bigint($entry{log_lsn}),
			"$slotname: confirm_lsn is no greater than LSN when logged");

		cmp_ok(lsn_to_bigint($entry{confirm_lsn}), ">=",
			lsn_to_bigint($entry{restart_lsn}),
			"$slotname: confirm_lsn equal to or ahead of restart_lsn")
			if $entry{confirm_lsn} && $entry{confirm_lsn} ne '0/0';

		cmp_ok(lsn_to_bigint($entry{restart_lsn}), "<=",
			lsn_to_bigint($params{expect_max_restart_lsn}),
			"$slotname: restart_lsn is at or before expected")
			if ($params{expect_max_restart_lsn});

		cmp_ok(lsn_to_bigint($entry{restart_lsn}), ">=",
			lsn_to_bigint($params{expect_min_restart_lsn}),
			"$slotname: restart_lsn is at or after expected")
			if ($params{expect_min_restart_lsn});

		cmp_ok(lsn_to_bigint($entry{confirm_lsn}), "<=",
			lsn_to_bigint($params{expect_max_confirm_lsn}),
			"$slotname: confirm_lsn is at or before expected")
			if ($params{expect_max_confirm_lsn});

		cmp_ok(lsn_to_bigint($entry{confirm_lsn}), ">=",
			lsn_to_bigint($params{expect_min_confirm_lsn}),
			"$slotname: confirm_lsn is at or after expected")
			if ($params{expect_min_confirm_lsn});
	}
}
output"); + } + return $stderr; +} + +sub wait_for_end_of_recovery +{ + my ($node) = @_; + $node->poll_query_until('postgres', + "SELECT NOT pg_is_in_recovery();"); +} + +# Launch pg_xlogdump as a background proc and return the IPC::Run handle for it +# as well as the proc's stdout and stderr scalar refs as well as the path to +# where the xlogs are written. +sub start_pg_receivexlog +{ + my ($node, $slotname) = @_; + my ($stdout, $stderr); + + my $outdir = $node->basedir . '/xl_' . $slotname; + mkdir($outdir); + + my @cmd = ("pg_receivexlog", "--verbose", "-S", $slotname, "-D", $outdir, "--no-loop", "--dbname", $node->connstr); + diag "Running '@cmd'" if $verbose; + + my $proc = IPC::Run::start \@cmd, '>', \$stdout, '2>', \$stderr; + + die $! unless defined($proc); + + return ($proc, \$stdout, \$stderr, $outdir); +} + +sub test_phys_replay +{ + my ($node, $slotname, $start_tli) = @_; + my ($recvxlog, $stdout, $stderr, $outdir) = start_pg_receivexlog($node, $slotname); + # pg_receivexlog doesn't give us a --nowait option so we have to just wait a + # bit then kill it. + sleep(1); + $recvxlog->signal("TERM"); + sleep(1); + $recvxlog->finish; + # FIXME: Not portable, we should use IPC::Signal but that's in CPAN because + # apparently Perl doesn't have a signo/signame mapping built-in. WTF... + is($recvxlog->full_result, "15", 'pg_recvlog exited due to SIGTERM'); + chomp $$stderr; + my $expected_stderr_re = "^pg_receivexlog: starting log streaming at ([[:xdigit:]]{1,8})/([[:xdigit:]]{1,8}) \\(timeline ($start_tli)\\)"; + like($$stderr, "/$expected_stderr_re/", "reported start location to stderr"); + if ($$stderr =~ $expected_stderr_re) + { + my ($cap_lsn_high, $cap_lsn_low, $cap_tli) = ($1, $2, $3); + diag "pg_xlogdump streamed xlog from node " . $node->name . " starting at $cap_lsn_high/$cap_lsn_low on timeline $cap_tli" if $verbose; + is($cap_tli, $start_tli, 'replay started on expected timeline') if ($start_tli); + } + is($$stdout, '', "no stdout"); + my @xlogs = glob $outdir . 
"/*"; + cmp_ok(scalar(@xlogs), "ge", 1, "Received at least one segment from $slotname"); +} + + +my ($stdout, $stderr, $ret, $slotinfo, $outdir, $proc); + +# Initialize master node +my $node_master = get_new_node('master'); +$node_master->init(allows_streaming => 1, has_archiving => 1); +$node_master->append_conf('postgresql.conf', "wal_level = 'logical'\n"); +$node_master->append_conf('postgresql.conf', "max_replication_slots = 8\n"); +$node_master->append_conf('postgresql.conf', "max_wal_senders = 8\n"); +#$node_master->append_conf('postgresql.conf', "log_min_messages = 'debug2'\n"); +$node_master->dump_info; +$node_master->start; + +my $master_beforecreate_bb_lsn = $node_master->safe_psql('postgres', + "SELECT pg_current_xlog_insert_location()"); + +$node_master->safe_psql('postgres', +"SELECT pg_create_logical_replication_slot('bb_failover', 'test_decoding', true);" +); +my $bb_beforeconsume_si = get_slot_info($node_master, 'bb_failover'); +diag_slotinfo $bb_beforeconsume_si, 'bb_beforeconsume'; + +# Create non-failover slot to make sure it isn't replicated +$node_master->safe_psql('postgres', +"SELECT pg_create_logical_replication_slot('bb', 'test_decoding');" +); + +# Failover slots work for physical slots too +$node_master->safe_psql('postgres', +"SELECT pg_create_physical_replication_slot('bb_phys_failover', false, true);"); +$node_master->safe_psql('postgres', +"SELECT pg_create_physical_replication_slot('bb_phys');"); + +my $bb_phys_beforeconsume_si = get_slot_info($node_master, 'bb_phys_failover'); +diag_slotinfo $bb_phys_beforeconsume_si, 'bb_phys_beforeconsume'; + +$node_master->safe_psql('postgres', "CREATE TABLE decoding(blah text);"); +$node_master->safe_psql('postgres', + "INSERT INTO decoding(blah) VALUES ('consumed');"); +($ret, $stdout, $stderr) = $node_master->psql('postgres', + "SELECT data FROM pg_logical_slot_get_changes('bb_failover', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');"); +is($ret, 0, 'replaying from bb_failover on master is successful'); +is( $stdout, q(BEGIN +table public.decoding: INSERT: blah[text]:'consumed' +COMMIT), 'decoded expected data from slot bb_failover on master'); +is($stderr, '', 'replay from slot bb_failover produces no stderr'); + +my $bb_afterconsume_si = get_slot_info($node_master, 'bb_failover'); +diag_slotinfo $bb_afterconsume_si, 'bb_afterconsume'; + +($ret, $stdout, $stderr) = $node_master->psql('postgres', + "SELECT data FROM pg_logical_slot_get_changes('bb_failover', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');"); +is ($ret, 0, 'no error reading empty slot changes after get'); +is ($stdout, '', 'no new changes to read from slot after get'); + +cmp_ok(lsn_to_bigint($bb_afterconsume_si->{confirmed_flush_lsn}), + "gt", + lsn_to_bigint($bb_beforeconsume_si->{confirmed_flush_lsn}), + "confirm lsn on bb_failover advanced on master after replay"); + +$node_master->safe_psql('postgres', 'CHECKPOINT;'); + +$node_master->safe_psql('postgres', + "INSERT INTO decoding(blah) VALUES ('beforebb');"); +$node_master->safe_psql('postgres', 'CHECKPOINT;'); + +my $backup_name = 'b1'; +$node_master->backup_fs_hot($backup_name); + +my $node_replica = get_new_node('replica'); +$node_replica->init_from_backup( + $node_master, $backup_name, + has_streaming => 1, + has_restoring => 1); +$node_replica->start; + +my $master_beforecreate_ab_lsn = $node_master->safe_psql('postgres', + "SELECT pg_current_xlog_insert_location()"); + +$node_master->safe_psql('postgres', +"SELECT pg_create_logical_replication_slot('ab_failover', 
$node_master->safe_psql('postgres',
	"SELECT pg_create_logical_replication_slot('ab_failover', 'test_decoding', true);"
);

my $ab_beforeconsume_si = get_slot_info($node_master, 'ab_failover');
diag_slotinfo $ab_beforeconsume_si, 'ab_beforeconsume';

$node_master->safe_psql('postgres',
	"SELECT pg_create_logical_replication_slot('ab', 'test_decoding');"
);

$node_master->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('ab_phys_failover', false, true);"
);

$node_master->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('ab_phys');"
);

my $ab_phys_beforeconsume_si = get_slot_info($node_master, 'ab_phys_failover');
diag_slotinfo $ab_phys_beforeconsume_si, 'ab_phys_beforeconsume';

# We can also create physical slots on replicas, so long as they aren't
# failover slots
$node_replica->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('onreplica');"
);

($ret, $stdout, $stderr) = $node_replica->psql('postgres',
	"SELECT pg_create_physical_replication_slot('onreplica', false, true);"
);
is($ret, 3, "creating a failover slot on a replica fails");
like($stderr, qr/a failover slot may not be created on a replica/, "got expected error creating failover slot on replica");

$node_master->safe_psql('postgres',
	"INSERT INTO decoding(blah) VALUES ('afterbb');");

wait_for_catchup($node_master, $node_replica);

# Can't replay from a failover slot on a replica
($proc, $stdout, $stderr, $outdir) = start_pg_receivexlog($node_replica, 'bb_phys_failover');
$proc->finish;
is($proc->result, 1, 'pg_receivexlog exited with error code when attempting replay from failover slot on replica');
is($$stdout, '', 'no stdout');
like($$stderr, qr/ERROR:.*replication slot "bb_phys_failover" is reserved for use after failover/, 'pg_receivexlog exited with expected error');

$stdout = $node_master->safe_psql('postgres', 'SELECT slot_name FROM pg_replication_slots ORDER BY slot_name');
is($stdout, q(ab
ab_failover
ab_phys
ab_phys_failover
bb
bb_failover
bb_phys
bb_phys_failover), 'Expected slots exist on master')
	or BAIL_OUT('Remaining tests meaningless');


# Verify that only the failover slots and the physical slot we created
# directly on the replica are present there
$stdout = $node_replica->safe_psql('postgres', 'SELECT slot_name FROM pg_replication_slots ORDER BY slot_name');
is($stdout, q(ab_failover
ab_phys_failover
bb_failover
bb_phys_failover
onreplica), 'Expected slots exist on replica')
	or BAIL_OUT('Remaining tests meaningless');

# Make sure we can replay from the physical failover slot on the master
my $master_beforereplay_bbphys_si = get_slot_info($node_master, 'bb_phys_failover');
is($master_beforereplay_bbphys_si->{restart_lsn}, '',
	'restart_lsn on slot bb_phys_failover is empty before replay');
test_phys_replay($node_master, 'bb_phys_failover', 1);
my $master_afterreplay_bbphys_si = get_slot_info($node_master, 'bb_phys_failover');

cmp_ok(lsn_to_bigint($master_afterreplay_bbphys_si->{restart_lsn}),
	">",
	0,
	"bb_phys_failover restart_lsn advanced after replay");

$node_master->stop('fast');

my $log = TestLib::slurp_file($node_master->logfile);
unlike($log, qr/PANIC:/, 'No PANIC in master logs');

my @slot_updates = @{ read_slot_updates_from_xlog($node_master, 1) };
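# When the positional assertions below fail, it helps to see everything the
# WAL parser found. A small debugging aid (a sketch; diag only, so the test
# plan is unaffected):
diag "parsed slot updates: " . Dumper(\@slot_updates) if $verbose;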
#
# Decode the WAL from the master and make sure the expected entries, and
# only the expected entries, are present.
#
# We want to see two WAL entries, one for each slot. There won't be another
# entry for the slot advance because right now we don't write out WAL when a
# slot's confirmed location advances, only when the flush location or xmin
# advances. The restart lsn and confirmed flush LSN in the slot's WAL record
# must be no less than the LSN of the master before we created the slot and
# no greater than the position we saw in pg_replication_slots after slot
# creation.
#

# bb_failover created
check_slot_wal_update($slot_updates[0], 'bb_failover',
	expect_min_restart_lsn => $master_beforecreate_bb_lsn,
	expect_min_confirm_lsn => $master_beforecreate_bb_lsn,
	expect_max_restart_lsn => $bb_beforeconsume_si->{restart_lsn},
	expect_max_confirm_lsn => $bb_beforeconsume_si->{confirmed_flush_lsn});

# bb_phys_failover created
check_slot_wal_update($slot_updates[1], 'bb_phys_failover',
	expect_min_restart_lsn => '0/0',
	expect_min_confirm_lsn => '0/0',
	expect_max_restart_lsn => '0/0',
	expect_max_confirm_lsn => '0/0');

# bb_failover updated after replay. This only happens because we force a
# checkpoint to flush the dirtied but not yet written-out slot.
check_slot_wal_update($slot_updates[2], 'bb_failover',
	expect_min_restart_lsn => $master_beforecreate_bb_lsn,
	expect_min_confirm_lsn => $master_beforecreate_bb_lsn,
	expect_max_restart_lsn => $bb_afterconsume_si->{restart_lsn},
	expect_max_confirm_lsn => $bb_afterconsume_si->{confirmed_flush_lsn});

# Creation of ab_failover
check_slot_wal_update($slot_updates[3], 'ab_failover',
	expect_min_restart_lsn => $master_beforecreate_ab_lsn,
	expect_min_confirm_lsn => $master_beforecreate_ab_lsn,
	expect_max_restart_lsn => $ab_beforeconsume_si->{restart_lsn},
	expect_max_confirm_lsn => $ab_beforeconsume_si->{confirmed_flush_lsn});

# Creation of ab_phys_failover
check_slot_wal_update($slot_updates[4], 'ab_phys_failover',
	expect_min_restart_lsn => '0/0',
	expect_min_confirm_lsn => '0/0',
	expect_max_restart_lsn => '0/0',
	expect_max_confirm_lsn => '0/0');

# bb_phys_failover updated after we replayed from it on the master
check_slot_wal_update($slot_updates[5], 'bb_phys_failover',
	expect_min_restart_lsn => $master_afterreplay_bbphys_si->{restart_lsn},
	expect_min_confirm_lsn => '0/0',
	expect_max_restart_lsn => $master_afterreplay_bbphys_si->{restart_lsn},
	expect_max_confirm_lsn => '0/0');

# Consuming from a slot does not cause the slot to be written out, even on
# CHECKPOINT. Since nothing else would have dirtied the slot, there should
# be no more WAL entries for failover slots.
#
# The client is expected to keep track of the confirmed LSN and skip
# replaying data it's already seen.
ok(!defined($slot_updates[6]), 'No more slot updates');
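# The positional indexing above ($slot_updates[0] .. $slot_updates[5])
# assumes a strict ordering of slot records in WAL. If that ever becomes
# fragile, updates could be selected by slot name instead; a sketch (not
# used by the tests here):
sub updates_for_slot
{
	my ($updates, $slot_name) = @_;
	return grep { $_->{slot_name} eq $slot_name } @$updates;
}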
# Promote the replica and make sure the failover slots survive, then check
# that we can replay from them on the promoted replica.
$node_replica->promote;

wait_for_end_of_recovery($node_replica);

$node_replica->safe_psql('postgres',
	"INSERT INTO decoding(blah) VALUES ('after failover');");

my $bb_afterpromote_si = get_slot_info($node_replica, 'bb_failover');
diag_slotinfo $bb_afterpromote_si, 'bb_afterpromote';
# Because we forced a checkpoint to flush the slot to disk after replaying
# from bb_failover, it should have the new confirmed flush point on the
# replica.
is($bb_afterpromote_si->{confirmed_flush_lsn}, $bb_afterconsume_si->{confirmed_flush_lsn},
	'slot bb_failover confirmed pos on replica matches master');
# We haven't replayed much, so the restart position probably didn't change,
# but it should be wherever it was after we replayed on the master anyway.
is($bb_afterpromote_si->{restart_lsn}, $bb_afterconsume_si->{restart_lsn},
	'slot bb_failover restart pos on replica matches master');

# We never replayed from the after-basebackup slot on the master, so it
# should be right where it was created.
my $ab_afterpromote_si = get_slot_info($node_replica, 'ab_failover');
diag_slotinfo $ab_afterpromote_si, 'ab_afterpromote';
is($ab_afterpromote_si->{confirmed_flush_lsn}, $ab_beforeconsume_si->{confirmed_flush_lsn},
	'slot ab_failover confirmed pos is unchanged');
is($ab_afterpromote_si->{restart_lsn}, $ab_beforeconsume_si->{restart_lsn},
	'slot ab_failover restart pos is unchanged');

# Can replay from slot ab_failover, following the timeline switch
test_read_from_slot($node_replica, 'ab_failover', q(BEGIN
table public.decoding: INSERT: blah[text]:'afterbb'
COMMIT
BEGIN
table public.decoding: INSERT: blah[text]:'after failover'
COMMIT));

# Can replay from slot bb_failover too, and we only see data after what we
# already replayed on the master.
#
# Note that if we didn't force a checkpoint on the master and then did an
# unclean shutdown, we would expect to see data here that we had already
# replayed on the master. The confirm lsn wouldn't be flushed on the master
# and would therefore effectively go backwards on failover.
#
# See http://www.postgresql.org/message-id/CAMsr+YGSaTRGqPcx9qx4eOcizWsa27XjKEiPSOtAJE8OfiXT-g@mail.gmail.com
#
test_read_from_slot($node_replica, 'bb_failover', q(BEGIN
table public.decoding: INSERT: blah[text]:'beforebb'
COMMIT
BEGIN
table public.decoding: INSERT: blah[text]:'afterbb'
COMMIT
BEGIN
table public.decoding: INSERT: blah[text]:'after failover'
COMMIT));

# Can replay from the physical failover slot on the promoted replica
test_phys_replay($node_replica, 'bb_phys_failover', 2);

$node_replica->stop('fast');

$log = TestLib::slurp_file($node_replica->logfile);
unlike($log, qr/PANIC:/, 'No PANIC in replica logs');

# We don't need the standby anymore
$node_replica->teardown_node();


# Now make sure slot drops work correctly and replay correctly, by restoring
# a fresh copy of the standby from backup and having it replay the slot
# drops. We'll also test dropping a physical slot that's currently in use.
$node_master->start;

# restore the replica again
$node_replica = get_new_node('replica2');
$node_replica->init_from_backup(
	$node_master, $backup_name,
	has_streaming => 1,
	has_restoring => 1);
$node_replica->start;
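# Hypothetical helper (a sketch, not used by the tests below): poll until a
# slot's failover flag reaches the expected value on a node, as a narrower
# alternative to waiting for full catchup.
sub wait_for_slot_failover_flag
{
	my ($node, $slot_name, $expected) = @_;
	$node->poll_query_until('postgres',
		"SELECT failover = '$expected' FROM pg_replication_slots WHERE slot_name = '$slot_name';");
}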
# Start pg_receivexlog from a local slot on the replica, then create a
# failover slot with the same name on the master. pg_receivexlog will be
# automatically killed when we drop the slot it's replaying from and
# replace it with a failover slot.
$node_replica->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('replace_me', false, false);");

my $si = get_slot_info($node_replica, 'replace_me');
diag_slotinfo($si, 'replace_me');
is($si->{failover}, 'f', 'slot replace_me created as non-failover');

($proc, $stdout, $stderr, $outdir) = start_pg_receivexlog($node_replica, 'replace_me');

$node_master->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('replace_me', false, true);");

wait_for_catchup($node_master, $node_replica);

# pg_receivexlog should've died
$proc->finish;
is($proc->result, 1, 'pg_receivexlog exited with error code after its slot was dropped');
is($$stdout, '', 'no stdout');
like($$stderr, qr/by administrative command/, 'pg_receivexlog exited with admin command');

# The slot is now a failover slot
$si = get_slot_info($node_replica, 'replace_me');
is($si->{failover}, 't', 'failover slot successfully replaces local slot');

# OK, make sure slot drops replay correctly

$node_master->safe_psql("postgres", "SELECT pg_drop_replication_slot('bb_failover');");
$node_master->safe_psql("postgres", "SELECT pg_drop_replication_slot('ab_failover');");
$node_master->safe_psql("postgres", "SELECT pg_drop_replication_slot('bb_phys_failover');");
$node_master->safe_psql("postgres", "SELECT pg_drop_replication_slot('ab_phys_failover');");
$node_master->safe_psql("postgres", "SELECT pg_drop_replication_slot('replace_me');");

wait_for_catchup($node_master, $node_replica);


$stdout = $node_replica->safe_psql('postgres', 'SELECT slot_name FROM pg_replication_slots ORDER BY slot_name');
is($stdout, '', 'No slots exist on replica')
	or BAIL_OUT('Remaining tests meaningless');
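# The next section needs a workload big enough to advance restart_lsn and
# force a checkpoint. If you ever need to confirm how much WAL a workload
# really generated, a sketch along these lines (hypothetical; nothing below
# asserts on it) reports the byte distance between two LSNs:
#
#   my $before = $node_master->safe_psql('postgres',
#       'SELECT pg_current_xlog_insert_location()');
#   # ... run the workload ...
#   my $after = $node_master->safe_psql('postgres',
#       'SELECT pg_current_xlog_insert_location()');
#   diag "WAL bytes generated: "
#       . (lsn_to_bigint($after) - lsn_to_bigint($before));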
# OK, now we need to test replay of a big enough chunk of data to advance
# the restart_lsn and make the master do a checkpoint.
#
# We create two copies of the slot so we can advance one of them and get the
# changes checkpointed out, while leaving the other unchanged for replay
# after failover. This lets us test two things in one: checkpointing of
# failover slots, and failover with big chunks of data.

$node_master->safe_psql('postgres',
	"SELECT pg_create_logical_replication_slot('big', 'test_decoding', true); SELECT pg_create_logical_replication_slot('big_adv', 'test_decoding', true);"
);

$node_master->safe_psql('postgres',
	"CREATE TABLE big_inserts (id serial primary key, padding text);"
);

$node_master->safe_psql('postgres',
	"INSERT INTO big_inserts(padding) SELECT repeat('x', n % 100) FROM generate_series(1, 1000000) n;"
);

($ret, $stdout, $stderr) = $node_master->psql('postgres',
	"SELECT data FROM pg_logical_slot_get_changes('big_adv', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');");
is($ret, 0, 'replaying from slot big_adv on master is successful');
my $data_replayed_from_master = $stdout;
is($stderr, '', 'replay from slot big_adv produces no stderr');

wait_for_catchup($node_master, $node_replica);
$node_master->stop('fast');
$node_replica->promote;
wait_for_end_of_recovery($node_replica);

($ret, $stdout, $stderr) = $node_replica->psql('postgres',
	"SELECT data FROM pg_logical_slot_peek_changes('big', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');");
is($ret, 0, 'replaying from slot big on replica is successful');
is($stdout, $data_replayed_from_master, 'Got same data from replica as master');
is($stderr, '', 'replay from slot big produces no stderr');

$node_replica->stop('fast');

# Make sure there's no crash complaint in the master or replica logs
$log = TestLib::slurp_file($node_master->logfile);
unlike($log, qr/PANIC:/, 'No PANIC in master logs');

$log = TestLib::slurp_file($node_replica->logfile);
unlike($log, qr/PANIC:/, 'No PANIC in replica logs');

$node_master->teardown_node;
$node_replica->teardown_node;

done_testing();
-- 
2.1.0