From 10101239221c7d92ec6a20fd9e79d4cc09cd2299 Mon Sep 17 00:00:00 2001 From: Shveta Malik Date: Fri, 19 Jan 2024 11:04:16 +0530 Subject: [PATCH v3] Document the steps to check if the standby is ready for failover This patch adds detailed documentation for the slot sync feature including examples to guide users on how to verify that all slots have been successfully synchronized to the standby server and how to confirm whether the subscription can continue subscribing to publications on the promoted standby server. --- doc/src/sgml/high-availability.sgml | 9 ++ doc/src/sgml/logical-replication.sgml | 137 ++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index b48209fc2f..ae405e029e 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -1487,6 +1487,15 @@ synchronous_standby_names = 'ANY 2 (s1, s2, s3)' Written administration procedures are advised. + + If you have opted for synchronization of logical slots (see + <xref linkend="logicaldecoding-replication-slots-synchronization"/>), + then before switching to the standby server, it is recommended to check + if the logical slots synchronized on the standby server are ready + for failover. This can be done by following the steps described in + <xref linkend="logical-replication-failover"/>. + + To trigger failover of a log-shipping standby server, run pg_ctl promote or call pg_promote(). diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index ec2130669e..947bd652ea 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -687,6 +687,143 @@ ALTER SUBSCRIPTION + + Logical Replication Failover + + + When the publisher server is the primary server in a streaming replication setup, + the logical slots on that primary server can be synchronized to the standby + server by specifying failover = true when creating + subscriptions for those publications. 
Enabling failover ensures a seamless + transition of those subscriptions after the standby is promoted. They can + continue subscribing to publications now on the new primary server without + losing any data that has been flushed to the new primary server. + + + + Because the slot synchronization logic copies asynchronously, it is + necessary to confirm that replication slots have been synced to the standby + server before the failover happens. Furthermore, to ensure a successful + failover, the standby server must not be lagging behind the subscriber. It + is highly recommended to use + standby_slot_names + to prevent the subscriber from consuming changes faster than the hot standby. + To confirm that the standby server is indeed ready for failover, follow + these 2 steps: + + + + + + Confirm that all the necessary logical replication slots have been synced to + the standby server. + + + + + Firstly, on the subscriber node, use the following SQL to identify + which slots should be synced to the standby that we plan to promote. + +test_sub=# SELECT + array_agg(slotname) AS slots + FROM + (( + SELECT r.srsubid AS subid, CONCAT('pg_', srsubid, '_sync_', srrelid, '_', ctl.system_identifier) AS slotname + FROM pg_control_system() ctl, pg_subscription_rel r, pg_subscription s + WHERE r.srsubstate = 'f' AND s.oid = r.srsubid AND s.subfailover + ) UNION ( + SELECT s.oid AS subid, s.subslotname as slotname + FROM pg_subscription s + WHERE s.subfailover + )); + slots +------- + {sub1,sub2,sub3} +(1 row) + + + + + Next, check that the logical replication slots identified above exist on + the standby server and are ready for failover. 
+ +test_standby=# SELECT slot_name, (synced AND NOT temporary AND NOT conflicting) AS failover_ready + FROM pg_replication_slots + WHERE slot_name IN ('sub1','sub2','sub3'); + slot_name | failover_ready +-------------+---------------- + sub1 | t + sub2 | t + sub3 | t +(3 rows) + + + + + + + + Confirm that the standby server is not lagging behind the subscribers. + This step can be skipped if + standby_slot_names + has been correctly configured. If standby_slot_names is not configured + correctly, it is highly recommended to run this step after the primary + server is down; otherwise the results of the query may vary at different + points in time due to the ongoing replication on the logical subscribers + from the primary server. + + + + + Firstly, on the subscriber node, check the last replayed WAL. + This step needs to be run on the database(s) that include the + failover-enabled subscription(s), to find the last replayed WAL on each database. + +test_sub=# SELECT + MAX(remote_lsn) AS remote_lsn_on_subscriber + FROM + (( + SELECT (CASE WHEN r.srsubstate = 'f' THEN pg_replication_origin_progress(CONCAT('pg_', r.srsubid, '_', r.srrelid), false) + WHEN r.srsubstate IN ('s', 'r') THEN r.srsublsn END) AS remote_lsn + FROM pg_subscription_rel r, pg_subscription s + WHERE r.srsubstate IN ('f', 's', 'r') AND s.oid = r.srsubid AND s.subfailover + ) UNION ( + SELECT pg_replication_origin_progress(CONCAT('pg_', s.oid), false) AS remote_lsn + FROM pg_subscription s + WHERE s.subfailover + )); + remote_lsn_on_subscriber +-------------------------- + 0/3000388 + + + + + Next, on the standby server, check that the last-received WAL location + is at or ahead of the replayed WAL location(s) on the subscriber identified + above. If the above SQL result was NULL, it means the subscriber has not + yet replayed any WAL, so the standby server must be ahead of the + subscriber, and this step can be skipped. 
+ +test_standby=# SELECT pg_last_wal_receive_lsn() >= '0/3000388'::pg_lsn AS failover_ready; + failover_ready +---------------- + t +(1 row) + + + + + + + + If the result (failover_ready) of both of the above steps is + true, existing subscriptions will be able to continue without losing any data + that has been flushed to the new primary server. + + + + Row Filters -- 2.30.0.windows.2