From a3be65c9bea08d3c7ffd575342a879d37b28b9a8 Mon Sep 17 00:00:00 2001 From: Euler Taveira Date: Thu, 11 Jul 2024 19:59:56 -0300 Subject: [PATCH] pg_createsubscriber: fix slow recovery If the primary server is idle when you are running pg_createsubscriber, it used to take some time during recovery. The reason is that it was using the LSN returned by pg_create_logical_replication_slot as recovery_target_lsn. This LSN points to "end + 1" record that might not be available at WAL, hence, the recovery routine waits until some activity from auxiliary processes write WAL records and once it reaches the recovery_target_lsn position. Instead, use restart_lsn from the last replication slot. It points to the last WAL record that was replayed. Hence, the recovery finishes soon. create_logical_replication_slot() does not return LSN so change its signature. Discussion: https://www.postgresql.org/message-id/2377319.1719766794%40sss.pgh.pa.us Discussion: https://www.postgresql.org/message-id/68de6498-0449-a113-dd03-e198dded0bac%40gmail.com --- src/bin/pg_basebackup/pg_createsubscriber.c | 56 ++++++++++++++++----- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index 21dd50f8089..5ef3f751a5b 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -86,8 +86,8 @@ static void setup_recovery(const struct LogicalRepInfo *dbinfo, const char *data static void drop_primary_replication_slot(struct LogicalRepInfo *dbinfo, const char *slotname); static void drop_failover_replication_slots(struct LogicalRepInfo *dbinfo); -static char *create_logical_replication_slot(PGconn *conn, - struct LogicalRepInfo *dbinfo); +static bool create_logical_replication_slot(PGconn *conn, + struct LogicalRepInfo *dbinfo); static void drop_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo, const char *slot_name); static void pg_ctl_status(const char *pg_ctl_cmd, int rc); @@ -769,15 +769,47 @@ setup_publisher(struct LogicalRepInfo *dbinfo) create_publication(conn, &dbinfo[i]); /* Create replication slot on publisher */ - if (lsn) - pg_free(lsn); - lsn = create_logical_replication_slot(conn, &dbinfo[i]); - if (lsn != NULL || dry_run) + if (create_logical_replication_slot(conn, &dbinfo[i]) || dry_run) pg_log_info("create replication slot \"%s\" on publisher", dbinfo[i].replslotname); else exit(1); + /* + * Get restart_lsn from the last replication slot. It points to the + * last record replayed. The LSN returned by + * pg_create_logical_replication_slot() should not be used because it + * points to a future position. In case the server is idle while + * running this tool, the recovery will not finish soon because the + * future position was not reached. Hence, obtain the restart_lsn that + * points to a LSN already available at WAL. The recovery parameters + * guarantee that this last replication slot will be applied. + */ + if ((i == num_dbs - 1) && !dry_run) + { + PQExpBuffer str = createPQExpBuffer(); + PGresult *res; + + appendPQExpBuffer(str, + "SELECT restart_lsn FROM pg_catalog.pg_replication_slots " + "WHERE slot_name = '%s'", + dbinfo[i].replslotname); + res = PQexec(conn, str->data); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + pg_log_error("could not obtain consistent LSN: %s", + PQresultErrorMessage(res)); + disconnect_database(conn, true); + } + else + { + lsn = pg_strdup(PQgetvalue(res, 0, 0)); + } + + PQclear(res); + destroyPQExpBuffer(str); + } + disconnect_database(conn, false); } @@ -1283,19 +1315,18 @@ drop_failover_replication_slots(struct LogicalRepInfo *dbinfo) } /* - * Create a logical replication slot and returns a LSN. + * Create a logical replication slot. * * CreateReplicationSlot() is not used because it does not provide the one-row * result set that contains the LSN. */ -static char * +static bool create_logical_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo) { PQExpBuffer str = createPQExpBuffer(); PGresult *res = NULL; const char *slot_name = dbinfo->replslotname; char *slot_name_esc; - char *lsn = NULL; Assert(conn != NULL); @@ -1305,7 +1336,7 @@ create_logical_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo) slot_name_esc = PQescapeLiteral(conn, slot_name, strlen(slot_name)); appendPQExpBuffer(str, - "SELECT lsn FROM pg_catalog.pg_create_logical_replication_slot(%s, 'pgoutput', false, false, false)", + "SELECT * FROM pg_catalog.pg_create_logical_replication_slot(%s, 'pgoutput', false, false, false)", slot_name_esc); pg_free(slot_name_esc); @@ -1322,10 +1353,9 @@ create_logical_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo) PQresultErrorMessage(res)); PQclear(res); destroyPQExpBuffer(str); - return NULL; + return false; } - lsn = pg_strdup(PQgetvalue(res, 0, 0)); PQclear(res); } @@ -1334,7 +1364,7 @@ create_logical_replication_slot(PGconn *conn, struct LogicalRepInfo *dbinfo) destroyPQExpBuffer(str); - return lsn; + return true; } static void -- 2.30.2