Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size - Mailing list pgsql-bugs
| From | Kyotaro Horiguchi |
|---|---|
| Subject | Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size |
| Date | |
| Msg-id | 20210715.163348.2298778826242193015.horikyota.ntt@gmail.com Whole thread Raw |
| In response to | Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size (Kyotaro Horiguchi <horikyota.ntt@gmail.com>) |
| Responses |
Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size
Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size Re: BUG #17103: WAL segments are not removed after exceeding max_slot_wal_keep_size |
| List | pgsql-bugs |
At Thu, 15 Jul 2021 14:22:35 +0900 (JST), Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote in
> At Wed, 14 Jul 2021 19:10:26 -0400, Jeff Janes <jeff.janes@gmail.com> wrote in
> > They should be, but they are not. That is the bug. They just hang
> > around, checkpoint after checkpoint. Some of them do get cleaned up, to
> > make up for new ones created during that cycle. It treats
> > max_slot_wal_keep the same way it treats wal_keep_size (but only if a
> > "lost" slot is hanging around). If you drop the lost slot, only then does
> > it remove all the accumulated WAL at the next checkpoint.
>
> Thanks! I saw the issue here. Some investigation showd me a doubious
> motion of XLogCtl->repliationSlotMinLSN. Slot invalidation is
> forgetting to recalculate it and that misbehavior retreats the segment
> horizon.
>
> So the attached worked for me. I'll repost the polished version
> including test.
This is it. It is for the master branch but also applicable to 14 as
is. Not needed for earlier version.
I believe the test works for Windows but haven't checked.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
From 91705833be615690388e41d176c16b2c294bb0cf Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Thu, 15 Jul 2021 15:52:48 +0900
Subject: [PATCH v1] Advance old-segment horizon properly after slot
invalidation
When some slots are invalidated due to the limit of
max_slot_wal_keep_size, the old segment horizon should advance to
within the limit.
Previously that advancement didn't happen. Hence especially if no
valid slots remain after a slot-invalidation, the horizon no longer
advances and the live WAL files no longer be less than
max_slot_wal_keep_size thereafter.
Backpatch to 14 where the feature was introduced.
---
src/backend/access/transam/xlog.c | 24 +++++++++++++++++++++--
src/backend/replication/slot.c | 20 ++++++++++++++++---
src/include/replication/slot.h | 2 +-
src/test/recovery/t/019_replslot_limit.pl | 11 ++++++++++-
4 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c7c928f50b..6500ee5b11 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9300,7 +9300,17 @@ CreateCheckPoint(int flags)
*/
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(recptr, &_logSegNo);
- InvalidateObsoleteReplicationSlots(_logSegNo);
+ if (InvalidateObsoleteReplicationSlots(_logSegNo))
+ {
+ /*
+ * Some slots have been gone, recalculate the old-segment horizon
+ * excluding the invalidated slots.
+ */
+ ReplicationSlotsComputeRequiredLSN();
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(recptr, &_logSegNo);
+ }
+
_logSegNo--;
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
@@ -9640,7 +9650,17 @@ CreateRestartPoint(int flags)
replayPtr = GetXLogReplayRecPtr(&replayTLI);
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
KeepLogSeg(endptr, &_logSegNo);
- InvalidateObsoleteReplicationSlots(_logSegNo);
+ if (InvalidateObsoleteReplicationSlots(_logSegNo))
+ {
+ /*
+ * Some slots have been gone, recalculate the old-segment horizon
+ * excluding the invalidated slots.
+ */
+ ReplicationSlotsComputeRequiredLSN();
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(endptr, &_logSegNo);
+ }
+
_logSegNo--;
/*
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 33b85d86cc..b8eddcab53 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1143,11 +1143,14 @@ ReplicationSlotReserveWal(void)
* Returns whether ReplicationSlotControlLock was released in the interim (and
* in that case we're not holding the lock at return, otherwise we are).
*
+ * Sets *invalidated if the slot was invalidated but never unsets otherwise.
+ *
* This is inherently racy, because we release the LWLock
* for syscalls, so caller must restart if we return true.
*/
static bool
-InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
+InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
+ bool *invalidated)
{
int last_signaled_pid = 0;
bool released_lock = false;
@@ -1193,6 +1196,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
slotname = s->data.name;
active_pid = s->active_pid;
+ /*
+ * Inform the caller that this slot have got invalidated. We could just
+ * assign true here but make it clear what we are intending here.
+ */
+ *invalidated |= true;
+
/*
* If the slot can be acquired, do so and mark it invalidated
* immediately. Otherwise we'll signal the owning process, below, and
@@ -1291,12 +1300,15 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
* Mark any slot that points to an LSN older than the given segment
* as invalid; it requires WAL that's about to be removed.
*
+ * Returns true when any slot have got invalidated.
+ *
* NB - this runs as part of checkpoint, so avoid raising errors if possible.
*/
-void
+bool
InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
{
XLogRecPtr oldestLSN;
+ bool invalidated = false;
XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
@@ -1309,13 +1321,15 @@ restart:
if (!s->in_use)
continue;
- if (InvalidatePossiblyObsoleteSlot(s, oldestLSN))
+ if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
{
/* if the lock was released, start from scratch */
goto restart;
}
}
LWLockRelease(ReplicationSlotControlLock);
+
+ return invalidated;
}
/*
diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h
index 34d95eac8e..e32fb85db8 100644
--- a/src/include/replication/slot.h
+++ b/src/include/replication/slot.h
@@ -213,7 +213,7 @@ extern void ReplicationSlotsComputeRequiredLSN(void);
extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
extern void ReplicationSlotsDropDBSlots(Oid dboid);
-extern void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
+extern bool InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
extern ReplicationSlot *SearchNamedReplicationSlot(const char *name, bool need_lock);
extern void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot);
extern void ReplicationSlotDropAtPubNode(WalReceiverConn *wrconn, char *slotname, bool missing_ok);
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index d4b9ff705f..b76ed0fdac 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -11,7 +11,7 @@ use TestLib;
use PostgresNode;
use File::Path qw(rmtree);
-use Test::More tests => $TestLib::windows_os ? 14 : 18;
+use Test::More tests => $TestLib::windows_os ? 15 : 19;
use Time::HiRes qw(usleep);
$ENV{PGDATABASE} = 'postgres';
@@ -192,6 +192,15 @@ $result = $node_primary->safe_psql('postgres',
is($result, "rep1|f|t|lost|",
'check that the slot became inactive and the state "lost" persists');
+# The invalidated slot shouldn't retreat the old-segment horizon
+my $redoseg = $node_primary->safe_psql('postgres',
+ "select pg_walfile_name(lsn) from pg_create_physical_replication_slot('xx', true)");
+my $oldestseg = $node_primary->safe_psql('postgres',
+ "select pg_ls_dir from pg_ls_dir('pg_wal') where pg_ls_dir like '0%' order by pg_ls_dir limit 1");
+
+is($oldestseg, $redoseg,
+ "check if slot-invalidation advances the old segment horizon");
+
# The standby no longer can connect to the primary
$logstart = get_log_size($node_standby);
$node_standby->start;
--
2.27.0
pgsql-bugs by date: