From f14cf88958858c637f18c11c4221dfed0a04ccba Mon Sep 17 00:00:00 2001 From: Bernd Helmle Date: Thu, 10 Jul 2025 18:35:51 +0200 Subject: [PATCH 3/4] Rework handling of procsignalbarrier and local cache for data_checksion_version. --- src/backend/access/transam/xlog.c | 45 +++++++++---------------- src/backend/postmaster/auxprocess.c | 19 +++++++++++ src/backend/postmaster/launch_backend.c | 10 ------ src/backend/utils/init/postinit.c | 17 ++++++++-- src/include/access/xlog.h | 2 +- src/include/miscadmin.h | 1 - 6 files changed, 51 insertions(+), 43 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 53890c3e726..097db70fcfc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -672,12 +672,15 @@ static bool updateMinRecoveryPoint = true; static uint32 LocalDataChecksumVersion = 0; /* - * Flag to remember if the procsignalbarrier being absorbed for enabling - * checksums is the first one or not. The first procsignalbarrier can in rare - * circumstances cause a transition from 'on' to 'on' when a new process is + * Flag to remember if the procsignalbarrier being absorbed for checksums + * is the first one. The first procsignalbarrier can in rare cases be for + * the state we've initialized, i.e. a duplicate. This may happen for any + * data_checksum_version value, but for PG_DATA_CHECKSUM_ON_VERSION this + * would trigger an assert failure (this is the only transition with an + * assert) when processing the barrier. This may happen if the process is * spawned between the update of XLogCtl->data_checksum_version and the - * barrier being emitted. This can only happen on the very first barrier so - * mark that with this flag. + * barrier being emitted. This can only happen on the very first barrier + * so mark that with this flag. */ static bool InitialDataChecksumTransition = true; @@ -5052,6 +5055,7 @@ SetDataChecksumsOff(void) bool AbsorbChecksumsOnInProgressBarrier(void) { + /* XXX can't we check we're in OFF or INPROGRESSS_ON? */ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); return true; } @@ -5064,22 +5068,19 @@ AbsorbChecksumsOnBarrier(void) * barrier it will have seen the updated value, so for the first barrier * we accept both "on" and "inprogress-on". */ - if (InitialDataChecksumTransition) - { - Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) || - (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION)); - InitialDataChecksumTransition = false; - } - else - Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) || + (InitialDataChecksumTransition && + (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION))); SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_VERSION); + InitialDataChecksumTransition = false; return true; } bool AbsorbChecksumsOffInProgressBarrier(void) { + /* XXX can't we check we're in ON or INPROGRESSS_OFF? */ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); return true; } @@ -5087,6 +5088,7 @@ AbsorbChecksumsOffInProgressBarrier(void) bool AbsorbChecksumsOffBarrier(void) { + /* XXX can't we check we're in INPROGRESSS_OFF? */ SetLocalDataChecksumVersion(0); return true; } @@ -5100,7 +5102,7 @@ AbsorbChecksumsOffBarrier(void) * purpose enough to handle future cases. */ void -InitLocalControldata(void) +InitLocalDataChecksumVersion(void) { SpinLockAcquire(&XLogCtl->info_lck); SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); @@ -5138,21 +5140,6 @@ SetLocalDataChecksumVersion(uint32 data_checksum_version) } } -/* - * Initialize the various data checksum values - GUC, local, .... - */ -void -InitLocalDataChecksumVersion(void) -{ - uint32 data_checksum_version; - - SpinLockAcquire(&XLogCtl->info_lck); - data_checksum_version = XLogCtl->data_checksum_version; - SpinLockRelease(&XLogCtl->info_lck); - - SetLocalDataChecksumVersion(data_checksum_version); -} - /* * Get the local data_checksum_version (cached XLogCtl value). */ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index a6d3630398f..4b56ef0eb81 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -15,6 +15,7 @@ #include #include +#include "access/xlog.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/auxprocess.h" @@ -68,6 +69,24 @@ AuxiliaryProcessMainCommon(void) ProcSignalInit(NULL, 0); + /* + * Initialize a local cache of the data_checksum_version, to be updated + * by the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumVersion(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 4a86d2588ff..955df32be5d 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -287,16 +287,6 @@ postmaster_child_launch(BackendType child_type, int child_slot, memcpy(MyClientSocket, client_sock, sizeof(ClientSocket)); } - /* - * update the LocalProcessControlFile to match XLogCtl->data_checksum_version - * - * XXX It seems the postmaster (which is what gets forked into the new - * child process) does not absorb the checksum barriers, therefore it - * does not update the value (except after a restart). Not sure if there - * is some sort of race condition. - */ - InitLocalDataChecksumVersion(); - /* * Run the appropriate Main function */ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 785b8d4b04f..7418deb10f5 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -752,9 +752,22 @@ InitPostgres(const char *in_dbname, Oid dboid, ProcSignalInit(MyCancelKey, MyCancelKeyLength); /* - * Set up backend local cache of Controldata values. + * Initialize a local cache of the data_checksum_version, to be updated + * by the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. */ - InitLocalControldata(); + InitLocalDataChecksumVersion(); /* * Also set up timeout handlers needed for backend operation. We need diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index aec3ea0bc63..615b2cf4ec8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -243,7 +243,7 @@ extern bool AbsorbChecksumsOffInProgressBarrier(void); extern bool AbsorbChecksumsOnBarrier(void); extern bool AbsorbChecksumsOffBarrier(void); extern const char *show_data_checksums(void); -extern void InitLocalControldata(void); +extern void InitLocalDataChecksumVersion(void); extern bool GetDefaultCharSignedness(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index c86294b4d19..5ce7a1cbc65 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -544,7 +544,6 @@ extern void RestoreClientConnectionInfo(char *conninfo); extern uint32 GetLocalDataChecksumVersion(void); extern uint32 GetCurrentDataChecksumVersion(void); -extern void InitLocalDataChecksumVersion(void); /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); -- 2.50.0