From 3ccb06a1e39b456d26a5e2f89c9b634f04b34307 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 27 Jan 2021 17:20:44 +0200 Subject: [PATCH v34 1/1] Support checksum enable/disable in a running cluster v33 This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. A dynamic background worker is responsible for launching a per-database worker which will mark all buffers dirty for all relation with storage in order for them to have data checksums on write. A new in-progress state is introduced which during processing ensures that data checksums are written but not verified to avoid false negatives. State changes across backends are synchronized using a procsignalbarrier. Authors: Daniel Gustafsson, Magnus Hagander Reviewed-by: Heikki Linnakangas, Robert Haas, Andres Freund, Tomas Vondra, Michael Banck, Andrey Borodin Discussion: https://postgr.es/m/CABUevExz9hUUOLnJVr2kpw9Cx=o4MCr1SVKwbupzuxP7ckNutA@mail.gmail.com Discussion: https://postgr.es/m/20181030051643.elbxjww5jjgnjaxg@alap3.anarazel.de Discussion: https://postgr.es/m/CABUevEwE3urLtwxxqdgd5O2oQz9J717ZzMbh+ziCSa5YLLU_BA@mail.gmail.com --- doc/src/sgml/catalogs.sgml | 11 + doc/src/sgml/func.sgml | 68 + doc/src/sgml/monitoring.sgml | 6 +- doc/src/sgml/ref/pg_checksums.sgml | 6 + doc/src/sgml/wal.sgml | 57 +- src/backend/access/heap/heapam.c | 9 +- src/backend/access/rmgrdesc/xlogdesc.c | 18 + src/backend/access/transam/xlog.c | 452 +++++- src/backend/access/transam/xlogfuncs.c | 47 + src/backend/catalog/heap.c | 7 + src/backend/catalog/system_views.sql | 5 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/bgworker.c | 10 + src/backend/postmaster/datachecksumsworker.c | 1530 ++++++++++++++++++ src/backend/postmaster/pgstat.c | 6 + src/backend/replication/basebackup.c | 9 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 5 + src/backend/storage/ipc/ipci.c | 3 + 
src/backend/storage/ipc/procsignal.c | 33 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/page/README | 4 +- src/backend/storage/page/bufpage.c | 29 +- src/backend/utils/adt/pgstatfuncs.c | 6 - src/backend/utils/cache/relcache.c | 60 +- src/backend/utils/init/miscinit.c | 6 + src/backend/utils/init/postinit.c | 5 + src/backend/utils/misc/guc.c | 37 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_upgrade/controldata.c | 9 + src/bin/pg_upgrade/pg_upgrade.h | 2 +- src/include/access/xlog.h | 19 +- src/include/access/xlog_internal.h | 7 + src/include/catalog/pg_class.h | 3 + src/include/catalog/pg_control.h | 1 + src/include/catalog/pg_proc.dat | 16 + src/include/miscadmin.h | 2 + src/include/pgstat.h | 2 + src/include/postmaster/datachecksumsworker.h | 30 + src/include/storage/bufpage.h | 3 + src/include/storage/checksum.h | 8 + src/include/storage/procsignal.h | 10 +- src/test/Makefile | 2 +- src/test/checksum/.gitignore | 2 + src/test/checksum/Makefile | 23 + src/test/checksum/README | 22 + src/test/checksum/t/001_basic.pl | 92 ++ src/test/checksum/t/002_restarts.pl | 117 ++ src/test/checksum/t/003_standby_checksum.pl | 127 ++ src/test/checksum/t/004_offline.pl | 105 ++ src/test/perl/PostgresNode.pm | 36 + 51 files changed, 2985 insertions(+), 87 deletions(-) create mode 100644 src/backend/postmaster/datachecksumsworker.c create mode 100644 src/include/postmaster/datachecksumsworker.h create mode 100644 src/test/checksum/.gitignore create mode 100644 src/test/checksum/Makefile create mode 100644 src/test/checksum/README create mode 100644 src/test/checksum/t/001_basic.pl create mode 100644 src/test/checksum/t/002_restarts.pl create mode 100644 src/test/checksum/t/003_standby_checksum.pl create mode 100644 src/test/checksum/t/004_offline.pl diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 865e826fb0b..75cc1588a5c 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2166,6 +2166,17 @@ 
SCRAM-SHA-256$<iteration count>:&l + + + relhaschecksums bool + + + True if relation has data checksums on all pages. This state is only + used during checksum processing; this field should never be consulted + for cluster checksum status. + + + relrewrite oid diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index aa99665e2eb..94182fb7b1e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -25839,6 +25839,74 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + void + + + Initiates data checksums for the cluster. This will switch the data + checksums mode to inprogress-on as well as start a + background worker that will process all data in the database and enable + checksums for it. When all data pages have had checksums enabled, the + cluster will automatically switch data checksums mode to + on. + + + If cost_delay and cost_limit are + specified, the speed of the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + void + + + Disables data checksums for the cluster. This will switch the data + checksum mode to inprogress-off while data checksums + are being disabled. When all active backends have ceased to validate + data checksums, the data checksum mode will be changed to off. + + + + +
+ +
+ Database Object Management Functions diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 9496f76b1fb..7e170ec4299 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3695,8 +3695,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are not - enabled. + database.
@@ -3706,8 +3705,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are not - enabled. + this database (or on a shared object). diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index c84bc5c5b23..d879550e81c 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, every file in the cluster is rewritten in-place. diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 66de1ee2f81..48890ccc9d3 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -247,9 +247,10 @@ Checksums are normally enabled when the cluster is initialized using initdb. - They can also be enabled or disabled at a later time as an offline - operation. Data checksums are enabled or disabled at the full cluster - level, and cannot be specified individually for databases or tables. + They can also be enabled or disabled at a later time either as an offline + operation or online in a running cluster allowing concurrent access. Data + checksums are enabled or disabled at the full cluster level, and cannot be + specified individually for databases or tables. @@ -266,7 +267,7 @@ - Off-line Enabling of Checksums + Offline Enabling of Checksums The pg_checksums @@ -275,6 +276,54 @@ + + + Online Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. 
+ + + + Enabling checksums will put the cluster checksum mode in + inprogress-on mode. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum mode will automatically switch to on. The + processing will consume a background worker process, make sure that + max_worker_processes allows for at least one more + additional process. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in the application it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress-on mode, for + any reason, then this process must be restarted manually. To do this, + re-execute the function pg_enable_data_checksums() + once the cluster has been restarted. The background worker will attempt + to resume the work from where it was interrupted. + + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. + + + + diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9926e2bd546..ffcd8899082 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -7927,7 +7927,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, * and dirtied. 
* * If checksums are enabled, we also generate a full-page image of - * heap_buffer, if necessary. + * heap_buffer. */ XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, @@ -7948,11 +7948,18 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, XLogRegisterBuffer(0, vm_buffer, 0); flags = REGBUF_STANDARD; + /* + * Hold interrupts for the duration of xlogging to avoid the state of data + * checksums changing during the processing which would alter the premise + * for xlogging hint bits. + */ + HOLD_INTERRUPTS(); if (!XLogHintBitIsNeeded()) flags |= REGBUF_NO_IMAGE; XLogRegisterBuffer(1, heap_buffer, flags); recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); + RESUME_INTERRUPTS(); return recptr; } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 92cc7ea0735..fa074c6046f 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -140,6 +141,20 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, timestamptz_to_str(xlrec.end_time)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + appendStringInfo(buf, "on"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + appendStringInfo(buf, "inprogress-off"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + appendStringInfo(buf, "inprogress-on"); + else + appendStringInfo(buf, "off"); + } } const char * @@ -185,6 +200,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index cc007b8963e..8531def93c1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -38,6 +38,7 @@ #include "access/xlogreader.h" #include "access/xlogutils.h" #include "catalog/catversion.h" +#include "catalog/pg_class.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" #include "commands/progress.h" @@ -50,6 +51,7 @@ #include "port/atomics.h" #include "port/pg_iovec.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/startup.h" #include "postmaster/walwriter.h" #include "replication/basebackup.h" @@ -253,6 +255,16 @@ static bool LocalPromoteIsTriggered = false; */ static int LocalXLogInsertAllowed = -1; +/* + * Local state for Controlfile data_checksum_version. After initialization, + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating checksum state. Possible values are the + * checksum versions defined in storage/bufpage.h and zero for when checksums + * are disabled. + */ +static uint32 LocalDataChecksumVersion = 0; + /* * When ArchiveRecoveryRequested is set, archive recovery was requested, * ie. signal files were present. When InArchiveRecovery is set, we are @@ -900,6 +912,7 @@ static void SetLatestXTime(TimestampTz xtime); static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); +static void XLogChecksums(ChecksumType new_type); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI); static void LocalSetXLogInsertAllowed(void); @@ -1073,8 +1086,8 @@ XLogInsertRecord(XLogRecData *rdata, * and fast otherwise. 
* * Also check to see if fullPageWrites or forcePageWrites was just turned - * on; if we weren't already doing full-page writes then go back and - * recompute. + * on, or if we are in the process of enabling checksums in the cluster; + * if we weren't already doing full-page writes then go back and recompute. * * If we aren't doing full-page writes then RedoRecPtr doesn't actually * affect the contents of the XLOG record, so we'll update our local copy @@ -1087,7 +1100,7 @@ XLogInsertRecord(XLogRecData *rdata, Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; } - doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites || DataChecksumsOnInProgress()); if (doPageWrites && (!prevDoPageWrites || @@ -4915,9 +4928,7 @@ ReadControlFile(void) CalculateCheckpointSegments(); - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_OVERRIDE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; } /* @@ -4951,13 +4962,370 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? + * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Returns true iff data checksums are enabled or are in the process of being + * enabled. During "inprogress-on" and "inprogress-off" states checksums must + * be written even though they are not verified (see datachecksumsworker.c for + * a longer discussion). + * + * This function is intended for callsites which are about to write a data page + * to storage, and need to know whether to re-calculate the checksum for the + * page header. Interrupts must be held off during calling this and until the + * write operation has finished to avoid the risk of the checksum state + * changing. 
This implies that calling this function must be performed as close + * to write operation as possible to keep the critical section short. + */ +bool +DataChecksumsNeedWrite(void) +{ + Assert(InterruptHoldoffCount > 0); + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified (see datachecksumsworker.c for a longer discussion). + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. To avoid + * the risk of the checksum state changing between reading and performing the + * validation (or not), interrupts must be held off. This implies that calling + * this function must be performed as close to the validation call as possible + * to keep the critical section short. This is in order to protect against + * time of check/time of use situations around data checksum validation. + */ +bool +DataChecksumsNeedVerify(void) +{ + Assert(InterruptHoldoffCount > 0); + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); +} + +/* + * DataChecksumsOnInProgress + * Returns whether data checksums are being enabled + * + * Most operations don't need to worry about the "inprogress" states, and + * should use DataChecksumsNeedVerify() or DataChecksumsNeedWrite(). The + * "inprogress-on" state for enabling checksums is used when the checksum + * worker is setting checksums on all pages, it can thus be used to check for + * aborted checksum processing which needs to be restarted. 
+ */ +inline bool +DataChecksumsOnInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +/* + * DataChecksumsOffInProgress + * Returns whether data checksums are being disabled + * + * The "inprogress-off" state for disabling checksums is used for when the + * worker resets the catalog state. DataChecksumsNeedVerify() or + * DataChecksumsNeedWrite() should be used for deciding whether to read/write + * checksums. */ bool -DataChecksumsEnabled(void) +DataChecksumsOffInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * SetDataChecksumsOnInProgress + * Sets the data checksum state to "inprogress-on" to enable checksums + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". See + * SetDataChecksumsOn below for a description on how this state change works. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOnInProgress(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* + * Data checksum state can only be transitioned to "inprogress-on" from + * "off", if data checksums are in any other state then exit. + */ + if (ControlFile->data_checksum_version != 0) + { + LWLockRelease(ControlFileLock); + return; + } + + LWLockRelease(ControlFileLock); + + /* + * The state transition is performed in a critical section with + * checkpoints held off to provide crash safety. 
+ */ + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + /* + * Await state change in all backends to ensure that all backends are in + * "inprogress-on". Once done we know that all backends are writing data + * checksums. + */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOn + * Enables data checksums cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one to + * set the checksums state to "inprogress-on" (which is performed by + * SetDataChecksumsOnInProgress()) and the second one to set the state to "on" + * (performed here). + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". This state + * requires data checksums to be written but not verified. This ensures that + * all data pages can be checksummed without the risk of false negatives in + * validation during the process. When all existing pages are guaranteed to + * have checksums, and all new pages will be initiated with checksums, the + * state can be changed to "on". Once the state is "on" checksums will be both + * written and verified. See datachecksumsworker.c for a longer discussion on + * how data checksums can be enabled in a running cluster. + * + * This function blocks until all backends in the cluster have acknowledged the + * state transition. 
+ */ +void +SetDataChecksumsOn(void) { + uint64 barrier; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* + * The only allowed state transition to "on" is from "inprogress-on" since + * that state ensures that all pages will have data checksums written. + */ + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + LWLockRelease(ControlFileLock); + elog(ERROR, "checksums not in \"inprogress-on\" mode"); + } + + LWLockRelease(ControlFileLock); + + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + /* + * Await state transition of "on" in all backends. When done we know that + * data checksums are enabled in all backends and data checksums are both + * written and verified. + */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be emitted. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. 
+ */ +void +SetDataChecksumsOff(void) +{ + uint64 barrier; + + Assert(ControlFile); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* If data checksums are already disabled there is nothing to do */ + if (ControlFile->data_checksum_version == 0) + { + LWLockRelease(ControlFileLock); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * "inprogress-off" state during which backends continue to write + * checksums without verifying them. When all backends are in + * "inprogress-off" the next transition to "off" can be performed, after + * which all data checksum processing is disabled. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + LWLockRelease(ControlFileLock); + + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. Next, we can safely move to state "off" to also + * stop writing checksums. + */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * or "inprogress-off" and we can transition directly to "off" from + * there. + */ + LWLockRelease(ControlFileLock); + } + + /* + * Ensure that we don't incur a checkpoint during disabling checksums. 
+ */ + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XLogChecksums(0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = 0; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + WaitForProcSignalBarrier(barrier); +} + +/* + * ProcSignalBarrier absorption functions for enabling and disabling data + * checksums in a running cluster. The procsignalbarriers are emitted in the + * SetDataChecksums* functions. + */ +bool +AbsorbChecksumsOnInProgressBarrier(void) +{ + Assert(LocalDataChecksumVersion == 0 || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + return true; +} + +bool +AbsorbChecksumsOnBarrier(void) +{ + Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_VERSION; + return true; +} + +bool +AbsorbChecksumsOffInProgressBarrier(void) +{ + LocalDataChecksumVersion = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; + return true; +} + +bool +AbsorbChecksumsOffBarrier(void) +{ + LocalDataChecksumVersion = 0; + return true; +} + +/* + * InitLocalControldata + * + * Set up backend local caches of controldata variables which may change at + * any point during runtime and thus require special cased locking. So far + * this only applies to data_checksum_version, but it's intended to be general + * purpose enough to handle future cases. 
+ */ +void +InitLocalControldata(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; + LWLockRelease(ControlFileLock); +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + return "inprogress-on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + return "inprogress-off"; + else + return "off"; } /* @@ -7991,6 +8359,32 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* + * If we reach this point with checksums being enabled ("inprogress-on" + * state), we notify the user that they need to manually restart the + * process to enable checksums. This is because we cannot launch a dynamic + * background worker directly from here, it has to be launched from a + * regular backend. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + ereport(WARNING, + (errmsg("data checksums are being enabled, but no worker is running"), + errhint("Either disable or enable data checksums by calling the pg_disable_data_checksums() or pg_enable_data_checksums() functions."))); + + /* + * If data checksums were being disabled when the cluster was shutdown, we + * know that we have a state where all backends have stopped validating + * checksums and we can move to off instead. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + { + XLogChecksums(0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = 0; + LWLockRelease(ControlFileLock); + } + /* * All done with end-of-recovery actions. 
* @@ -9900,6 +10294,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XLogChecksums(ChecksumType new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. @@ -10355,6 +10767,28 @@ xlog_redo(XLogReaderState *record) /* Keep track of full_page_writes */ lastFullPageWrites = fpw; } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = state.new_checksumtype; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + if (state.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)); + else if (state.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF)); + else if (state.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON)); + else + { + Assert(state.new_checksumtype == 0); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF)); + } + } } #ifdef WAL_DEBUG diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 5e1aab319dd..5d77be8a2dc 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -25,6 +25,7 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "postmaster/datachecksumsworker.h" #include "pgstat.h" #include "replication/walreceiver.h" 
#include "storage/fd.h" @@ -784,3 +785,49 @@ pg_promote(PG_FUNCTION_ARGS) (errmsg("server did not promote within %d seconds", wait_seconds))); PG_RETURN_BOOL(false); } + +/* + * Disables checksums for the cluster, unless already disabled. + * + * Starts a background worker that turns off data checksums. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errmsg("must be superuser"))); + + StartDatachecksumsWorkerLauncher(false, 0, 0); + + PG_RETURN_VOID(); +} + +/* + * Enables checksums for the cluster, unless already enabled. + * + * Supports vacuum-like cost-based throttling, to limit system load. + * Starts a background worker that updates checksums on existing data. + */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + + if (!superuser()) + ereport(ERROR, + (errmsg("must be superuser"))); + + if (cost_delay < 0) + ereport(ERROR, + (errmsg("cost delay cannot be less than zero"))); + + if (cost_limit <= 0) + ereport(ERROR, + (errmsg("cost limit must be a positive value"))); + + StartDatachecksumsWorkerLauncher(true, cost_delay, cost_limit); + + PG_RETURN_VOID(); +} diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 9abc4a1f556..87052b06930 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -974,10 +974,17 @@ InsertPgClassTuple(Relation pg_class_desc, /* relpartbound is set by updating this tuple, if necessary */ nulls[Anum_pg_class_relpartbound - 1] = true; + /* + * Hold off interrupts to ensure that the observed data checksum state + * cannot change as we form and insert the tuple. 
+ */ + HOLD_INTERRUPTS(); + values[Anum_pg_class_relhaschecksums - 1] = BoolGetDatum(DataChecksumsNeedWrite()); tup = heap_form_tuple(RelationGetDescr(pg_class_desc), values, nulls); /* finally insert the new tuple, update the indexes, and clean up */ CatalogTupleInsert(pg_class_desc, tup); + RESUME_INTERRUPTS(); heap_freetuple(tup); } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index fa58afd9d78..516ae666b7a 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1264,6 +1264,11 @@ CREATE OR REPLACE FUNCTION RETURNS boolean STRICT VOLATILE LANGUAGE INTERNAL AS 'pg_promote' PARALLEL SAFE; +CREATE OR REPLACE FUNCTION pg_enable_data_checksums ( + cost_delay int DEFAULT 0, cost_limit int DEFAULT 100) + RETURNS void STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums' + PARALLEL RESTRICTED; + -- legacy definition for compatibility with 9.3 CREATE OR REPLACE FUNCTION json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833db..59b82ee9ce5 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -17,6 +17,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksumsworker.o \ fork_process.o \ interrupt.o \ pgarch.o \ diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index dd3dad3de35..8afbf762afc 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" @@ -128,6 +129,15 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "DatachecksumsWorkerLauncherMain", 
DatachecksumsWorkerLauncherMain + }, + { + "DatachecksumsWorkerMain", DatachecksumsWorkerMain + }, + { + "ResetDataChecksumsStateInDatabase", ResetDataChecksumsStateInDatabase } }; diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c new file mode 100644 index 00000000000..b26c31e8924 --- /dev/null +++ b/src/backend/postmaster/datachecksumsworker.c @@ -0,0 +1,1530 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.c + * Background worker for enabling or disabling data checksums online + * + * When enabling data checksums on a database at initdb time or with + * pg_checksums, no extra process is required as each page is checksummed, and + * verified, when accessed. When enabling checksums on an already running + * cluster, which does not run with checksums enabled, this worker will ensure + * that all pages are checksummed before verification of the checksums is + * turned on. In the case of disabling checksums, the state transition is + * recorded in the catalog and control file, and no changes are performed + * on the data pages or in the catalog. + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * Enabling checksums + * ------------------ + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during checksumming will + * have checksums set, but no reads will fail due to incorrect checksum. The + * DataChecksumsWorker will compile a list of databases which exist at the + * start of checksumming, and all of these which haven't been dropped during + * the processing MUST have been processed successfully in order for checksums + * to be enabled. 
Any new relation created during processing will see the + * in-progress state and will automatically be checksummed as well as have its + * state recorded in the catalog to avoid the datachecksumsworker having to + * process it when already checksummed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. Once all data pages + * in a relation have been written, pg_class.relhaschecksums is set to true to + * indicate that the relation is done. + * + * If the processing is interrupted by a cluster restart, it will be restarted + * from where it left off given that pg_class.relhaschecksums tracks the state of + * processed relations and the in-progress state will ensure all new writes are + * performed with checksums. Each database will be reprocessed, but relations + * where pg_class.relhaschecksums is true are skipped. + * + * If data checksums are enabled, then disabled, and then re-enabled, every + * relation's pg_class.relhaschecksums field will be reset to false before + * entering the in-progress mode. + * + * + * Disabling checksums + * ------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer verified. This ensures + * that backends which have yet to move from the "on" state will still be able + * to process data checksum validation. During "inprogress-off", the catalog + * state pg_class.relhaschecksums is cleared for all relations. + * + * + * Synchronization and Correctness + * ------------------------------- + * The processes involved in enabling, or disabling, data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. 
Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate local datachecksum state + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have the local state "enabled" + * + * There are two levels of synchronization required for enabling data checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. + * The former deals with cluster-wide agreement on data checksum state and the + * latter with ensuring that any concurrent activity cannot break the data + * checksum contract during processing. + * + * Synchronizing the state change is done with procsignal barriers, where the + * WAL logging backend updating the global state in the controlfile will wait + * for all other backends to absorb the barrier. Barrier absorption will happen + * during interrupt processing, which means that connected backends will change + * state at different times. To prevent data checksum state changes when + * writing and verifying checksums, interrupts shall be held off before + * interrogating state and resumed when the IO operation has been performed. + * + * When Enabling Data Checksums + * ---------------------------- + * A process which fails to observe data checksums being enabled can induce + * two types of errors: failing to write the checksum when modifying the page + * and failing to validate the data checksum on the page when reading it. + * + * When processing starts all backends belong to one of the below sets, with + * one set being empty: + * + * Bd: Backends in "off" state + * Bi: Backends in "inprogress-on" state + * + * If processing is started in an online cluster then all backends are in Bd. 
+ * If processing was halted by the cluster shutting down, the controlfile + * state "inprogress-on" will be observed on system startup and all backends + * will be in Bd. Backends transition Bd -> Bi via a procsignalbarrier. When + * the DataChecksumsWorker has finished writing checksums on all pages and + * enables data checksums cluster-wide, there are four sets of backends where + * Bd shall be an empty set: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bi: Backends in "inprogress-on" state + * + * Backends in Bi and Be will write checksums when modifying a page, but only + * backends in Be will verify the checksum during reading. The Bg backend is + * blocked waiting for all backends in Bi to process interrupts and move to + * Be. Any backend starting while Bg is waiting on the procsignalbarrier will + * observe the global state being "on" and will thus automatically belong to + * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be + * are compatible sets while still operating based on their local state as + * both write data checksums. + * + * When Disabling Data Checksums + * ----------------------------- + * A process which fails to observe that data checksums have been disabled + * can induce two types of errors: writing the checksum when modifying the + * page and validating a data checksum which is no longer correct due to + * modifications to the page. + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bo: Backends in "inprogress-off" state + * + * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd + * + * The goal is to transition all backends to Bd making the others empty sets. 
+ * Backends in Bo write data checksums, but don't validate them, such that + * backends still in Be can continue to validate pages until the barrier has + * been absorbed such that they are in Bo. Once all backends are in Bo, the + * barrier to transition to "off" can be raised and all backends can safely + * stop writing data checksums as no backend is enforcing data checksum + * validation any longer. + * + * + * Potential optimizations + * ----------------------- + * Below are some potential optimizations and improvements which were brought + * up during reviews of this feature, but which weren't implemented in the + * initial version. These are ideas listed without any validation on their + * feasibility or potential payoff. More discussion on these can be found on + * the -hackers threads linked to in the commit message of this feature. + * + * * Launching datachecksumsworker for resuming operation from the startup + * process: Currently users have to restart processing manually after a + * restart since a dynamic background worker cannot be started from the + * postmaster. Changing to the startup process could make resuming the + * processing automatic. + * * Avoid dirtying the page when checksums already match: Iff the checksum + * on the page happens to already match we still dirty the page. It should + * be enough to only do the log_newpage_buffer() call in that case. + * * Invent a lightweight WAL record that doesn't contain the full-page + * image but just the block number: On replay, the redo routine would read + * the page from disk. + * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used + * to enable checksums on a cluster which is in inprogress-on state and + * may have checksummed pages (make pg_checksums be able to resume an + * online operation). 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksumsworker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/lmgr.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" + +#define DATACHECKSUMSWORKER_MAX_DB_RETRIES 5 + +#define MAX_OPS 4 + +typedef enum DataChecksumOperation +{ + ENABLE_CHECKSUMS = 1, + DISABLE_CHECKSUMS, + RESET_STATE, + SET_INPROGRESS_ON, + SET_CHECKSUMS_ON +} DataChecksumOperation; + +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_RETRYDB, +} DatachecksumsWorkerResult; + +/* + * Signaling between backends calling pg_enable/disable_checkums, the + * checksums launcher process, and the checksums worker process. + * + * This struct is protected by DatachecksumsWorkerLock + */ +typedef struct DatachecksumsWorkerShmemStruct +{ + /* + * These are set by pg_enable/disable_checkums, to tell the launcher what + * the target state is. 
+ */ + bool launch_enable_checksums; /* True if checksums are being + * enabled, else false */ + int launch_cost_delay; + int launch_cost_limit; + + /* + * Is a launcher process currently running? + * + * This is set by the launcher process, after it has read the above launch_* + * parameters. + */ + bool launcher_running; + + /* + * These fields indicate the target state that the launcher is currently + * working towards. They can be different from the corresponding launch_* + * fields, if a new pg_enable/disable_checksums() call was made while the + * launcher/worker was already running. + * + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + bool enabling_checksums; /* True if checksums are being enabled, + * else false */ + int cost_delay; + int cost_limit; + + /* + * Signaling between the launcher and the worker process. + * + * As there is only a single worker, and the launcher + * won't read these until the worker exits, they can be accessed without + * the need for a lock. If multiple workers are supported then this will + * have to be revisited. + */ + /* result, set by worker before exiting */ + DatachecksumsWorkerResult success; + + /* tells the worker process whether it should also process the shared catalogs. 
*/ + bool process_shared_catalogs; +} DatachecksumsWorkerShmemStruct; + +/* Shared memory segment for datachecksumsworker */ +static DatachecksumsWorkerShmemStruct *DatachecksumsWorkerShmem; + +/* Bookkeeping for work to do */ +typedef struct DatachecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DatachecksumsWorkerDatabase; + +typedef struct DatachecksumsWorkerResultEntry +{ + Oid dboid; + DatachecksumsWorkerResult result; + int retries; +} DatachecksumsWorkerResultEntry; + + +/* + * Flag set by the interrupt handler + */ +static volatile sig_atomic_t abort_requested = false; + +/* + * Have we set the DatachecksumsWorkerShmemStruct->launcher_running flag? + * If we have, we need to clear it before exiting! + */ +static volatile sig_atomic_t launcher_running = false; + +/* + * Are we enabling checksums, or disabling them? + */ +static bool enabling_checksums; + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static DatachecksumsWorkerResult ProcessDatabase(DatachecksumsWorkerDatabase *db, const char *bgw_func_name); +static bool ProcessAllDatabases(bool *already_connected, const char *bgw_func_name); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void SetRelHasChecksums(Oid relOid); +static void WaitForAllTransactionsToFinish(void); + +/* + * StartDatachecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process + * + * The main entry point for starting data checksums processing for enabling as + * well as disabling. 
+ */ +void +StartDatachecksumsWorkerLauncher(bool enable_checksums, int cost_delay, int cost_limit) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + bool launcher_running; + + /* the cost delay settings have no effect when disabling */ + Assert(enable_checksums || cost_delay == 0); + Assert(enable_checksums || cost_limit == 0); + + /* + * Store the desired state in shared memory. + */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + DatachecksumsWorkerShmem->launch_enable_checksums = enable_checksums; + DatachecksumsWorkerShmem->launch_cost_delay = cost_delay; + DatachecksumsWorkerShmem->launch_cost_limit = cost_limit; + + /* is the launcher already running? */ + launcher_running = DatachecksumsWorkerShmem->launcher_running; + + LWLockRelease(DatachecksumsWorkerLock); + + /* + * Launch a new launcher process, if it's not running already. + * + * If the launcher is currently busy enabling the checksums, and we want + * them disabled (or vice versa), the launcher will notice that at latest + * when it's about to exit, and will loop back and process the new request. + * So if the launcher is already running, we don't need to do anything + * more here to abort it. + * + * If you call pg_enable/disable_checksums() twice in a row, before the + * launcher has had a chance to start up, we still end up launching it + * twice. That's OK, the second invocation will see that a launcher is + * already running and exit quickly. + * + * TODO: We could optimize here and skip launching the launcher, if we are + * already in the desired state, i.e. if the checksums are already enabled + * and you call pg_enable_checksums(). + */ + if (!launcher_running) + { + /* + * Prepare the BackgroundWorker and launch it. 
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DatachecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + ereport(ERROR, + (errmsg("failed to start background worker to process data checksums"))); + } +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + BlockNumber blknum; + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + if (!relns) + return false; + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress-on". + */ + for (blknum = 0; blknum < numblocks; blknum++) + { + Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy); + + /* + * Report to pgstat every 100 blocks to keep from overwhelming the + * activity reporting with close to identical reports. 
+ */ + if ((blknum % 100) == 0) + { + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s block %d/%d)", + relns, RelationGetRelationName(reln), + forkNames[forkNum], blknum, numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + } + + /* Need to get an exclusive lock before we can flag as dirty */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Mark the buffer as dirty and force a full page write. We have to + * re-write the page to WAL even if the checksum hasn't changed, + * because if there is a replica it might have a slightly different + * version of the page with an invalid checksum, caused by unlogged + * changes (e.g. hintbits) on the master happening while checksums + * were off. This can happen if there was a valid checksum on the page + * at one point in the past, so only when checksums are first on, then + * off, and then turned on again. Iff wal_level is set to "minimal", + * this could be avoided iff the checksum is calculated to be correct. + */ + START_CRIT_SECTION(); + MarkBufferDirty(buf); + log_newpage_buffer(buf, false); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * This is the only place where we check if we are asked to abort, the + * abortion will bubble up from here. It's safe to check this without + * a lock, because if we miss it being set, we will try again soon. + */ + Assert(enabling_checksums); + if (!DatachecksumsWorkerShmem->launch_enable_checksums) + abort_requested = true; + if (abort_requested) + return false; + + vacuum_delay_point(); + } + + pfree(relns); + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. 
+ */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + ForkNumber fnum; + bool aborted = false; + + StartTransactionCommand(); + + elog(DEBUG2, + "adding data checksums to relation with OID %u", + relationId); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationOpenSmgr(rel); + + for (fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + elog(DEBUG2, + "data checksum processing done for relation with OID %u: %s", + relationId, (aborted ? "aborted" : "finished")); + + if (!aborted) + SetRelHasChecksums(relationId); + + CommitTransactionCommand(); + + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * SetRelHasChecksums + * + * Sets the pg_class.relhaschecksums flag for the relation specified by relOid + * to true. The corresponding function for clearing state is + * ResetDataChecksumsStateInDatabase which operate on all relations in a + * database. + */ +static void +SetRelHasChecksums(Oid relOid) +{ + Relation rel; + Relation heaprel; + Form_pg_class pg_class_tuple; + HeapTuple tuple; + + /* + * If the relation has gone away since we checksummed it then that's not + * an errorcase. Exit early and continue on the next relation instead. 
+ */ + heaprel = try_relation_open(relOid, ShareUpdateExclusiveLock); + if (!heaprel) + return; + rel = table_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relOid); + + pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); + pg_class_tuple->relhaschecksums = true; + + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + + heap_freetuple(tuple); + + table_close(rel, RowExclusiveLock); + relation_close(heaprel, ShareUpdateExclusiveLock); +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. + */ +static DatachecksumsWorkerResult +ProcessDatabase(DatachecksumsWorkerDatabase *db, const char *bgw_func_name) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", bgw_func_name); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, make sure we retry processing + * this database. This will make the datachecksumsworker move on to the + * next database and quite likely fail with the same problem. 
TODO: Maybe + * we need a backoff to avoid running through all the databases here in + * short order. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + (errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying", + db->dbname), + errhint("The max_worker_processes setting might be too low."))); + return DATACHECKSUMSWORKER_RETRYDB; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + (errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log."))); + return DATACHECKSUMSWORKER_FAILED; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. When enabling checksums we + * won't at this time have changed the pg_control version to enabled so + * when the cluster comes back up processing will have to be resumed. When + * disabling, the pg_control version will be set to off before this so + * when the cluster comes up checksums will be off as expected. In the + * latter case we might have stale relhaschecksums flags in pg_class which + * it would be nice to handle in some way. Enabling data checksums reset + * the flags so any stale flags won't cause problems at that point, but + * they may cause confusion with users reading pg_class. TODO. 
+ */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + Assert(status == BGWH_STARTED); + ereport(DEBUG1, + (errmsg("initiating data checksum processing in database \"%s\"", + db->dbname))); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %d)", db->dbname, pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + if (DatachecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + (errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname))); + + pgstat_report_activity(STATE_IDLE, NULL); + + return DatachecksumsWorkerShmem->success; +} + +/* + * launcher_exit + * + * Internal routine for cleaning up state when the launcher process exits. We + * need to clean up the abort flag to ensure that processing can be restarted + * again after it was previously aborted. + */ +static void +launcher_exit(int code, Datum arg) +{ + if (launcher_running) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + launcher_running = false; + DatachecksumsWorkerShmem->launcher_running = false; + LWLockRelease(DatachecksumsWorkerLock); + } +} + +/* + * launcher_cancel_handler + * + * Internal routine for reacting to SIGINT and flagging the worker to abort. + * The worker won't be interrupted immediately but will check for abort flag + * between each block in a relation. 
+ */ +static void +launcher_cancel_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + abort_requested = true; + + /* + * There is no sleeping in the main loop, the flag will be checked + * periodically in ProcessSingleRelationFork. The worker does however + * sleep when waiting for concurrent transactions to end so we still need + * to set the latch. + */ + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * WaitForAllTransactionsToFinish + * Blocks awaiting all current transactions to finish + * + * Returns when all transactions which are active at the call of the function + * have ended, or if the postmaster dies while waiting. If the postmaster dies + * the abort flag will be set to indicate that the caller of this shouldn't + * proceed. + * + * NB: this will return early, if aborted by SIGINT or if the target state + * is changed while we're running. + */ +static void +WaitForAllTransactionsToFinish(void) +{ + TransactionId waitforxid; + + LWLockAcquire(XidGenLock, LW_SHARED); + waitforxid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + while (TransactionIdPrecedes(GetOldestActiveTransactionId(), waitforxid)) + { + char activity[64]; + int rc; + + /* Oldest running xid is older than us, so wait */ + snprintf(activity, + sizeof(activity), + "Waiting for current transactions to finish (waiting for %u)", + waitforxid); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION); + + /* + * If the postmaster died we won't be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. 
+ */ + if (rc & WL_POSTMASTER_DEATH) + ereport(FATAL, + (errmsg("postmaster exited during data checksum processing"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + LWLockAcquire(DatachecksumsWorkerLock, LW_SHARED); + if (DatachecksumsWorkerShmem->launch_enable_checksums != enabling_checksums) + abort_requested = true; + LWLockRelease(DatachecksumsWorkerLock); + if (abort_requested) + break; + } + + pgstat_report_activity(STATE_IDLE, NULL); + return; +} + +/* + * DatachecksumsWorkerLauncherMain + * + * Main function for launching dynamic background workers for processing data + * checksums in databases. This function has the bgworker management, with + * ProcessAllDatabases being responsible for looping over the databases and + * initiating processing. + */ +void +DatachecksumsWorkerLauncherMain(Datum arg) +{ + bool connected = false; + bool status = false; + DataChecksumOperation current; + int operations[MAX_OPS]; + + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + (errmsg("background worker \"datachecksumsworker\" launcher started"))); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + + BackgroundWorkerUnblockSignals(); + + InitXLOGAccess(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + if (DatachecksumsWorkerShmem->launcher_running) + { + /* Launcher was already running. Let it finish. 
*/ + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + launcher_running = true; + + enabling_checksums = DatachecksumsWorkerShmem->launch_enable_checksums; + DatachecksumsWorkerShmem->launcher_running = true; + DatachecksumsWorkerShmem->enabling_checksums = enabling_checksums; + DatachecksumsWorkerShmem->cost_delay = DatachecksumsWorkerShmem->launch_cost_delay; + DatachecksumsWorkerShmem->cost_limit = DatachecksumsWorkerShmem->launch_cost_limit; + LWLockRelease(DatachecksumsWorkerLock); + + /* + * The target state can change while we are busy enabling/disabling checksums, + * if the user calls pg_disable/enable_checksums() before we are finished with + * the previous request. In that case, we will loop back here, to process the + * new request. + */ +again: + + memset(operations, 0, sizeof(operations)); + + /* + * If we're asked to enable checksums, we need to check if processing was + * previously interrupted such that we should resume rather than start + * from scratch. + */ + if (enabling_checksums) + { + /* + * If we are asked to enable checksums in a cluster which already + * has checksums enabled, exit immediately as there is nothing + * more to do. + */ + if (DataChecksumsNeedVerify()) + goto done; + + /* + * If the controlfile state is set to "inprogress-on" then we will + * resume from where we left off based on the catalog state. This + * will be safe since new relations created while the checksum- + * worker was disabled will have checksums enabled. + */ + else if (DataChecksumsOnInProgress()) + { + operations[0] = ENABLE_CHECKSUMS; + operations[1] = SET_CHECKSUMS_ON; + } + + /* + * If the controlfile state is set to "inprogress-off" then we + * were interrupted while the catalog state was being cleared. In + * this case we need to first reset state and then continue with + * enabling checksums. 
+ */ + else if (DataChecksumsOffInProgress()) + { + operations[0] = RESET_STATE; + operations[1] = SET_INPROGRESS_ON; + operations[2] = ENABLE_CHECKSUMS; + operations[3] = SET_CHECKSUMS_ON; + } + + /* + * Data checksums are off in the cluster, we can proceed with + * enabling them. Just in case we will start by resetting the + * catalog state since we are doing this from scratch and we don't + * want leftover catalog state to cause us to miss a relation. + */ + else + { + operations[0] = RESET_STATE; + operations[1] = SET_INPROGRESS_ON; + operations[2] = ENABLE_CHECKSUMS; + operations[3] = SET_CHECKSUMS_ON; + } + } + else + { + /* + * Regardless of current state in the system, we go through the + * motions when asked to disable checksums. The catalog state is + * only defined to be relevant during the operation of enabling + * checksums, and have no use at any other point in time. That + * being said, a user who sees stale relhaschecksums entries in + * the catalog might run this just in case. + * + * Resetting state must be performed after setting data checksum + * state to off, as there otherwise might (depending on system + * data checksum state) be a window between catalog resetting and + * state transition when new relations are created with the + * catalog state set to true. 
+ */ + operations[0] = DISABLE_CHECKSUMS; + operations[1] = RESET_STATE; + } + + for (int i = 0; i < MAX_OPS; i++) + { + current = operations[i]; + + if (!current) + break; + + switch (current) + { + case DISABLE_CHECKSUMS: + SetDataChecksumsOff(); + break; + + case SET_INPROGRESS_ON: + SetDataChecksumsOnInProgress(); + break; + + case SET_CHECKSUMS_ON: + SetDataChecksumsOn(); + break; + + case RESET_STATE: + status = ProcessAllDatabases(&connected, "ResetDataChecksumsStateInDatabase"); + if (!status) + ereport(ERROR, + (errmsg("unable to reset catalog checksum state"))); + break; + + case ENABLE_CHECKSUMS: + status = ProcessAllDatabases(&connected, "DatachecksumsWorkerMain"); + if (!status) + ereport(ERROR, + (errmsg("unable to enable checksums in cluster"))); + break; + + default: + elog(ERROR, "unknown checksum operation requested"); + break; + } + } + +done: + /* + * All done. But before we exit, check if the target state was changed while + * we were running. In that case we will have to start all over again. + */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + if (DatachecksumsWorkerShmem->launch_enable_checksums != enabling_checksums) + { + DatachecksumsWorkerShmem->enabling_checksums = DatachecksumsWorkerShmem->launch_enable_checksums; + DatachecksumsWorkerShmem->cost_delay = DatachecksumsWorkerShmem->launch_cost_delay; + DatachecksumsWorkerShmem->cost_limit = DatachecksumsWorkerShmem->launch_cost_limit; + LWLockRelease(DatachecksumsWorkerLock); + goto again; + } + + launcher_running = false; + DatachecksumsWorkerShmem->launcher_running = false; + LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * ProcessAllDatabases + * Compute the list of all databases and process checksums in each + * + * This will repeatedly generate a list of databases to process for either + * enabling checksums or resetting the checksum catalog tracking. 
Until no + * new databases are found, this will loop around computing a new list and + * comparing it to the already seen ones. + */ +static bool +ProcessAllDatabases(bool *already_connected, const char *bgw_func_name) +{ + List *DatabaseList; + HTAB *ProcessedDatabases = NULL; + ListCell *lc; + HASHCTL hash_ctl; + bool found_failed = false; + + /* Initialize a hash tracking all processed databases */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(DatachecksumsWorkerResultEntry); + ProcessedDatabases = hash_create("Processed databases", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize a connection to shared catalogs only. + */ + if (!*already_connected) + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + *already_connected = true; + + /* + * Set up so first run processes shared catalogs, but not once in every + * db. + */ + DatachecksumsWorkerShmem->process_shared_catalogs = true; + + /* + * Get a list of all databases to process. This may include databases that + * were created during our runtime. Since a database can be created as a + * copy of any other database (which may not have existed in our last + * run), we have to repeat this loop until no new databases show up in the + * list. 
+ */ + DatabaseList = BuildDatabaseList(); + + while (true) + { + int processed_databases = 0; + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult result; + DatachecksumsWorkerResultEntry *entry; + bool found; + + elog(DEBUG1, + "starting processing of database %s with oid %u", + db->dbname, db->dboid); + + entry = (DatachecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid, + HASH_FIND, NULL); + + if (entry) + { + if (entry->result == DATACHECKSUMSWORKER_RETRYDB) + { + /* + * Limit the number of retries to avoid infinite looping + * in case there simply wont be enough workers in the + * cluster to finish this operation. + */ + if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES) + entry->result = DATACHECKSUMSWORKER_FAILED; + } + + /* Skip if this database has been processed already */ + if (entry->result != DATACHECKSUMSWORKER_RETRYDB) + { + pfree(db->dbname); + pfree(db); + continue; + } + } + + result = ProcessDatabase(db, bgw_func_name); + processed_databases++; + + if (result == DATACHECKSUMSWORKER_SUCCESSFUL) + { + /* + * If one database has completed shared catalogs, we don't + * have to process them again. + */ + if (DatachecksumsWorkerShmem->process_shared_catalogs) + DatachecksumsWorkerShmem->process_shared_catalogs = false; + } + else if (result == DATACHECKSUMSWORKER_ABORTED) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found); + entry->dboid = db->dboid; + entry->result = result; + if (!found) + entry->retries = 0; + else + entry->retries++; + + pfree(db->dbname); + pfree(db); + } + + elog(DEBUG1, + "%i databases processed for data checksum enabling, %s", + processed_databases, + (processed_databases ? 
"process with restart" : "process completed")); + + list_free(DatabaseList); + + /* + * If no databases were processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can exist. + */ + if (processed_databases == 0) + break; + + /* + * Re-generate the list of databases for another pass. Since we wait + * for all pre-existing transactions finish, this way we can be + * certain that there are no databases left without checksums. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + } + + /* + * ProcessedDatabases now has all databases and the results of their + * processing. Failure to enable checksums for a database can be because + * they actually failed for some reason, or because the database was + * dropped between us getting the database list and trying to process it. + * Get a fresh list of databases to detect the second case where the + * database was dropped before we had started processing it. If a database + * still exists, but enabling checksums failed then we fail the entire + * checksumming process and exit with an error. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult *entry; + bool found; + + entry = hash_search(ProcessedDatabases, (void *) &db->dboid, + HASH_FIND, &found); + + /* + * We are only interested in the databases where the failed database + * still exists. 
+ */ + if (found && *entry == DATACHECKSUMSWORKER_FAILED) + { + ereport(WARNING, + (errmsg("failed to enable data checksums in \"%s\"", + db->dbname))); + found_failed = found; + continue; + } + } + + if (found_failed) + { + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(); + ereport(ERROR, + (errmsg("checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the error."))); + } + + /* + * Force a checkpoint to get everything out to disk. TODO: we probably + * don't want to use a CHECKPOINT_IMMEDIATE here but it's very convenient + * for testing until the patch is fully baked, as it may otherwise make + * tests take a lot longer. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_IMMEDIATE); + + return true; +} + +/* + * DatachecksumsWorkerShmemSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DatachecksumsWorkerShmemSize(void) +{ + Size size; + + size = sizeof(DatachecksumsWorkerShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DatachecksumsWorkerShmemInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DatachecksumsWorkerShmemInit(void) +{ + bool found; + + DatachecksumsWorkerShmem = (DatachecksumsWorkerShmemStruct *) + ShmemInitStruct("DatachecksumsWorker Data", + DatachecksumsWorkerShmemSize(), + &found); + + MemSet(DatachecksumsWorkerShmem, 0, DatachecksumsWorkerShmemSize()); + + /* + * Even if this is a redundant assignment, we want to be explicit about + * our intent for readability, since we want to be able to query this + * state in case of restartability. 
+ */ + DatachecksumsWorkerShmem->launch_enable_checksums = false; + DatachecksumsWorkerShmem->launcher_running = false; +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. If the caller wants to ensure that no concurrently + * running CREATE DATABASE calls exist, this needs to be preceeded by a call + * to WaitForAllTransactionsToFinish(). + */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DatachecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DatachecksumsWorkerDatabase *) palloc(sizeof(DatachecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +/* + * BuildRelationList + * Compile a list of relations in the database + * + * Returns a list of OIDs for the request relation types. If temp_relations + * is True then only temporary relations are returned. If temp_relations is + * False then non-temporary relations which have data checksums are returned. + * If include_shared is True then shared relations are included as well in a + * non-temporary list. include_shared has no relevance when building a list of + * temporary relations. 
+ */ +static List * +BuildRelationList(bool temp_relations, bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + /* + * Only include temporary relations when asked for a temp relation + * list. + */ + if (pgc->relpersistence == RELPERSISTENCE_TEMP) + { + if (!temp_relations) + continue; + } + else + { + if (!RELKIND_HAS_STORAGE(pgc->relkind)) + continue; + + if (pgc->relhaschecksums) + continue; + + if (pgc->relisshared && !include_shared) + continue; + } + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * ResetDataChecksumsStateInDatabase + * Main worker function for clearing checksums state in the catalog + * + * Resets the pg_class.relhaschecksums flag to false for all entries in the + * current database. This is required to be performed before adding checksums + * to a running cluster in order to track the state of the processing. 
+ */ +void +ResetDataChecksumsStateInDatabase(Datum arg) +{ + Relation rel; + HeapTuple tuple; + Oid dboid = DatumGetObjectId(arg); + TableScanDesc scan; + Form_pg_class pgc; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("resetting catalog state for data checksums in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, BGWORKER_BYPASS_ALLOWCONN); + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, RowExclusiveLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, ForwardScanDirection))) + { + tuple = heap_copytuple(tuple); + pgc = (Form_pg_class) GETSTRUCT(tuple); + + if (pgc->relhaschecksums) + { + pgc->relhaschecksums = false; + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + } + + heap_freetuple(tuple); + } + + table_endscan(scan); + table_close(rel, RowExclusiveLock); + + CommitTransactionCommand(); + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; +} + +/* + * DatachecksumsWorkerMain + * + * Main function for enabling checksums in a single database, This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums. 
+ */ +void +DatachecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + ListCell *lc; + BufferAccessStrategy strategy; + bool aborted = false; + + enabling_checksums = true; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("starting data checksum processing in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildRelationList(true, false); + + /* + * Enable vacuum cost delay, if any. + */ + Assert(DatachecksumsWorkerShmem->enabling_checksums); + VacuumCostDelay = DatachecksumsWorkerShmem->cost_delay; + VacuumCostLimit = DatachecksumsWorkerShmem->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumPageHit = 0; + VacuumPageMiss = 0; + VacuumPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. + */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(false, + DatachecksumsWorkerShmem->process_shared_catalogs); + foreach(lc, RelationList) + { + Oid reloid = lfirst_oid(lc); + + if (!ProcessSingleRelationByOid(reloid, strategy)) + { + aborted = true; + break; + } + } + list_free(RelationList); + + if (aborted) + { + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + (errmsg("data checksum processing aborted in database OID %u", + dboid))); + return; + } + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. 
Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress-on" state), so no need to wait for those. + */ + for (;;) + { + List *CurrentTempTables; + ListCell *lc; + int numleft; + char activity[64]; + + CurrentTempTables = BuildRelationList(true, false); + numleft = 0; + foreach(lc, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, lfirst_oid(lc))) + numleft++; + } + list_free(CurrentTempTables); + + if (numleft == 0) + break; + + /* At least one temp table is left to wait for */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + aborted = DatachecksumsWorkerShmem->launch_enable_checksums != enabling_checksums; + LWLockRelease(DatachecksumsWorkerLock); + + if (aborted || abort_requested) + { + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + (errmsg("data checksum processing aborted in database OID %u", + dboid))); + return; + } + } + + list_free(InitialTempTableList); + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; + ereport(DEBUG1, + (errmsg("data checksum processing completed in database with OID %u", + dboid))); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index f75b52719dd..0fef097eb80 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4017,6 +4017,12 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_CHECKPOINT_START: event_name = "CheckpointStart"; break; + case WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION: + event_name = "ChecksumEnableStartCondition"; + break; + case WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION: + 
event_name = "ChecksumEnableFinishCondition"; + break; case WAIT_EVENT_EXECUTE_GATHER: event_name = "ExecuteGather"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 0f54635550b..cc494b6f13d 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1612,7 +1612,7 @@ sendFile(const char *readfilename, const char *tarfilename, _tarWriteHeader(tarfilename, NULL, statbuf, false); - if (!noverify_checksums && DataChecksumsEnabled()) + if (!noverify_checksums) { char *filename; @@ -1698,7 +1698,14 @@ sendFile(const char *readfilename, const char *tarfilename, */ if (!PageIsNew(page) && PageGetLSN(page) < startptr) { + HOLD_INTERRUPTS(); + if (!DataChecksumsNeedVerify()) + { + RESUME_INTERRUPTS(); + continue; + } checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + RESUME_INTERRUPTS(); phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index afa1df00d0e..d9c482454ff 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -223,6 +223,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 561c212092f..9362ec00184 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2944,8 +2944,13 @@ BufferGetLSNAtomic(Buffer buffer) /* * If we don't need locking for correctness, fastpath out. */ + HOLD_INTERRUPTS(); if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer)) + { + RESUME_INTERRUPTS(); return PageGetLSN(page); + } + RESUME_INTERRUPTS(); /* Make sure we've got a real buffer, and that we hold a pin on it. 
*/ Assert(BufferIsValid(buffer)); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index f9bbe97b507..c7928f34957 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -28,6 +28,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -149,6 +150,7 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + size = add_size(size, DatachecksumsWorkerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -259,6 +261,7 @@ CreateSharedMemoryAndSemaphores(void) WalSndShmemInit(); WalRcvShmemInit(); ApplyLauncherShmemInit(); + DatachecksumsWorkerShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index c43cdd685b4..a3720617f94 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/xlog.h" #include "port/pg_bitutils.h" #include "commands/async.h" #include "miscadmin.h" @@ -98,7 +99,6 @@ static volatile ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); -static bool ProcessBarrierPlaceholder(void); /* * ProcSignalShmemSize @@ -538,8 +538,17 @@ ProcessProcSignalBarrier(void) type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags); switch (type) { - case PROCSIGNAL_BARRIER_PLACEHOLDER: - processed = ProcessBarrierPlaceholder(); + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + 
processed = AbsorbChecksumsOnInProgressBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + processed = AbsorbChecksumsOnBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + processed = AbsorbChecksumsOffInProgressBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + processed = AbsorbChecksumsOffBarrier(); break; } @@ -604,24 +613,6 @@ ResetProcSignalBarrierBits(uint32 flags) InterruptPending = true; } -static bool -ProcessBarrierPlaceholder(void) -{ - /* - * XXX. This is just a placeholder until the first real user of this - * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to - * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something - * appropriately descriptive. Get rid of this function and instead have - * ProcessBarrierSomethingElse. Most likely, that function should live in - * the file pertaining to that subsystem, rather than here. - * - * The return value should be 'true' if the barrier was successfully - * absorbed and 'false' if not. Note that returning 'false' can lead to - * very frequent retries, so try hard to make that an uncommon case. - */ - return true; -} - /* * CheckProcSignal - check to see if a particular reason has been * signaled, and clear the signal flag. 
Should be called after receiving diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..5b083749d55 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +DatachecksumsWorkerLock 48 diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index e30d7ac59ad..78edf57adc8 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +turned on and off using pg_enable_data_checksums()/pg_disable_data_checksums() +at runtime. The checksum is not valid at all times on a data page!! The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 9ac556b4ae0..8fbebd98700 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -100,13 +100,20 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + /* + * Hold interrupts for the duration of the checksum check to ensure + * that the data checksums state cannot change and thus risking a + * false positive or negative. 
+ */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) checksum_failure = true; } + RESUME_INTERRUPTS(); /* * The following checks don't prove the header is correct, only that @@ -1394,10 +1401,6 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) { static char *pageCopy = NULL; - /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) - return (char *) page; - /* * We allocate the copy space once and use it over on each subsequent * call. The point of palloc'ing here, rather than having a static char @@ -1407,8 +1410,17 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) if (pageCopy == NULL) pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + /* If we don't need a checksum, just return the passed-in data */ + HOLD_INTERRUPTS(); + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); + return (char *) page; + } + memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + RESUME_INTERRUPTS(); return pageCopy; } @@ -1421,9 +1433,14 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) void PageSetChecksumInplace(Page page, BlockNumber blkno) { + HOLD_INTERRUPTS(); /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); return; + } ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); + RESUME_INTERRUPTS(); } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 62bff52638d..4ac396ccf1e 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1567,9 +1567,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = 
pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1585,9 +1582,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 7ef510cd01b..633821bae59 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -271,7 +271,8 @@ static void write_relcache_init_file(bool shared); static void write_item(const void *data, Size len, FILE *fp); static void formrdesc(const char *relationName, Oid relationReltype, - bool isshared, int natts, const FormData_pg_attribute *attrs); + bool isshared, int natts, const FormData_pg_attribute *attrs, + bool haschecksums); static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic); static Relation AllocateRelationDesc(Form_pg_class relp); @@ -1828,7 +1829,8 @@ RelationInitTableAccessMethod(Relation relation) static void formrdesc(const char *relationName, Oid relationReltype, bool isshared, - int natts, const FormData_pg_attribute *attrs) + int natts, const FormData_pg_attribute *attrs, + bool haschecksums) { Relation relation; int i; @@ -1896,6 +1898,8 @@ formrdesc(const char *relationName, Oid relationReltype, relation->rd_rel->relnatts = (int16) natts; relation->rd_rel->relam = HEAP_TABLE_AM_OID; + relation->rd_rel->relhaschecksums = haschecksums; + /* * initialize attribute tuple form * @@ -3548,6 +3552,27 @@ RelationBuildLocalRelation(const char *relname, relkind == RELKIND_MATVIEW) RelationInitTableAccessMethod(rel); + /* + * Set the data checksum state. Since the data checksum state can change + * at any time, the fetched value might be out of date by the time the + * relation is built. 
DataChecksumsNeedWrite returns true when data + * checksums are: enabled; are in the process of being enabled (state: + * "inprogress-on"); are in the process of being disabled (state: + * "inprogress-off"). Since relhaschecksums is only used to track progress + * when data checksums are being enabled, and going from disabled to + * enabled will clear relhaschecksums before starting, it is safe to use + * this value for a concurrent state transition to off. + * + * If DataChecksumsNeedWrite returns false, and is concurrently changed to + * true then that implies that checksums are being enabled. Worst case, + * this will lead to the relation being processed for checksums even + * though each page written will have them already. Performing this last + * shortens the window, but doesn't avoid it. + */ + HOLD_INTERRUPTS(); + rel->rd_rel->relhaschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); + /* * Okay to insert into the relcache hash table. * @@ -3813,6 +3838,7 @@ void RelationCacheInitializePhase2(void) { MemoryContext oldcxt; + bool haschecksums; /* * relation mapper needs initialized too @@ -3837,16 +3863,24 @@ RelationCacheInitializePhase2(void) */ if (!load_relcache_init_file(true)) { + /* + * Our local state can't change at this point, so we can cache the + * checksum state. 
+ */ + HOLD_INTERRUPTS(); + haschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); + formrdesc("pg_database", DatabaseRelation_Rowtype_Id, true, - Natts_pg_database, Desc_pg_database); + Natts_pg_database, Desc_pg_database, haschecksums); formrdesc("pg_authid", AuthIdRelation_Rowtype_Id, true, - Natts_pg_authid, Desc_pg_authid); + Natts_pg_authid, Desc_pg_authid, haschecksums); formrdesc("pg_auth_members", AuthMemRelation_Rowtype_Id, true, - Natts_pg_auth_members, Desc_pg_auth_members); + Natts_pg_auth_members, Desc_pg_auth_members, haschecksums); formrdesc("pg_shseclabel", SharedSecLabelRelation_Rowtype_Id, true, - Natts_pg_shseclabel, Desc_pg_shseclabel); + Natts_pg_shseclabel, Desc_pg_shseclabel, haschecksums); formrdesc("pg_subscription", SubscriptionRelation_Rowtype_Id, true, - Natts_pg_subscription, Desc_pg_subscription); + Natts_pg_subscription, Desc_pg_subscription, haschecksums); #define NUM_CRITICAL_SHARED_RELS 5 /* fix if you change list above */ } @@ -3875,6 +3909,7 @@ RelationCacheInitializePhase3(void) RelIdCacheEnt *idhentry; MemoryContext oldcxt; bool needNewCacheFile = !criticalSharedRelcachesBuilt; + bool haschecksums; /* * relation mapper needs initialized too @@ -3895,15 +3930,18 @@ RelationCacheInitializePhase3(void) !load_relcache_init_file(false)) { needNewCacheFile = true; + HOLD_INTERRUPTS(); + haschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); formrdesc("pg_class", RelationRelation_Rowtype_Id, false, - Natts_pg_class, Desc_pg_class); + Natts_pg_class, Desc_pg_class, haschecksums); formrdesc("pg_attribute", AttributeRelation_Rowtype_Id, false, - Natts_pg_attribute, Desc_pg_attribute); + Natts_pg_attribute, Desc_pg_attribute, haschecksums); formrdesc("pg_proc", ProcedureRelation_Rowtype_Id, false, - Natts_pg_proc, Desc_pg_proc); + Natts_pg_proc, Desc_pg_proc, haschecksums); formrdesc("pg_type", TypeRelation_Rowtype_Id, false, - Natts_pg_type, Desc_pg_type); + Natts_pg_type, Desc_pg_type, haschecksums); #define 
NUM_CRITICAL_LOCAL_RELS 4 /* fix if you change list above */ } diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 0f67b99cc55..045da219044 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -275,6 +275,12 @@ GetBackendTypeDesc(BackendType backendType) case B_LOGGER: backendDesc = "logger"; break; + case B_DATACHECKSUMSWORKER_LAUNCHER: + backendDesc = "datachecksumsworker launcher"; + break; + case B_DATACHECKSUMSWORKER_WORKER: + backendDesc = "datachecksumsworker worker"; + break; } return backendDesc; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index e5965bc517d..92367ece4b8 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -606,6 +606,11 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend ID: %d", MyBackendId); + /* + * Set up backend local cache of Controldata values. + */ + InitLocalControldata(); + /* Now that we have a BackendId, we can participate in ProcSignal */ ProcSignalInit(MyBackendId); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 17579eeaca9..3b7207afb54 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -36,6 +36,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" @@ -76,6 +77,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "storage/dsm_impl.h" #include "storage/fd.h" #include "storage/large_object.h" @@ -500,6 +502,17 @@ static struct config_enum_entry shared_memory_options[] = { {NULL, 0, false} }; +/* + * Options for data_checksums enum. 
+ */ +static const struct config_enum_entry data_checksum_options[] = { + {"on", DATA_CHECKSUMS_ON, true}, + {"off", DATA_CHECKSUMS_OFF, true}, + {"inprogress-on", DATA_CHECKSUMS_INPROGRESS_ON, true}, + {"inprogress-off", DATA_CHECKSUMS_INPROGRESS_OFF, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -609,7 +622,7 @@ static int max_identifier_length; static int block_size; static int segment_size; static int wal_block_size; -static bool data_checksums; +static int data_checksums; static bool integer_datetimes; static bool assert_enabled; static bool in_hot_standby; @@ -1910,17 +1923,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), @@ -4830,6 +4832,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether data checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &data_checksums, + DATA_CHECKSUMS_OFF, data_checksum_options, + NULL, NULL, show_data_checksums + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 0223ee44082..f3f029f41e5 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -600,7 +600,7 @@ main(int argc, char *argv[]) exit(1); } - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == DATA_CHECKSUMS_ON && mode == PG_MODE_ENABLE) { pg_log_error("data 
checksums are already enabled in cluster"); diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 4f647cdf334..12988574583 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -671,6 +671,15 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If checksums have been turned on in the old cluster, but the + * datachecksumsworker have yet to finish, then disallow upgrading. The + * user should either let the process finish, or turn off checksums, + * before retrying. + */ + if (oldctrl->data_checksum_version == 2) + pg_fatal("checksum enabling in old cluster is in progress\n"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 919a7849fd0..b35cd4d503a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -218,7 +218,7 @@ typedef struct uint32 large_object; bool date_is_int; bool float8_pass_by_value; - bool data_checksum_version; + uint32 data_checksum_version; } ControlData; /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 75ec1073bd0..6947c095914 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -198,8 +198,11 @@ extern PGDLLIMPORT int wal_level; * individual bits on a page, it's still consistent no matter what combination * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. + * + * Since XLogHintBitIsNeeded calls DataChecksumsNeedWrite, interrupts must be + * held off during this call. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -318,7 +321,19 @@ extern TimestampTz GetCurrentChunkReplayStartTime(void); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsOnInProgress(void); +extern bool DataChecksumsOffInProgress(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern bool AbsorbChecksumsOnInProgressBarrier(void); +extern bool AbsorbChecksumsOffInProgressBarrier(void); +extern bool AbsorbChecksumsOnBarrier(void); +extern bool AbsorbChecksumsOffBarrier(void); +extern const char *show_data_checksums(void); +extern void InitLocalControldata(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 224cae0246f..adbe81e890b 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilenode.h" @@ -249,6 +250,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when checksum level is changed */ +typedef struct xl_checksum_state +{ + ChecksumType new_checksumtype; +} xl_checksum_state; + /* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ typedef struct xl_end_of_recovery { diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index e8dcd15a55f..bf296625e42 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -119,6 +119,9 @@ 
CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* is relation a partition? */ bool relispartition BKI_DEFAULT(f); + /* does the relation have checksums enabled */ + bool relhaschecksums BKI_DEFAULT(f); + /* heap for rewrite during DDL, link to original rel */ Oid relrewrite BKI_DEFAULT(0); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index e3f48158ce7..d8229422afc 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_CHECKSUMS 0xC0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index b5f52d4e4a3..f050f15a58c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11301,6 +11301,22 @@ proname => 'raw_array_subscript_handler', prorettype => 'internal', proargtypes => 'internal', prosrc => 'raw_array_subscript_handler' }, +{ oid => '9258', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => '', + prosrc => 'disable_data_checksums' }, + +{ oid => '9257', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => 'int4 int4', proallargtypes => '{int4,int4}', + proargmodes => '{i,i}', + proargnames => '{cost_delay,cost_limit}', + prosrc => 'enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1bdc97e3082..f013acba76a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -324,6 +324,8 @@ typedef enum BackendType B_ARCHIVER, B_STATS_COLLECTOR, 
B_LOGGER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, } BackendType; extern BackendType MyBackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 724068cf87e..0974dfadfe4 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -963,6 +963,8 @@ typedef enum WAIT_EVENT_BTREE_PAGE, WAIT_EVENT_CHECKPOINT_DONE, WAIT_EVENT_CHECKPOINT_START, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION, WAIT_EVENT_EXECUTE_GATHER, WAIT_EVENT_HASH_BATCH_ALLOCATE, WAIT_EVENT_HASH_BATCH_ELECT, diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h new file mode 100644 index 00000000000..845f6bceaae --- /dev/null +++ b/src/include/postmaster/datachecksumsworker.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.h + * header file for checksum helper background worker + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksumsworker.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUMSWORKER_H +#define DATACHECKSUMSWORKER_H + +/* Shared memory */ +extern Size DatachecksumsWorkerShmemSize(void); +extern void DatachecksumsWorkerShmemInit(void); + +/* Start the background processes for enabling or disabling checksums */ +void StartDatachecksumsWorkerLauncher(bool enable_checksums, + int cost_delay, int cost_limit); + +/* Background worker entrypoints */ +void DatachecksumsWorkerLauncherMain(Datum arg); +void DatachecksumsWorkerMain(Datum arg); +void ResetDataChecksumsStateInDatabase(Datum arg); + +#endif /* DATACHECKSUMSWORKER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 359b749f7f4..c35b747520a 100644 --- a/src/include/storage/bufpage.h +++ 
b/src/include/storage/bufpage.h @@ -198,6 +198,9 @@ typedef PageHeaderData *PageHeader; */ #define PG_PAGE_LAYOUT_VERSION 4 #define PG_DATA_CHECKSUM_VERSION 1 +#define PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION 2 +#define PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION 3 + /* ---------------------------------------------------------------- * page support macros diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 80d23591921..f736b12f986 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,14 @@ #include "storage/block.h" +typedef enum ChecksumType +{ + DATA_CHECKSUMS_OFF = 0, + DATA_CHECKSUMS_ON, + DATA_CHECKSUMS_INPROGRESS_ON, + DATA_CHECKSUMS_INPROGRESS_OFF +} ChecksumType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 4ae7dc33b8e..d865796d048 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,12 +48,10 @@ typedef enum typedef enum { - /* - * XXX. PROCSIGNAL_BARRIER_PLACEHOLDER should be replaced when the first - * real user of the ProcSignalBarrier mechanism is added. It's just here - * for now because we can't have an empty enum. - */ - PROCSIGNAL_BARRIER_PLACEHOLDER = 0 + PROCSIGNAL_BARRIER_CHECKSUM_OFF = 0, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON } ProcSignalBarrierType; /* diff --git a/src/test/Makefile b/src/test/Makefile index ab1ef9a4753..9774816625b 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -13,7 +13,7 @@ top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = perl regress isolation modules authentication recovery subscription \ - locale + locale checksum # Test suites that are not safe by default but can be run if selected # by the user via the whitespace-separated list in variable diff --git a/src/test/checksum/.gitignore b/src/test/checksum/.gitignore new file mode 100644 index 00000000000..871e943d50e --- /dev/null +++ b/src/test/checksum/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/checksum/Makefile b/src/test/checksum/Makefile new file mode 100644 index 00000000000..fd60f7e97f3 --- /dev/null +++ b/src/test/checksum/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/checksum +# +# Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/checksum/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/test/checksum +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check diff --git a/src/test/checksum/README b/src/test/checksum/README new file mode 100644 index 00000000000..0f0317060b3 --- /dev/null +++ b/src/test/checksum/README @@ -0,0 +1,22 @@ +src/test/checksum/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster. + +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they master or standby(s) for the purpose of +the tests. + +NOTE: This requires the --enable-tap-tests argument to configure. 
diff --git a/src/test/checksum/t/001_basic.pl b/src/test/checksum/t/001_basic.pl new file mode 100644 index 00000000000..3b229de915a --- /dev/null +++ b/src/test/checksum/t/001_basic.pl @@ -0,0 +1,92 @@ +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'off', 'ensure checksums are disabled'); + +# No relation in pg_class should have relhaschecksums at this point +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;"); +is($result, '0', 'ensure no entries in pg_class has checksums recorded'); + +# Enable data checksums +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Wait for checksums to become enabled +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Check that relations with storage have been marked with relhaschecksums in +# pg_class +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " + . 
"AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'ensure all relations are correctly flagged in the catalog'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op.. +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums again +$node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); + +# Wait for checksums to be disabled. Disabling checksums clear the catalog +# relhaschecksums state so await that before calling it done. +$result = $node->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are disabled'); + +# Test reading again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the relhaschecksums flags in the +# catalog aren't tricking processing into skipping previously checksummed +# relations +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Run a dummy query 
just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; + +done_testing(); diff --git a/src/test/checksum/t/002_restarts.pl b/src/test/checksum/t/002_restarts.pl new file mode 100644 index 00000000000..41a4d640375 --- /dev/null +++ b/src/test/checksum/t/002_restarts.pl @@ -0,0 +1,117 @@ +# Test suite for testing enabling data checksums in an online cluster with +# restarting the processing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use IPC::Run qw(pump finish timer); + +# If we don't have IO::Pty, forget it, because IPC::Run depends on that +# to support pty connections +eval { require IO::Pty; }; +if ($@) +{ + plan skip_all => 'IO::Pty is needed to run this test'; +} + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'off', 'ensure checksums are disabled'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+my $in = ''; +my $out = ''; +my $timer = timer(5); + +my $h = $node->interactive_psql('postgres', \$in, \$out, $timer); + +$out = ''; +$timer->start(5); + +$in .= "CREATE TEMPORARY TABLE tt (a integer);\n"; +pump $h until ($out =~ /CREATE TABLE/ || $timer->is_expired); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " + . "AND relkind IN ('r', 'i', 'S', 't', 'm');", + '1'); +is($result, 1, 'ensure there is a single table left'); + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'inprogress-on', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_stat_activity WHERE backend_type = 'datachecksumsworker worker';" +); +is($result, 'ChecksumEnableFinishCondition', 'test for correct wait event'); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " + . "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '1', + 'doublecheck that there is a single table left before restarting'); + +$node->stop; +$node->start; + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'inprogress-on', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " + . 
"AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'no temporary tables this time around'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are turned on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +$result = $node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are turned off'); + +done_testing(); diff --git a/src/test/checksum/t/003_standby_checksum.pl b/src/test/checksum/t/003_standby_checksum.pl new file mode 100644 index 00000000000..1555a1694be --- /dev/null +++ b/src/test/checksum/t/003_standby_checksum.pl @@ -0,0 +1,127 @@ +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +# Initialize primary node +my $node_primary = get_new_node('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby_1 = get_new_node('standby_1'); +$node_standby_1->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby_1->start; + +# Create some content on the primary to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + 
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby_1, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +my $result = $node_primary->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, "off", 'ensure checksums are turned off on primary'); + +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, "off", 'ensure checksums are turned off on standby_1'); + +# Enable checksums for the cluster +$node_primary->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Ensure that the primary switches to "inprogress-on" +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + "inprogress-on"); +is($result, 1, 'ensure checksums are in progress on primary'); + +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+$result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +cmp_ok( + $result, '~~', + [ "inprogress-on", "on" ], + 'ensure checksums are on, or in progress, on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1, 10000));"); + +# Wait for checksums enabled on the primary +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the primary'); + +# Wait for checksums enabled on the standby +$result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the standby'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is($result, '20000', 'ensure we can safely read all data with checksums'); + +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +$node_primary->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +# Wait for checksum disable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure data checksums 
are disabled on the primary 2'); +$result = $node_primary->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); + +# Ensure that the standby has switched to off +$result = $node_standby_1->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); +$result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are off on standby_1'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is($result, "20000", 'ensure we can safely read all data without checksums'); + +done_testing(); diff --git a/src/test/checksum/t/004_offline.pl b/src/test/checksum/t/004_offline.pl new file mode 100644 index 00000000000..2dfca4df235 --- /dev/null +++ b/src/test/checksum/t/004_offline.pl @@ -0,0 +1,105 @@ +# Test suite for testing enabling data checksums offline from various states +# of checksum processing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use IPC::Run qw(pump finish timer); + +# If we don't have IO::Pty, forget it, because IPC::Run depends on that +# to support pty connections +eval { require IO::Pty; }; +if ($@) +{ + plan skip_all => 'IO::Pty is needed to run this test'; +} + +# Initialize node with checksums disabled. 
+my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'off', 'ensure checksums are disabled'); + +# Enable checksums offline using pg_checksums +$node->stop(); +$node->checksum_enable_offline(); +$node->start(); + +# Ensure that checksums are enabled +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'on', 'ensure checksums are enabled'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums offline again using pg_checksums +$node->stop(); +$node->checksum_disable_offline(); +$node->start(); + +# Ensure that checksums are disabled +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'off', 'ensure checksums are disabled'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+my $in = ''; +my $out = ''; +my $timer = timer(5); + +my $h = $node->interactive_psql('postgres', \$in, \$out, $timer); + +$out = ''; +$timer->start(5); + +$in .= "CREATE TEMPORARY TABLE tt (a integer);\n"; +pump $h until ($out =~ /CREATE TABLE/ || $timer->is_expired); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'inprogress-on'); +is($result, 1, 'ensure checksums are in the process of being enabled'); + +# Turn the cluster off and enable checksums offline, then start back up +$node->stop(); +$node->checksum_enable_offline(); +$node->start(); + +# Ensure that checksums are now enabled even though processing wasn't +# restarted +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); +is($result, 'on', 'ensure checksums are enabled'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +done_testing(); diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 9667f7667ec..b7431a76005 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -2221,6 +2221,42 @@ sub pg_recvlogical_upto } } +=item $node->checksum_enable_offline() + +Enable data page checksums in an offline cluster with B. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. 
+
+=cut
+
+sub checksum_enable_offline
+{
+	my ($self) = @_;
+
+	# A method call does not interpolate inside a double-quoted string:
+	# "$self->data_dir" would print the stringified object reference
+	# followed by the literal text "->data_dir".  Use the @{[ ... ]}
+	# idiom to interpolate the method's return value instead.
+	print "# Enabling checksums in \"@{[ $self->data_dir ]}\"\n";
+	TestLib::system_or_bail('pg_checksums', '-D', $self->data_dir, '-e');
+	print "# Checksums enabled\n";
+	return;
+}
+
+=item checksum_disable_offline
+
+Disable data page checksums in an offline cluster with B<pg_checksums>. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.
+
+=cut
+
+sub checksum_disable_offline
+{
+	my ($self) = @_;
+
+	# See checksum_enable_offline for why @{[ ... ]} is needed here.
+	print "# Disabling checksums in \"@{[ $self->data_dir ]}\"\n";
+	TestLib::system_or_bail('pg_checksums', '-D', $self->data_dir, '-d');
+	print "# Checksums disabled\n";
+	return;
+}
+
+=pod
+
+=back
-- 
2.29.2