From 03a741bd31efb99578a3a30e55a2c8fca9d95881 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Wed, 25 Nov 2020 14:12:12 +0100 Subject: [PATCH] Support checksum enable/disable in running cluster v24 This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. Further description of the process TBW once the dust settles around this. Daniel Gustafsson, Magnus Hagander --- doc/src/sgml/amcheck.sgml | 2 +- doc/src/sgml/catalogs.sgml | 11 + doc/src/sgml/func.sgml | 71 + doc/src/sgml/monitoring.sgml | 6 +- doc/src/sgml/ref/initdb.sgml | 1 + doc/src/sgml/ref/pg_checksums.sgml | 6 + doc/src/sgml/wal.sgml | 97 ++ src/backend/access/heap/heapam.c | 4 +- src/backend/access/rmgrdesc/xlogdesc.c | 18 + src/backend/access/transam/xlog.c | 381 ++++- src/backend/access/transam/xlogfuncs.c | 47 + src/backend/catalog/heap.c | 3 + src/backend/catalog/system_views.sql | 5 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/bgworker.c | 10 + src/backend/postmaster/datachecksumsworker.c | 1527 ++++++++++++++++++ src/backend/postmaster/pgstat.c | 6 + src/backend/replication/basebackup.c | 9 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 5 + src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/ipc/procsignal.c | 46 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/page/README | 4 +- src/backend/storage/page/bufpage.c | 29 +- src/backend/utils/adt/pgstatfuncs.c | 6 - src/backend/utils/cache/relcache.c | 60 +- src/backend/utils/init/miscinit.c | 6 + src/backend/utils/init/postinit.c | 5 + src/backend/utils/misc/guc.c | 37 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_upgrade/controldata.c | 9 + src/bin/pg_upgrade/pg_upgrade.h | 2 +- src/include/access/xlog.h | 19 +- src/include/access/xlog_internal.h | 7 + src/include/catalog/pg_class.h | 3 + src/include/catalog/pg_control.h | 1 + src/include/catalog/pg_proc.dat | 16 + 
src/include/miscadmin.h | 2 + src/include/pgstat.h | 2 + src/include/postmaster/datachecksumsworker.h | 36 + src/include/storage/bufpage.h | 3 + src/include/storage/checksum.h | 8 + src/include/storage/procsignal.h | 10 +- src/test/Makefile | 2 +- src/test/checksum/.gitignore | 2 + src/test/checksum/Makefile | 23 + src/test/checksum/README | 22 + src/test/checksum/t/001_basic.pl | 89 + src/test/checksum/t/002_restarts.pl | 108 ++ src/test/checksum/t/003_standby_checksum.pl | 116 ++ 51 files changed, 2815 insertions(+), 75 deletions(-) create mode 100644 src/backend/postmaster/datachecksumsworker.c create mode 100644 src/include/postmaster/datachecksumsworker.h create mode 100644 src/test/checksum/.gitignore create mode 100644 src/test/checksum/Makefile create mode 100644 src/test/checksum/README create mode 100644 src/test/checksum/t/001_basic.pl create mode 100644 src/test/checksum/t/002_restarts.pl create mode 100644 src/test/checksum/t/003_standby_checksum.pl diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 99fad708bf..494cd1bd08 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -497,7 +497,7 @@ SET client_min_messages = DEBUG1; Structural corruption can happen due to faulty storage hardware, or relation files being overwritten or modified by unrelated software. This kind of corruption can also be detected with - data page + data page checksums. diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 79069ddfab..9cf87c03f3 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2166,6 +2166,17 @@ SCRAM-SHA-256$<iteration count>:&l + + + relhaschecksums bool + + + True if relation has data checksums on all pages. This state is only + used during checksum processing; this field should never be consulted + for cluster checksum status. 
+ + + relrewrite oid diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index df29af6371..07464a5590 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -25095,6 +25095,77 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + boolean + + + Initiates data checksums for the cluster. This will switch the data + checksums mode to inprogress-on as well as start a + background worker that will process all data in the database and enable + checksums for it. When all data pages have had checksums enabled, the + cluster will automatically switch data checksums mode to + on. Returns true if processing + was started. + + + If cost_delay and cost_limit are + specified, the speed of the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + boolean + + + Disables data checksums for the cluster. This will switch the data + checksum mode to inprogress-off while data checksums + are being disabled. When all active backends have ceased to validate + data checksums, the data checksum mode will be changed to off. + Returns false in case data checksums are disabled + already. + + + + +
+ +
+ Database Object Management Functions diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 52a69a5366..4c770d6611 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3693,8 +3693,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are not - enabled. + database (or on a shared object).
@@ -3704,8 +3703,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are not - enabled. + this database (or on a shared object). diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 385ac25150..e3b0048806 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -219,6 +219,7 @@ PostgreSQL documentation failures will be reported in the pg_stat_database view. + See for details. diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index 1dd4e54ff1..0dd1c509eb 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, every file in the cluster is rewritten in-place. diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index d1c3893b14..a9d8bd631f 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -230,6 +230,103 @@ + + Data Checksums + + checksums + + + + Data pages are not checksum protected by default, but this can optionally be + enabled for a cluster. When enabled, each data page will be assigned a + checksum that is updated when the page is written and verified every time + the page is read. Only data pages are protected by checksums, internal data + structures and temporary files are not. + + + + Checksums are normally enabled when the cluster is initialized using initdb. + They can also be enabled or disabled at a later time, either as an offline + operation or in a running cluster. 
In all cases, checksums are enabled or + disabled at the full cluster level, and cannot be specified individually for + databases or tables. + + + + The current state of checksums in the cluster can be verified by viewing the + value of the read-only configuration variable by issuing the command SHOW + data_checksums. + + + + When attempting to recover from corrupt data it may be necessary to bypass + the checksum protection in order to recover data. To do this, temporarily + set the configuration parameter . + + + + On-line Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + Disabling of checksums takes effect immediately when the function is called. + + + + Enabling checksums will put the cluster checksum mode in + inprogress mode. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum mode will automatically switch to on. The + processing will consume a background worker process, make sure that + max_worker_processes allows for at least one more + additional process. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in the application it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress-on mode, for + any reason, then this process must be restarted manually. 
To do this, + re-execute the function pg_enable_data_checksums() + once the cluster has been restarted. The background worker will attempt + to resume the work from where it was interrupted. + + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. + + + + + + + Off-line Enabling of Checksums + + + The pg_checksums + application can be used to enable or disable data checksums, as well as + verify checksums, on an offline cluster. + + + + + Write-Ahead Logging (<acronym>WAL</acronym>) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1b2f70499e..81ab0785ef 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -7258,7 +7258,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, * and dirtied. * * If checksums are enabled, we also generate a full-page image of - * heap_buffer, if necessary. + * heap_buffer. 
*/ XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, @@ -7279,11 +7279,13 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, XLogRegisterBuffer(0, vm_buffer, 0); flags = REGBUF_STANDARD; + HOLD_INTERRUPTS(); if (!XLogHintBitIsNeeded()) flags |= REGBUF_NO_IMAGE; XLogRegisterBuffer(1, heap_buffer, flags); recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); + RESUME_INTERRUPTS(); return recptr; } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 3200f777f5..4f61107a6a 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -140,6 +141,20 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, timestamptz_to_str(xlrec.end_time)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + appendStringInfo(buf, "on"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + appendStringInfo(buf, "inprogress-off"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + appendStringInfo(buf, "inprogress-on"); + else + appendStringInfo(buf, "off"); + } } const char * @@ -185,6 +200,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 13f1d8c3dc..df0ef05ad9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -38,6 +38,7 @@ #include "access/xlogreader.h" #include "access/xlogutils.h" #include 
"catalog/catversion.h" +#include "catalog/pg_class.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" #include "commands/progress.h" @@ -49,6 +50,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/startup.h" #include "postmaster/walwriter.h" #include "replication/basebackup.h" @@ -252,6 +254,16 @@ static bool LocalPromoteIsTriggered = false; */ static int LocalXLogInsertAllowed = -1; +/* + * Local state for Controlfile data_checksum_version. After initialization, + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating checksum state. Possible values are the + * checksum versions defined in storage/bufpage.h and zero for when checksums + * are disabled. + */ +static uint32 LocalDataChecksumVersion = 0; + /* * When ArchiveRecoveryRequested is set, archive recovery was requested, * ie. signal files were present. When InArchiveRecovery is set, we are @@ -893,6 +905,7 @@ static void SetLatestXTime(TimestampTz xtime); static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); +static void XlogChecksums(ChecksumType new_type); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI); static void LocalSetXLogInsertAllowed(void); @@ -1065,8 +1078,8 @@ XLogInsertRecord(XLogRecData *rdata, * and fast otherwise. * * Also check to see if fullPageWrites or forcePageWrites was just turned - * on; if we weren't already doing full-page writes then go back and - * recompute. + * on, or if we are in the process of enabling checksums in the cluster; + * if we weren't already doing full-page writes then go back and recompute. 
* * If we aren't doing full-page writes then RedoRecPtr doesn't actually * affect the contents of the XLOG record, so we'll update our local copy @@ -1079,7 +1092,7 @@ XLogInsertRecord(XLogRecData *rdata, Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; } - doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites || DataChecksumsOnInProgress()); if (doPageWrites && (!prevDoPageWrites || @@ -4891,9 +4904,7 @@ ReadControlFile(void) CalculateCheckpointSegments(); - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_OVERRIDE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; } /* @@ -4927,13 +4938,299 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? + * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Are checksums enabled, or in the process of being enabled, for data pages? + * In case checksums are being enabled we must write the checksum even though + * it's not verified during this stage. + */ +bool +DataChecksumsNeedWrite(void) +{ + Assert(InterruptHoldoffCount > 0); + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified. + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. 
To avoid the + * risk of the checksum state changing between reading and performing the + * validation (or not), interrupts must be held off. This implies that calling + * this function must be performed as close to the validation call as possible + * to keep the critical section short. This is in order to protect against + * TOCTOU situations around checksum validation. */ bool -DataChecksumsEnabled(void) +DataChecksumsNeedVerify(void) +{ + Assert(InterruptHoldoffCount > 0); + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); +} + +/* + * DataChecksumsOnInProgress + * Returns whether data checksums are being enabled + * + * Most operations don't need to worry about the "inprogress" states, and + * should use DataChecksumsNeedVerify() or DataChecksumsNeedWrite(). The + * "inprogress" state for enabling checksums is used when the checksum worker + * is setting checksums on all pages; it can thus be used to check for aborted + * checksum processing which needs to be restarted. + */ +inline bool +DataChecksumsOnInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +/* + * DataChecksumsOffInProgress + * Returns whether data checksums are being disabled + * + * The "inprogress" state for disabling checksums is used for when the worker + * resets the catalog state. Operations should use DataChecksumsNeedVerify() + * or DataChecksumsNeedWrite() for deciding whether to read/write checksums. 
+ */ +bool +DataChecksumsOffInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +void +SetDataChecksumsOnInProgress(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->data_checksum_version != 0) + { + LWLockRelease(ControlFileLock); + return; + } + LWLockRelease(ControlFileLock); + + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XlogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + WaitForProcSignalBarrier(barrier); +} + +void +AbsorbChecksumsOnInProgressBarrier(void) +{ + Assert(LocalDataChecksumVersion == 0 || LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; +} + +/* + * SetDataChecksumsOn + * Enables data checksums cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one + * sets the checksums state to "inprogress-on" and the second one to "on". + * During "inprogress-on", checksums are written but not verified. When all + * existing pages are guaranteed to have checksums, and all new pages will be + * initiated with checksums, the state can be changed to "on". 
+ */ +void +SetDataChecksumsOn(void) { + uint64 barrier; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + LWLockRelease(ControlFileLock); + elog(ERROR, "checksums not in \"inprogress-on\" mode"); + } + + LWLockRelease(ControlFileLock); + + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XlogChecksums(PG_DATA_CHECKSUM_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + WaitForProcSignalBarrier(barrier); +} + +void +AbsorbChecksumsOnBarrier(void) +{ + Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_VERSION; +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be + * emitted. 
+ */ +void +SetDataChecksumsOff(void) +{ + uint64 barrier; + + Assert(ControlFile); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* If data checksums are already disabled there is nothing to do */ + if (ControlFile->data_checksum_version == 0) + { + LWLockRelease(ControlFileLock); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * inprogress-off state during which backends continue to write checksums + * without verifying them. When all backends are in "inprogress-off" the + * next transition to "off" can be performed, after which all data checksum + * processing is disabled. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + LWLockRelease(ControlFileLock); + + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XlogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. Next, we can safely move to state "off" to also + * stop writing checksums. + */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * and we can transition directly to "off" from there. + */ + LWLockRelease(ControlFileLock); + } + + /* + * Ensure that we don't incur a checkpoint during disabling checksums. 
+ */ + MyProc->delayChkpt = true; + START_CRIT_SECTION(); + + XlogChecksums(0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = 0; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkpt = false; + + WaitForProcSignalBarrier(barrier); +} + +/* + * Barrier absorption functions for disabling data checksums + */ +void +AbsorbChecksumsOffInProgressBarrier(void) +{ + LocalDataChecksumVersion = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; +} + +void +AbsorbChecksumsOffBarrier(void) +{ + LocalDataChecksumVersion = 0; +} + +void +InitLocalControldata(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; + LWLockRelease(ControlFileLock); +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + return "inprogress-on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + return "inprogress-off"; + else + return "off"; } /* @@ -7917,6 +8214,32 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* + * If we reach this point with checksums in progress state (either being + * enabled or being disabled), we notify the user that they need to + * manually restart the process to enable checksums. This is because we + * cannot launch a dynamic background worker directly from here, it has to + * be launched from a regular backend. 
+ */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + ereport(WARNING, + (errmsg("data checksums are being enabled, but no worker is running"), + errhint("Either disable or enable data checksums by calling the pg_disable_data_checksums() or pg_enable_data_checksums() functions."))); + + /* + * If data checksums were being disabled when the cluster was shutdown, we + * know that we have a state where all backends have stopped validating + * checksums and we can move to off instead. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + { + XlogChecksums(0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = 0; + LWLockRelease(ControlFileLock); + } + /* * All done with end-of-recovery actions. * @@ -9768,6 +10091,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XlogChecksums(ChecksumType new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. 
@@ -10223,6 +10564,28 @@ xlog_redo(XLogReaderState *record) /* Keep track of full_page_writes */ lastFullPageWrites = fpw; } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = state.new_checksumtype; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + if (state.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)); + else if (state.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF)); + else if (state.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON)); + else + { + Assert(state.new_checksumtype == 0); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF)); + } + } } #ifdef WAL_DEBUG diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 290658b22c..ab531484a7 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -25,6 +25,7 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "postmaster/datachecksumsworker.h" #include "pgstat.h" #include "replication/walreceiver.h" #include "storage/fd.h" @@ -784,3 +785,49 @@ pg_promote(PG_FUNCTION_ARGS) (errmsg("server did not promote within %d seconds", wait_seconds))); PG_RETURN_BOOL(false); } + +/* + * Disables checksums for the cluster, unless already disabled. + * + * Has immediate effect - the checksums are set to off right away. 
+ */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errmsg("must be superuser"))); + + StartDatachecksumsWorkerLauncher(false, 0, 0); + + PG_RETURN_BOOL(true); +} + +/* + * Enables checksums for the cluster, unless already enabled. + * + * Supports vacuum-like cost-based throttling, to limit system load. + * Starts a background worker that updates checksums on existing data. + */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + + if (!superuser()) + ereport(ERROR, + (errmsg("must be superuser"))); + + if (cost_delay < 0) + ereport(ERROR, + (errmsg("cost delay cannot be less than zero"))); + + if (cost_limit <= 0) + ereport(ERROR, + (errmsg("cost limit must be a positive value"))); + + StartDatachecksumsWorkerLauncher(true, cost_delay, cost_limit); + + PG_RETURN_BOOL(true); +} diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 4cd7d76938..ea642fa0ff 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -965,10 +965,13 @@ InsertPgClassTuple(Relation pg_class_desc, /* relpartbound is set by updating this tuple, if necessary */ nulls[Anum_pg_class_relpartbound - 1] = true; + HOLD_INTERRUPTS(); + values[Anum_pg_class_relhaschecksums - 1] = BoolGetDatum(DataChecksumsNeedWrite()); tup = heap_form_tuple(RelationGetDescr(pg_class_desc), values, nulls); /* finally insert the new tuple, update the indexes, and clean up */ CatalogTupleInsert(pg_class_desc, tup); + RESUME_INTERRUPTS(); heap_freetuple(tup); } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index b140c210bc..9ac784af9a 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1246,6 +1246,11 @@ CREATE OR REPLACE FUNCTION RETURNS boolean STRICT VOLATILE LANGUAGE INTERNAL AS 'pg_promote' PARALLEL SAFE; +CREATE OR REPLACE FUNCTION pg_enable_data_checksums ( + 
cost_delay int DEFAULT 0, cost_limit int DEFAULT 100) + RETURNS boolean STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums' + PARALLEL RESTRICTED; + -- legacy definition for compatibility with 9.3 CREATE OR REPLACE FUNCTION json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833d..59b82ee9ce 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -17,6 +17,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksumsworker.o \ fork_process.o \ interrupt.o \ pgarch.o \ diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 5a9a0e3435..aeb6d8c642 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" @@ -128,6 +129,15 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "DatachecksumsWorkerLauncherMain", DatachecksumsWorkerLauncherMain + }, + { + "DatachecksumsWorkerMain", DatachecksumsWorkerMain + }, + { + "ResetDataChecksumsStateInDatabase", ResetDataChecksumsStateInDatabase } }; diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c new file mode 100644 index 0000000000..5d94db95f9 --- /dev/null +++ b/src/backend/postmaster/datachecksumsworker.c @@ -0,0 +1,1527 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.c + * Background worker for enabling or disabling data checksums online + * + * When enabling data checksums on a database at initdb time or with + * pg_checksums, no extra process is required as each page is checksummed, and + * 
verified, when accessed. When enabling checksums on an already running + * cluster, which does not run with checksums enabled, this worker will ensure + * that all pages are checksummed before verification of the checksums is + * turned on. In the case of disabling checksums, the state transition is + * recorded in the catalog and control file, and no changes are performed + * on the data pages or in the catalog. + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * Enabling checksums + * ------------------ + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during checksumming will + * have checksums set, but no reads will fail due to incorrect checksum. The + * DataChecksumsWorker will compile a list of databases which exist at the + * start of checksumming, and all of these which haven't been dropped during + * the processing MUST have been processed successfully in order for checksums + * to be enabled. Any new relation created during processing will see the + * in-progress state and will automatically be checksummed as well as have its + * state recorded in the catalog to avoid the datachecksumsworker having to + * process it when already checksummed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. Once all datapages + * in a relation have been written, pg_class.relhaschecksums is set to true to + * indicate that the relation is done. 
+ * + * If the processing is interrupted by a cluster restart, it will be restarted + * from where it left off given that pg_class.relhaschecksums track state of + * processed relations and the in-progress state will ensure all new writes + * performed with checksums. Each database will be reprocessed, but relations + * where pg_class.relhaschecksums is true are skipped. + * + * If data checksums are enabled, then disabled, and then re-enabled, every + * relation's pg_class.relhaschecksums field will be reset to false before + * entering the in-progress mode. + * + * + * Disabling checksums + * ------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer verified. This ensure + * that backends which have yet to move from the "on" state will still be able + * to process data checksum validation. During "inprogress-off", the catalog + * state pg_class.relhaschecksums is cleared for all relations. + * + * + * Synchronization and Correctness + * ------------------------------- + * The processes involved in enabling, or disabling, data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate local datachecksum state + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have the local state "enabled" + * + * There are two levels of synchronization required for enabling data checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. 
+ * The former deals with cluster-wide agreement on data checksum state and the
+ * latter with ensuring that any concurrent activity cannot break the data
+ * checksum contract during processing.
+ *
+ * Synchronizing the state change is done with procsignal barriers, where the
+ * backend updating the global state in the controlfile will wait for all other
+ * backends to absorb the barrier before WAL logging. Barrier absorption will
+ * happen during interrupt processing, which means that connected backends will
+ * change state at different times.
+ *
+ * When Enabling Data Checksums
+ * ----------------------------
+ * A process which fails to observe data checksums being enabled can induce
+ * two types of errors: failing to write the checksum when modifying the page
+ * and failing to validate the data checksum on the page when reading it.
+ *
+ * When the DataChecksumsWorker has finished writing checksums on all pages
+ * and enables data checksums cluster-wide, there are three sets of backends:
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * Backends transition from the Bd state to Be like so: Bd -> Bi -> Be
+ *
+ * Backends in Bi and Be will write checksums when modifying a page, but only
+ * backends in Be will verify the checksum during reading. The Bg backend is
+ * blocked waiting for all backends in Bi to process interrupts and move to
+ * Be. Any backend starting will observe the global state being "on" and will
+ * thus automatically belong to Be. Checksums are enabled cluster-wide when
+ * Bi is an empty set. All sets are compatible while still operating based on
+ * their local state.
+ *
+ * When Disabling Data Checksums
+ * -----------------------------
+ * A process which fails to observe data checksums being disabled can induce
+ * two types of errors: writing the checksum when modifying the page and
+ * validating a data checksum which is no longer correct due to modifications
+ * to the page.
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bi: Backends in "inprogress-off" state
+ *
+ * Backends transition from the Be state to Bd like so: Be -> Bi -> Bd
+ *
+ * The goal is to transition all backends to Bd making the others empty sets.
+ * Backends in Bi write data checksums, but don't validate them, such that
+ * backends still in Be can continue to validate pages until the barrier has
+ * been absorbed such that they are in Bi. Once all backends are in Bi, the
+ * barrier to transition to "off" can be raised and all backends can safely
+ * stop writing data checksums as no backend is enforcing data checksum
+ * validation.
+ * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksumsworker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/lmgr.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" + +#define DATACHECKSUMSWORKER_MAX_DB_RETRIES 5 + +#define MAX_OPS 4 + +typedef enum DataChecksumOperation +{ + ENABLE_CHECKSUMS = 1, + DISABLE_CHECKSUMS, + RESET_STATE, + SET_INPROGRESS_ON, + SET_CHECKSUMS_ON +} DataChecksumOperation; + +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_RETRYDB, +} DatachecksumsWorkerResult; + +typedef struct DatachecksumsWorkerShmemStruct +{ + /* + * Access to launcher_started and abort must be protected by + * DatachecksumsWorkerLock. + */ + bool launcher_started; + bool abort; + + /* + * Variables for the worker to signal the launcher, or subsequent workers + * in other databases. As there is only a single worker, and the launcher + * won't read these until the worker exits, they can be accessed without + * the need for a lock. 
If multiple workers are supported then this will + * have to be revisited. + */ + DatachecksumsWorkerResult success; + bool process_shared_catalogs; + + /* + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + int cost_delay; + int cost_limit; + int operations[MAX_OPS]; + bool target; +} DatachecksumsWorkerShmemStruct; + +/* Shared memory segment for datachecksumsworker */ +static DatachecksumsWorkerShmemStruct * DatachecksumsWorkerShmem; + +/* Bookkeeping for work to do */ +typedef struct DatachecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DatachecksumsWorkerDatabase; + +typedef struct DatachecksumsWorkerResultEntry +{ + Oid dboid; + DatachecksumsWorkerResult result; + int retries; +} DatachecksumsWorkerResultEntry; + + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static DatachecksumsWorkerResult ProcessDatabase(DatachecksumsWorkerDatabase *db, const char *bgw_func_name); +static bool ProcessAllDatabases(bool *already_connected, const char *bgw_func_name); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void SetRelHasChecksums(Oid relOid); +static void WaitForAllTransactionsToFinish(void); + +/* + * DataChecksumsWorkerStarted + * Informational function to query the state of the worker + */ +bool +DataChecksumsWorkerStarted(void) +{ + bool started; + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + started = DatachecksumsWorkerShmem->launcher_started && !DatachecksumsWorkerShmem->abort; + LWLockRelease(DatachecksumsWorkerLock); + + return started; +} + + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point 
for datachecksumsworker launcher process + * + * The main entrypoint for starting data checksums processing for enabling as + * well as disabling. + */ +void +StartDatachecksumsWorkerLauncher(bool enable_checksums, int cost_delay, int cost_limit) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + + /* + * Given that any backend can initiate a data checksum operation, the + * launcher can at this point be in one of the below distinct states: + * + * A: Started and performing an operation; B: Started and in the + * process of aborting; C: Not started + * + * If the launcher is in state A, and the requested target state is equal + * to the currently performed operation then we can return immediately. + * This can happen if two users enable checksums simultaneously. If the + * requested target is to disable checksums while they are being enabled, + * we must abort the current processing. This can happen if a user + * enables data checksums and then, before checksumming is done, disables + * data checksums again. + * + * If the launcher is in state B, we need to wait for processing to end + * and the abort flag be cleared before we can restart with the requested + * operation. Here we will exit immediately and leave it to the user to + * restart processing at a later time. + * + * If the launcher is in state C we can start performing the requested + * operation immediately. + */ + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* + * If the launcher is already started, the only operation we can perform + * is to cancel it iff the user requested for checksums to be disabled. + * That doesn't however mean that all other cases yield an error, as some + * might be perfectly benevolent. 
+ */ + if (DatachecksumsWorkerShmem->launcher_started) + { + if (DatachecksumsWorkerShmem->abort) + { + ereport(NOTICE, + (errmsg("data checksum processing is concurrently being aborted, please retry"))); + + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + /* + * If the launcher is started data checksums cannot be on or off, but + * it may be in an inprogress state. Since the state transition may + * not have happened yet (in case of rapidly initiated checksum enable + * calls for example) we inspect the target state of the currently + * running launcher. + */ + + if (enable_checksums) + { + /* + * If we are asked to enable checksums when they are already being + * enabled, there is nothing to do so exit. + */ + if (DatachecksumsWorkerShmem->target) + { + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + /* + * Disabling checksums is likely to be a very quick operation in + * many cases so trying to abort it to save the checksums would + * run the risk of race conditions. + */ + else + { + ereport(NOTICE, + (errmsg("data checksums are concurrently being disabled, please retry"))); + + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + /* This should be unreachable */ + Assert(false); + } + else if (!enable_checksums) + { + /* + * Data checksums are already being disabled, exit silently. + */ + if (DataChecksumsOffInProgress()) + { + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + DatachecksumsWorkerShmem->abort = true; + LWLockRelease(DatachecksumsWorkerLock); + return; + } + } + + /* + * The launcher is currently not running, so we need to query the system + * data checksum state to determine how to proceed based on the requested + * target state. 
+ */ + else + { + memset(DatachecksumsWorkerShmem->operations, 0, sizeof(DatachecksumsWorkerShmem->operations)); + DatachecksumsWorkerShmem->target = enable_checksums; + + /* + * If the launcher isn't started and we're asked to enable checksums, + * we need to check if processing was previously interrupted such that + * we should resume rather than start from scratch. + */ + if (enable_checksums) + { + /* + * If we are asked to enable checksums in a cluster which already + * has checksums enabled, exit immediately as there is nothing + * more to do. + */ + if (DataChecksumsNeedVerify()) + { + LWLockRelease(DatachecksumsWorkerLock); + return; + } + + /* + * If the controlfile state is set to "inprogress-on" then we will + * resume from where we left off based on the catalog state. This + * will be safe since new relations created while the checksum- + * worker was disabled will have checksums enabled. + */ + else if (DataChecksumsOnInProgress()) + { + DatachecksumsWorkerShmem->operations[0] = ENABLE_CHECKSUMS; + DatachecksumsWorkerShmem->operations[1] = SET_CHECKSUMS_ON; + } + + /* + * If the controlfile state is set to "inprogress-off" then we + * were interrupted while the catalog state was being cleared. In + * this case we need to first reset state and then continue with + * enabling checksums. + */ + else if (DataChecksumsOffInProgress()) + { + DatachecksumsWorkerShmem->operations[0] = RESET_STATE; + DatachecksumsWorkerShmem->operations[1] = SET_INPROGRESS_ON; + DatachecksumsWorkerShmem->operations[2] = ENABLE_CHECKSUMS; + DatachecksumsWorkerShmem->operations[3] = SET_CHECKSUMS_ON; + } + + /* + * Data checksums are off in the cluster, we can proceed with + * enabling them. Just in case we will start by resetting the + * catalog state since we are doing this from scratch and we don't + * want leftover catalog state to cause us to miss a relation. 
+ */ + else + { + DatachecksumsWorkerShmem->operations[0] = RESET_STATE; + DatachecksumsWorkerShmem->operations[1] = SET_INPROGRESS_ON; + DatachecksumsWorkerShmem->operations[2] = ENABLE_CHECKSUMS; + DatachecksumsWorkerShmem->operations[3] = SET_CHECKSUMS_ON; + } + } + else if (!enable_checksums) + { + /* + * Regardless of current state in the system, we go through the + * motions when asked to disable checksums. The catalog state is + * only defined to be relevant during the operation of enabling + * checksums, and have no use at any other point in time. That + * being said, a user who sees stale relhaschecksums entries in the + * catalog might run this just in case. + * + * Resetting state must be performed after setting data checksum + * state to off, as there otherwise might (depending on system data + * checksum state) be a window between catalog resetting and state + * transition when new relations are created with the catalog state + * set to true. + */ + DatachecksumsWorkerShmem->operations[0] = DISABLE_CHECKSUMS; + DatachecksumsWorkerShmem->operations[1] = RESET_STATE; + } + } + + /* + * Backoff parameters to throttle the load during enabling. As there is + * no real processing performed during disabling checksums the backoff + * parameters do not apply there. + */ + if (enable_checksums) + { + DatachecksumsWorkerShmem->cost_delay = cost_delay; + DatachecksumsWorkerShmem->cost_limit = cost_limit; + } + else + { + DatachecksumsWorkerShmem->cost_delay = 0; + DatachecksumsWorkerShmem->cost_limit = 0; + } + + /* + * Prepare the BackgroundWorker and launch it. 
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DatachecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + DatachecksumsWorkerShmem->launcher_started = true; + LWLockRelease(DatachecksumsWorkerLock); + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("failed to start background worker to process data checksums"))); + } +} + +/* + * ShutdownDatachecksumsWorkerIfRunning + * Request shutdown of the datachecksumsworker + * + * This does not turn off processing immediately, it signals the checksum + * process to end when done with the current block. + */ +void +ShutdownDatachecksumsWorkerIfRunning(void) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* If the launcher isn't started, there is nothing to shut down */ + if (DatachecksumsWorkerShmem->launcher_started) + DatachecksumsWorkerShmem->abort = true; + + LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. 
+ */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + BlockNumber blknum; + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + if (!relns) + return false; + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress-on". + */ + for (blknum = 0; blknum < numblocks; blknum++) + { + Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy); + + /* + * Report to pgstat every 100 blocks to keep from overwhelming the + * activity reporting with close to identical reports. + */ + if ((blknum % 100) == 0) + { + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s block %d/%d)", + relns, RelationGetRelationName(reln), + forkNames[forkNum], blknum, numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + } + + /* Need to get an exclusive lock before we can flag as dirty */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Mark the buffer as dirty and force a full page write. We have to + * re-write the page to WAL even if the checksum hasn't changed, + * because if there is a replica it might have a slightly different + * version of the page with an invalid checksum, caused by unlogged + * changes (e.g. hintbits) on the master happening while checksums + * were off. This can happen if there was a valid checksum on the page + * at one point in the past, so only when checksums are first on, then + * off, and then turned on again. Iff wal_level is set to "minimal", + * this could be avoided iff the checksum is calculated to be correct. 
+ */ + START_CRIT_SECTION(); + MarkBufferDirty(buf); + log_newpage_buffer(buf, false); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * This is the only place where we check if we are asked to abort, the + * abortion will bubble up from here. It's safe to check this without + * a lock, because if we miss it being set, we will try again soon. + */ + if (DatachecksumsWorkerShmem->abort) + return false; + + vacuum_delay_point(); + } + + pfree(relns); + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + ForkNumber fnum; + bool aborted = false; + + StartTransactionCommand(); + + elog(DEBUG2, + "adding data checksums to relation with OID %u", + relationId); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationOpenSmgr(rel); + + for (fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + elog(DEBUG2, + "data checksum processing done for relation with OID %u: %s", + relationId, (aborted ? 
"aborted" : "finished")); + + if (!aborted) + SetRelHasChecksums(relationId); + + CommitTransactionCommand(); + + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * SetRelHasChecksums + * + * Sets the pg_class.relhaschecksums flag for the relation specified by relOid + * to true. The corresponding function for clearing state is + * ResetDataChecksumsStateInDatabase which operate on all relations in a + * database. + */ +static void +SetRelHasChecksums(Oid relOid) +{ + Relation rel; + Relation heaprel; + Form_pg_class pg_class_tuple; + HeapTuple tuple; + + /* + * If the relation has gone away since we checksummed it then that's not + * an errorcase. Exit early and continue on the next relation instead. + */ + heaprel = try_relation_open(relOid, ShareUpdateExclusiveLock); + if (!heaprel) + return; + rel = table_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relOid); + + pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); + pg_class_tuple->relhaschecksums = true; + + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + + ReleaseSysCache(tuple); + + table_close(rel, RowExclusiveLock); + relation_close(heaprel, ShareUpdateExclusiveLock); +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. 
+ */ +static DatachecksumsWorkerResult +ProcessDatabase(DatachecksumsWorkerDatabase * db, const char *bgw_func_name) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", bgw_func_name); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, make sure we retry processing + * this database. This will make the datachecksumsworker move on to the + * next database and quite likely fail with the same problem. TODO: Maybe + * we need a backoff to avoid running through all the databases here in + * short order. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + (errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying", + db->dbname), + errhint("The max_worker_processes setting might be too low."))); + return DATACHECKSUMSWORKER_RETRYDB; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + (errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log."))); + return DATACHECKSUMSWORKER_FAILED; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. 
When enabling checksums we + * won't at this time have changed the pg_control version to enabled so + * when the cluster comes back up processing will have to be resumed. When + * disabling, the pg_control version will be set to off before this so + * when the cluster comes up checksums will be off as expected. In the + * latter case we might have stale relhaschecksums flags in pg_class which + * need to be handled in some way. TODO + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + Assert(status == BGWH_STARTED); + ereport(DEBUG1, + (errmsg("initiating data checksum processing in database \"%s\"", + db->dbname))); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %d)", db->dbname, pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + if (DatachecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + (errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname))); + + pgstat_report_activity(STATE_IDLE, NULL); + + return DatachecksumsWorkerShmem->success; +} + +static void +launcher_exit(int code, Datum arg) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = false; + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); +} + +static void +launcher_cancel_handler(SIGNAL_ARGS) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = true; + 
LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * WaitForAllTransactionsToFinish + * Blocks awaiting all current transactions to finish + * + * Returns when all transactions which are active at the call of the function + * have ended, or if the postmaster dies while waiting. If the postmaster dies + * the abort flag will be set to indicate that the caller of this shouldn't + * proceed. + */ +static void +WaitForAllTransactionsToFinish(void) +{ + TransactionId waitforxid; + bool aborted = false; + + LWLockAcquire(XidGenLock, LW_SHARED); + waitforxid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + while (!aborted) + { + TransactionId oldestxid = GetOldestActiveTransactionId(); + + if (TransactionIdPrecedes(oldestxid, waitforxid)) + { + char activity[64]; + int rc; + + /* Oldest running xid is older than us, so wait */ + snprintf(activity, + sizeof(activity), + "Waiting for current transactions to finish (waiting for %u)", + waitforxid); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* + * If the postmaster died we won't be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. + */ + if (rc & WL_POSTMASTER_DEATH) + DatachecksumsWorkerShmem->abort = true; + aborted = DatachecksumsWorkerShmem->abort; + + LWLockRelease(DatachecksumsWorkerLock); + } + else + { + pgstat_report_activity(STATE_IDLE, NULL); + return; + } + } +} + +/* + * DatachecksumsWorkerLauncherMain + * + * Main function for launching dynamic background workers for processing data + * checksums in databases. This function has the bgworker management, with + * ProcessAllDatabases being responsible for looping over the databases and + * initiating processing. 
+ */ +void +DatachecksumsWorkerLauncherMain(Datum arg) +{ + bool connected = false; + bool status = false; + DataChecksumOperation current; + + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + (errmsg("background worker \"datachecksumsworker\" launcher started"))); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + + BackgroundWorkerUnblockSignals(); + + InitXLOGAccess(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + for (int i = 0; i < MAX_OPS; i++) + { + current = DatachecksumsWorkerShmem->operations[i]; + + if (!current) + break; + + switch (current) + { + case DISABLE_CHECKSUMS: + SetDataChecksumsOff(); + break; + + case SET_INPROGRESS_ON: + SetDataChecksumsOnInProgress(); + break; + + case SET_CHECKSUMS_ON: + SetDataChecksumsOn(); + break; + + case RESET_STATE: + status = ProcessAllDatabases(&connected, "ResetDataChecksumsStateInDatabase"); + if (!status) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + DatachecksumsWorkerShmem->abort = false; + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("unable to reset catalog checksum state"))); + } + break; + + case ENABLE_CHECKSUMS: + status = ProcessAllDatabases(&connected, "DatachecksumsWorkerMain"); + if (!status) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + DatachecksumsWorkerShmem->abort = false; + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("unable to enable checksums in cluster"))); + } + break; + + default: + elog(ERROR, "unknown checksum operation requested"); + break; + } + } + + /* + * Clean up after processing + */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + DatachecksumsWorkerShmem->abort = false; + LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * ProcessAllDatabases + * Compute the list of 
all databases and process checksums in each + * + * This will repeatedly generate a list of databases to process for either + * enabling checksums or resetting the checksum catalog tracking. Until no + * new databases are found, this will loop around computing a new list and + * comparing it to the already seen ones. + */ +static bool +ProcessAllDatabases(bool *already_connected, const char *bgw_func_name) +{ + List *DatabaseList; + HTAB *ProcessedDatabases = NULL; + ListCell *lc; + HASHCTL hash_ctl; + bool found_failed = false; + + /* Initialize a hash tracking all processed databases */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(DatachecksumsWorkerResultEntry); + ProcessedDatabases = hash_create("Processed databases", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize a connection to shared catalogs only. + */ + if (!*already_connected) + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + *already_connected = true; + + /* + * Set up so first run processes shared catalogs, but not once in every + * db. + */ + DatachecksumsWorkerShmem->process_shared_catalogs = true; + + while (true) + { + int processed_databases = 0; + + /* + * Get a list of all databases to process. This may include databases + * that were created during our runtime. + * + * Since a database can be created as a copy of any other database + * (which may not have existed in our last run), we have to repeat + * this loop until no new databases show up in the list. Since we wait + * for all pre-existing transactions finish, this way we can be + * certain that there are no databases left without checksums. 
+ */ + DatabaseList = BuildDatabaseList(); + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult result; + DatachecksumsWorkerResultEntry *entry; + bool found; + + elog(DEBUG1, + "starting processing of database %s with oid %u", + db->dbname, db->dboid); + + entry = (DatachecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid, + HASH_FIND, NULL); + + if (entry) + { + if (entry->result == DATACHECKSUMSWORKER_RETRYDB) + { + /* + * Limit the number of retries to avoid infinite looping + * in case there simply wont be enough workers in the + * cluster to finish this operation. + */ + if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES) + entry->result = DATACHECKSUMSWORKER_FAILED; + } + + /* Skip if this database has been processed already */ + if (entry->result != DATACHECKSUMSWORKER_RETRYDB) + { + pfree(db->dbname); + pfree(db); + continue; + } + } + + result = ProcessDatabase(db, bgw_func_name); + processed_databases++; + + if (result == DATACHECKSUMSWORKER_SUCCESSFUL) + { + /* + * If one database has completed shared catalogs, we don't + * have to process them again. + */ + if (DatachecksumsWorkerShmem->process_shared_catalogs) + DatachecksumsWorkerShmem->process_shared_catalogs = false; + } + else if (result == DATACHECKSUMSWORKER_ABORTED) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found); + entry->dboid = db->dboid; + entry->result = result; + if (!found) + entry->retries = 0; + else + entry->retries++; + + pfree(db->dbname); + pfree(db); + } + + elog(DEBUG1, + "%i databases processed for data checksum enabling, %s", + processed_databases, + (processed_databases ? 
"process with restart" : "process completed")); + + list_free(DatabaseList); + + /* + * If no databases were processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can exist. + */ + if (processed_databases == 0) + break; + } + + /* + * ProcessedDatabases now has all databases and the results of their + * processing. Failure to enable checksums for a database can be because + * they actually failed for some reason, or because the database was + * dropped between us getting the database list and trying to process it. + * Get a fresh list of databases to detect the second case where the + * database was dropped before we had started processing it. If a database + * still exists, but enabling checksums failed then we fail the entire + * checksumming process and exit with an error. + */ + DatabaseList = BuildDatabaseList(); + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult *entry; + bool found; + + entry = hash_search(ProcessedDatabases, (void *) &db->dboid, + HASH_FIND, &found); + + /* + * We are only interested in the databases where the failed database + * still exists. + */ + if (found && *entry == DATACHECKSUMSWORKER_FAILED) + { + ereport(WARNING, + (errmsg("failed to enable data checksums in \"%s\"", + db->dbname))); + found_failed = found; + continue; + } + } + + if (found_failed) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = false; + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); + + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(); + ereport(ERROR, + (errmsg("checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the error."))); + } + + /* + * Force a checkpoint to get everything out to disk. 
TODO: we probably + * don't want to use a CHECKPOINT_IMMEDIATE here but it's very convenient + * for testing until the patch is fully baked, as it may otherwise make + * tests take a lot longer. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_IMMEDIATE); + + return true; +} + +/* + * DatachecksumsWorkerShmemSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DatachecksumsWorkerShmemSize(void) +{ + Size size; + + size = sizeof(DatachecksumsWorkerShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DatachecksumsWorkerShmemInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DatachecksumsWorkerShmemInit(void) +{ + bool found; + + DatachecksumsWorkerShmem = (DatachecksumsWorkerShmemStruct *) + ShmemInitStruct("DatachecksumsWorker Data", + DatachecksumsWorkerShmemSize(), + &found); + + MemSet(DatachecksumsWorkerShmem, 0, DatachecksumsWorkerShmemSize()); + + /* + * Even if this is a redundant assignment, we want to be explicit about + * our intent for readability, since we want to be able to query this + * state in case of restartability. + */ + DatachecksumsWorkerShmem->launcher_started = false; +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. + */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + + /* + * Before we do this, wait for all pending transactions to finish. This + * will ensure there are no concurrently running CREATE DATABASE, which + * could cause us to miss the creation of a database that was copied + * without checksums. 
+ */ + WaitForAllTransactionsToFinish(); + + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DatachecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DatachecksumsWorkerDatabase *) palloc(sizeof(DatachecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +/* + * BuildRelationList + * Compile a list of relations in the database + * + * Returns a list of OIDs for the request relation types. If temp_relations + * is True then only temporary relations are returned. If temp_relations is + * False then non-temporary relations which have data checksums are returned. + * If include_shared is True then shared relations are included as well in a + * non-temporary list. include_shared has no relevance when building a list of + * temporary relations. + */ +static List * +BuildRelationList(bool temp_relations, bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + /* + * Only include temporary relations when asked for a temp relation + * list. 
+ */ + if (pgc->relpersistence == RELPERSISTENCE_TEMP) + { + if (!temp_relations) + continue; + } + else + { + if (!RELKIND_HAS_STORAGE(pgc->relkind)) + continue; + + if (pgc->relhaschecksums) + continue; + + if (pgc->relisshared && !include_shared) + continue; + } + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * ResetDataChecksumsStateInDatabase + * Main worker function for clearing checksums state in the catalog + * + * Resets the pg_class.relhaschecksums flag to false for all entries in the + * current database. This is required to be performed before adding checksums + * to a running cluster in order to track the state of the processing. + */ +void +ResetDataChecksumsStateInDatabase(Datum arg) +{ + Relation rel; + HeapTuple tuple; + Oid dboid = DatumGetObjectId(arg); + TableScanDesc scan; + Form_pg_class pgc; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("resetting catalog state for data checksums in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, BGWORKER_BYPASS_ALLOWCONN); + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, RowExclusiveLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, ForwardScanDirection))) + { + tuple = heap_copytuple(tuple); + pgc = (Form_pg_class) GETSTRUCT(tuple); + + if (pgc->relhaschecksums) + { + pgc->relhaschecksums = false; + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + } + + heap_freetuple(tuple); + } + + table_endscan(scan); + table_close(rel, RowExclusiveLock); + + CommitTransactionCommand(); + + DatachecksumsWorkerShmem->success = 
DATACHECKSUMSWORKER_SUCCESSFUL; +} + +/* + * DatachecksumsWorkerMain + * + * Main function for enabling checksums in a single database, This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums. + */ +void +DatachecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + ListCell *lc; + BufferAccessStrategy strategy; + bool aborted = false; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("starting data checksum processing in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildRelationList(true, false); + + /* + * Enable vacuum cost delay, if any. + */ + VacuumCostDelay = DatachecksumsWorkerShmem->cost_delay; + VacuumCostLimit = DatachecksumsWorkerShmem->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumPageHit = 0; + VacuumPageMiss = 0; + VacuumPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. 
+ */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(false, + DatachecksumsWorkerShmem->process_shared_catalogs); + foreach(lc, RelationList) + { + Oid reloid = lfirst_oid(lc); + + if (!ProcessSingleRelationByOid(reloid, strategy)) + { + aborted = true; + break; + } + } + list_free(RelationList); + + if (aborted) + { + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + SetDataChecksumsOff(); + ereport(DEBUG1, + (errmsg("data checksum processing aborted in database OID %u", + dboid))); + return; + } + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress-on" state), so no need to wait for those. + */ + while (!aborted) + { + List *CurrentTempTables; + ListCell *lc; + int numleft; + char activity[64]; + int rc; + + CurrentTempTables = BuildRelationList(true, false); + numleft = 0; + foreach(lc, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, lfirst_oid(lc))) + numleft++; + } + list_free(CurrentTempTables); + + if (numleft == 0) + break; + + /* At least one temp table is left to wait for */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* + * If the postmaster died we won't be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. 
+ */ + if (rc & WL_POSTMASTER_DEATH) + DatachecksumsWorkerShmem->abort = true; + aborted = DatachecksumsWorkerShmem->abort; + + LWLockRelease(DatachecksumsWorkerLock); + } + + list_free(InitialTempTableList); + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; + ereport(DEBUG1, + (errmsg("data checksum processing completed in database with OID %u", + dboid))); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 9bad14981b..60b1f6de60 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3932,6 +3932,12 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_CHECKPOINT_START: event_name = "CheckpointStart"; break; + case WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION: + event_name = "ChecksumEnableStartCondition"; + break; + case WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION: + event_name = "ChecksumEnableFinishCondition"; + break; case WAIT_EVENT_EXECUTE_GATHER: event_name = "ExecuteGather"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 22be7ca9d5..80fb7aeef9 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1606,7 +1606,7 @@ sendFile(const char *readfilename, const char *tarfilename, _tarWriteHeader(tarfilename, NULL, statbuf, false); - if (!noverify_checksums && DataChecksumsEnabled()) + if (!noverify_checksums) { char *filename; @@ -1692,7 +1692,14 @@ sendFile(const char *readfilename, const char *tarfilename, */ if (!PageIsNew(page) && PageGetLSN(page) < startptr) { + HOLD_INTERRUPTS(); + if (!DataChecksumsNeedVerify()) + { + RESUME_INTERRUPTS(); + continue; + } checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + RESUME_INTERRUPTS(); phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3f84ee99b8..908edfb423 100644 --- 
a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -212,6 +212,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index ad0d1a9abc..8a14f29027 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2933,8 +2933,13 @@ BufferGetLSNAtomic(Buffer buffer) /* * If we don't need locking for correctness, fastpath out. */ + HOLD_INTERRUPTS(); if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer)) + { + RESUME_INTERRUPTS(); return PageGetLSN(page); + } + RESUME_INTERRUPTS(); /* Make sure we've got a real buffer, and that we hold a pin on it. */ Assert(BufferIsValid(buffer)); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 96c2aaabbd..9a33560469 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -28,6 +28,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -149,6 +150,7 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + size = add_size(size, DatachecksumsWorkerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -259,6 +261,7 @@ CreateSharedMemoryAndSemaphores(void) WalSndShmemInit(); WalRcvShmemInit(); ApplyLauncherShmemInit(); + DatachecksumsWorkerShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c 
b/src/backend/storage/ipc/procsignal.c index ffe67acea1..c5331a68ba 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/xlog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -92,7 +93,11 @@ static volatile ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); -static void ProcessBarrierPlaceholder(void); + +static void ProcessBarrierChecksumOnInProgress(void); +static void ProcessBarrierChecksumOffInProgress(void); +static void ProcessBarrierChecksumOn(void); +static void ProcessBarrierChecksumOff(void); /* * ProcSignalShmemSize @@ -495,8 +500,14 @@ ProcessProcSignalBarrier(void) * unconditionally, but it's more efficient to call only the ones that * might need us to do something based on the flags. */ - if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_PLACEHOLDER)) - ProcessBarrierPlaceholder(); + if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)) + ProcessBarrierChecksumOnInProgress(); + else if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_ON)) + ProcessBarrierChecksumOn(); + else if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF)) + ProcessBarrierChecksumOffInProgress(); + else if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_OFF)) + ProcessBarrierChecksumOff(); /* * State changes related to all types of barriers that might have been @@ -509,16 +520,27 @@ ProcessProcSignalBarrier(void) } static void -ProcessBarrierPlaceholder(void) +ProcessBarrierChecksumOn(void) { - /* - * XXX. This is just a placeholder until the first real user of this - * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to - * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something - * appropriately descriptive. 
Get rid of this function and instead have - * ProcessBarrierSomethingElse. Most likely, that function should live in - * the file pertaining to that subsystem, rather than here. - */ + AbsorbChecksumsOnBarrier(); +} + +static void +ProcessBarrierChecksumOff(void) +{ + AbsorbChecksumsOffBarrier(); +} + +static void +ProcessBarrierChecksumOnInProgress(void) +{ + AbsorbChecksumsOnInProgressBarrier(); +} + +static void +ProcessBarrierChecksumOffInProgress(void) +{ + AbsorbChecksumsOffInProgressBarrier(); } /* diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 774292fd94..23eaf9e576 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +DatachecksumsWorkerLock 48 diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index e30d7ac59a..78edf57adc 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +turned on and off using pg_enable_data_checksums()/pg_disable_data_checksums() +at runtime. The checksum is not valid at all times on a data page!! 
The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index ddf18079e2..3b74ddaa92 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -100,13 +100,20 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + /* + * Hold interrupts for the duration of the checksum check to ensure + * that the data checksums state cannot change and thus risking a + * false positive or negative. + */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) checksum_failure = true; } + RESUME_INTERRUPTS(); /* * The following checks don't prove the header is correct, only that @@ -1394,10 +1401,6 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) { static char *pageCopy = NULL; - /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) - return (char *) page; - /* * We allocate the copy space once and use it over on each subsequent * call. 
The point of palloc'ing here, rather than having a static char @@ -1407,8 +1410,17 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) if (pageCopy == NULL) pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + /* If we don't need a checksum, just return the passed-in data */ + HOLD_INTERRUPTS(); + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); + return (char *) page; + } + memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + RESUME_INTERRUPTS(); return pageCopy; } @@ -1421,9 +1433,14 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) void PageSetChecksumInplace(Page page, BlockNumber blkno) { + HOLD_INTERRUPTS(); /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); return; + } ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); + RESUME_INTERRUPTS(); } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 6afe1b6f56..e1c8bb640e 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1565,9 +1565,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1583,9 +1580,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 66393becfb..9a38499dcb 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -271,7 +271,8 @@ static void write_relcache_init_file(bool shared); static 
void write_item(const void *data, Size len, FILE *fp); static void formrdesc(const char *relationName, Oid relationReltype, - bool isshared, int natts, const FormData_pg_attribute *attrs); + bool isshared, int natts, const FormData_pg_attribute *attrs, + bool haschecksums); static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic); static Relation AllocateRelationDesc(Form_pg_class relp); @@ -1816,7 +1817,8 @@ RelationInitTableAccessMethod(Relation relation) static void formrdesc(const char *relationName, Oid relationReltype, bool isshared, - int natts, const FormData_pg_attribute *attrs) + int natts, const FormData_pg_attribute *attrs, + bool haschecksums) { Relation relation; int i; @@ -1884,6 +1886,8 @@ formrdesc(const char *relationName, Oid relationReltype, relation->rd_rel->relnatts = (int16) natts; relation->rd_rel->relam = HEAP_TABLE_AM_OID; + relation->rd_rel->relhaschecksums = haschecksums; + /* * initialize attribute tuple form * @@ -3536,6 +3540,27 @@ RelationBuildLocalRelation(const char *relname, relkind == RELKIND_MATVIEW) RelationInitTableAccessMethod(rel); + /* + * Set the checksum state. Since the checksum state can change at any + * time, the fetched value might be out of date by the time. + * DataChecksumsNeedWrite returns true when checksums are: enabled; are + * in the process of being enabled "inprogress-on"; are in the process of + * being disabled "inprogress-off". Since relhaschecksums is only used to + * track progress when checksums are being enabled, and going from + * disabled to enabled will clear relhaschecksums before starting, it is + * safe to use this value for a concurrent state transition to off. + * + * If DataChecksumsNeedWrite returns false, and is concurrently changed to + * true then that implies that checksums are being enabled. Worst case, + * this will lead to the relation being processed for checksums even + * though each page written will have them already. 
+ * + * Performing this last shortens the TOCTOU window, but doesn't avoid it. + */ + HOLD_INTERRUPTS(); + rel->rd_rel->relhaschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); + /* * Okay to insert into the relcache hash table. * @@ -3802,6 +3827,7 @@ void RelationCacheInitializePhase2(void) { MemoryContext oldcxt; + bool haschecksums; /* * relation mapper needs initialized too @@ -3826,16 +3852,24 @@ RelationCacheInitializePhase2(void) */ if (!load_relcache_init_file(true)) { + /* + * Our local state can't change at this point, so we can cache the + * checksum state. + */ + HOLD_INTERRUPTS(); + haschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); + formrdesc("pg_database", DatabaseRelation_Rowtype_Id, true, - Natts_pg_database, Desc_pg_database); + Natts_pg_database, Desc_pg_database, haschecksums); formrdesc("pg_authid", AuthIdRelation_Rowtype_Id, true, - Natts_pg_authid, Desc_pg_authid); + Natts_pg_authid, Desc_pg_authid, haschecksums); formrdesc("pg_auth_members", AuthMemRelation_Rowtype_Id, true, - Natts_pg_auth_members, Desc_pg_auth_members); + Natts_pg_auth_members, Desc_pg_auth_members, haschecksums); formrdesc("pg_shseclabel", SharedSecLabelRelation_Rowtype_Id, true, - Natts_pg_shseclabel, Desc_pg_shseclabel); + Natts_pg_shseclabel, Desc_pg_shseclabel, haschecksums); formrdesc("pg_subscription", SubscriptionRelation_Rowtype_Id, true, - Natts_pg_subscription, Desc_pg_subscription); + Natts_pg_subscription, Desc_pg_subscription, haschecksums); #define NUM_CRITICAL_SHARED_RELS 5 /* fix if you change list above */ } @@ -3864,6 +3898,7 @@ RelationCacheInitializePhase3(void) RelIdCacheEnt *idhentry; MemoryContext oldcxt; bool needNewCacheFile = !criticalSharedRelcachesBuilt; + bool haschecksums; /* * relation mapper needs initialized too @@ -3884,15 +3919,18 @@ RelationCacheInitializePhase3(void) !load_relcache_init_file(false)) { needNewCacheFile = true; + HOLD_INTERRUPTS(); + haschecksums = DataChecksumsNeedWrite(); + RESUME_INTERRUPTS(); 
formrdesc("pg_class", RelationRelation_Rowtype_Id, false, - Natts_pg_class, Desc_pg_class); + Natts_pg_class, Desc_pg_class, haschecksums); formrdesc("pg_attribute", AttributeRelation_Rowtype_Id, false, - Natts_pg_attribute, Desc_pg_attribute); + Natts_pg_attribute, Desc_pg_attribute, haschecksums); formrdesc("pg_proc", ProcedureRelation_Rowtype_Id, false, - Natts_pg_proc, Desc_pg_proc); + Natts_pg_proc, Desc_pg_proc, haschecksums); formrdesc("pg_type", TypeRelation_Rowtype_Id, false, - Natts_pg_type, Desc_pg_type); + Natts_pg_type, Desc_pg_type, haschecksums); #define NUM_CRITICAL_LOCAL_RELS 4 /* fix if you change list above */ } diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index ed2ab4b5b2..03b940dfd7 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -275,6 +275,12 @@ GetBackendTypeDesc(BackendType backendType) case B_LOGGER: backendDesc = "logger"; break; + case B_DATACHECKSUMSWORKER_LAUNCHER: + backendDesc = "datachecksumsworker launcher"; + break; + case B_DATACHECKSUMSWORKER_WORKER: + backendDesc = "datachecksumsworker worker"; + break; } return backendDesc; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 82d451569d..35782a93da 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -594,6 +594,11 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend ID: %d", MyBackendId); + /* + * Set up local cache of Controldata values. 
+ */ + InitLocalControldata(); + /* Now that we have a BackendId, we can participate in ProcSignal */ ProcSignalInit(MyBackendId); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 02d2d267b5..b3f028fb86 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -36,6 +36,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" @@ -76,6 +77,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "storage/dsm_impl.h" #include "storage/fd.h" #include "storage/large_object.h" @@ -498,6 +500,17 @@ static struct config_enum_entry shared_memory_options[] = { {NULL, 0, false} }; +/* + * Options for data_checksums enum. + */ +static const struct config_enum_entry data_checksum_options[] = { + {"on", DATA_CHECKSUMS_ON, true}, + {"off", DATA_CHECKSUMS_OFF, true}, + {"inprogress-on", DATA_CHECKSUMS_INPROGRESS_ON, true}, + {"inprogress-off", DATA_CHECKSUMS_INPROGRESS_OFF, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -607,7 +620,7 @@ static int max_identifier_length; static int block_size; static int segment_size; static int wal_block_size; -static bool data_checksums; +static int data_checksums_tmp; static bool integer_datetimes; static bool assert_enabled; static char *recovery_target_timeline_string; @@ -1898,17 +1911,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to 
avoid duplicate suppression."), @@ -4784,6 +4786,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether data checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &data_checksums_tmp, + DATA_CHECKSUMS_OFF, data_checksum_options, + NULL, NULL, show_data_checksums + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 28aba92a4c..0cf91f076c 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -600,7 +600,7 @@ main(int argc, char *argv[]) exit(1); } - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == DATA_CHECKSUMS_ON && mode == PG_MODE_ENABLE) { pg_log_error("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 39bcaa8fe1..32058ebf61 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -657,6 +657,15 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If checksums have been turned on in the old cluster, but the + * datachecksumsworker have yet to finish, then disallow upgrading. The + * user should either let the process finish, or turn off checksums, + * before retrying. + */ + if (oldctrl->data_checksum_version == 2) + pg_fatal("checksum enabling in old cluster is in progress\n"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. 
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index ee70243c2e..bfa05eb1b0 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -218,7 +218,7 @@ typedef struct uint32 large_object; bool date_is_int; bool float8_pass_by_value; - bool data_checksum_version; + uint32 data_checksum_version; } ControlData; /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 221af87e71..8dfd70fba6 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -198,8 +198,11 @@ extern PGDLLIMPORT int wal_level; * individual bits on a page, it's still consistent no matter what combination * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. + * + * Since XLogHintBitIsNeeded calls DataChecksumsNeedWrite, interrupts must be + * held off during this call. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -318,7 +321,19 @@ extern TimestampTz GetCurrentChunkReplayStartTime(void); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsOnInProgress(void); +extern bool DataChecksumsOffInProgress(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern void AbsorbChecksumsOnInProgressBarrier(void); +extern void AbsorbChecksumsOffInProgressBarrier(void); +extern void AbsorbChecksumsOnBarrier(void); +extern void AbsorbChecksumsOffBarrier(void); +extern const char *show_data_checksums(void); +extern void InitLocalControldata(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 4146753d47..80a959bd7f 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilenode.h" @@ -249,6 +250,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when checksum level is changed */ +typedef struct xl_checksum_state +{ + ChecksumType new_checksumtype; +} xl_checksum_state; + /* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ typedef struct xl_end_of_recovery { diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index bb5e72ca43..275eb0a1a6 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -119,6 +119,9 @@ CATALOG(pg_class,1259,RelationRelationId) 
BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* is relation a partition? */ bool relispartition BKI_DEFAULT(f); + /* does the relation have checksums enabled */ + bool relhaschecksums BKI_DEFAULT(f); + /* heap for rewrite during DDL, link to original rel */ Oid relrewrite BKI_DEFAULT(0); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06bed90c5e..6bc802d8ba 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_CHECKSUMS 0xC0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index fc2202b843..70c1f26375 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10936,6 +10936,22 @@ proargnames => '{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float8_pass_by_value,data_page_checksum_version}', prosrc => 'pg_control_init' }, +{ oid => '4142', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'bool', + proparallel => 'r', + proargtypes => '', + prosrc => 'disable_data_checksums' }, + +{ oid => '4035', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'bool', + proparallel => 'r', + proargtypes => 'int4 int4', proallargtypes => '{int4,int4}', + proargmodes => '{i,i}', + proargnames => '{cost_delay,cost_limit}', + prosrc => 'enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 72e3352398..c4893551a3 100644 --- a/src/include/miscadmin.h +++ 
b/src/include/miscadmin.h @@ -323,6 +323,8 @@ typedef enum BackendType B_ARCHIVER, B_STATS_COLLECTOR, B_LOGGER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, } BackendType; extern BackendType MyBackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 5954068dec..50848876cf 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -929,6 +929,8 @@ typedef enum WAIT_EVENT_BTREE_PAGE, WAIT_EVENT_CHECKPOINT_DONE, WAIT_EVENT_CHECKPOINT_START, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION, WAIT_EVENT_EXECUTE_GATHER, WAIT_EVENT_HASH_BATCH_ALLOCATE, WAIT_EVENT_HASH_BATCH_ELECT, diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h new file mode 100644 index 0000000000..3572ec80c5 --- /dev/null +++ b/src/include/postmaster/datachecksumsworker.h @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.h + * header file for checksum helper background worker + * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksumsworker.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUMSWORKER_H +#define DATACHECKSUMSWORKER_H + +/* Shared memory */ +extern Size DatachecksumsWorkerShmemSize(void); +extern void DatachecksumsWorkerShmemInit(void); + +/* Status functions */ +bool DataChecksumsWorkerStarted(void); + +/* Start the background processes for enabling checksums */ +void StartDatachecksumsWorkerLauncher(bool enable_checksums, + int cost_delay, int cost_limit); + +/* Shutdown the background processes, if any */ +void ShutdownDatachecksumsWorkerIfRunning(void); + +/* Background worker entrypoints */ +void DatachecksumsWorkerLauncherMain(Datum arg); +void DatachecksumsWorkerMain(Datum arg); 
+void ResetDataChecksumsStateInDatabase(Datum arg); + +#endif /* DATACHECKSUMSWORKER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index d0a52f8e08..3bb7742642 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -198,6 +198,9 @@ typedef PageHeaderData *PageHeader; */ #define PG_PAGE_LAYOUT_VERSION 4 #define PG_DATA_CHECKSUM_VERSION 1 +#define PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION 2 +#define PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION 3 + /* ---------------------------------------------------------------- * page support macros diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 6e77744cbc..f6ae955f58 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,14 @@ #include "storage/block.h" +typedef enum ChecksumType +{ + DATA_CHECKSUMS_OFF = 0, + DATA_CHECKSUMS_ON, + DATA_CHECKSUMS_INPROGRESS_ON, + DATA_CHECKSUMS_INPROGRESS_OFF +} ChecksumType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 5cb39697f3..37cd0abbd6 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,12 +48,10 @@ typedef enum typedef enum { - /* - * XXX. PROCSIGNAL_BARRIER_PLACEHOLDER should be replaced when the first - * real user of the ProcSignalBarrier mechanism is added. It's just here - * for now because we can't have an empty enum. - */ - PROCSIGNAL_BARRIER_PLACEHOLDER = 0 + PROCSIGNAL_BARRIER_CHECKSUM_OFF = 0, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON } ProcSignalBarrierType; /* diff --git a/src/test/Makefile b/src/test/Makefile index 14cde4f5ba..61d6b918b9 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -13,7 +13,7 @@ top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = perl regress isolation modules authentication recovery subscription \ - locale + locale checksum # Test suites that are not safe by default but can be run if selected # by the user via the whitespace-separated list in variable diff --git a/src/test/checksum/.gitignore b/src/test/checksum/.gitignore new file mode 100644 index 0000000000..871e943d50 --- /dev/null +++ b/src/test/checksum/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/checksum/Makefile b/src/test/checksum/Makefile new file mode 100644 index 0000000000..558a8135f1 --- /dev/null +++ b/src/test/checksum/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/checksum +# +# Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/checksum/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/test/checksum +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check diff --git a/src/test/checksum/README b/src/test/checksum/README new file mode 100644 index 0000000000..0f0317060b --- /dev/null +++ b/src/test/checksum/README @@ -0,0 +1,22 @@ +src/test/checksum/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster. + +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they master or standby(s) for the purpose of +the tests. + +NOTE: This requires the --enable-tap-tests argument to configure. 
diff --git a/src/test/checksum/t/001_basic.pl b/src/test/checksum/t/001_basic.pl new file mode 100644 index 0000000000..0f44512f83 --- /dev/null +++ b/src/test/checksum/t/001_basic.pl @@ -0,0 +1,89 @@ +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'off', 'ensure checksums are disabled'); + +# No relation in pg_class should have relhaschecksums at this point +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;"); +is($result, '0', 'ensure no entries in pg_class has checksums recorded'); + +# Enable data checksums +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Wait for checksums to become enabled +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Check that relations with storage have been marked with relhaschecksums in +# pg_class +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . 
+ "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'ensure all relations are correctly flagged in the catalog'); + +# Run a dummy query just to make sure we read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op.. +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums again +$node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); + +# Wait for checksums to be disabled. Disabling checksums clear the catalog +# relhaschecksums state so await that before calling it done. +$result = $node->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", + '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are disabled'); + +# Test reading again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the relhaschecksums flags in the +# catalog aren't tricking processing into skipping previously checksummed +# relations +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Run a dummy query just to 
make sure we read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; + +done_testing(); diff --git a/src/test/checksum/t/002_restarts.pl b/src/test/checksum/t/002_restarts.pl new file mode 100644 index 0000000000..dc5bcb9629 --- /dev/null +++ b/src/test/checksum/t/002_restarts.pl @@ -0,0 +1,108 @@ +# Test suite for testing enabling data checksums in an online cluster with +# restarting the processing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use IPC::Run qw(pump finish timer); + +# If we don't have IO::Pty, forget it, because IPC::Run depends on that +# to support pty connections +eval { require IO::Pty; }; +if ($@) +{ + plan skip_all => 'IO::Pty is needed to run this test'; +} + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'off', 'ensure checksums are disabled'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+my $in = ''; +my $out = ''; +my $timer = timer(5); + +my $h = $node->interactive_psql('postgres', \$in, \$out, $timer); + +$out = ''; +$timer->start(5); + +$in .= "CREATE TEMPORARY TABLE tt (a integer);\n"; +pump $h until ($out =~ /CREATE TABLE/ || $timer->is_expired); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . + "AND relkind IN ('r', 'i', 'S', 't', 'm');", + '1'); +is($result, 1, 'ensure there is a single table left'); + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'inprogress-on', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_stat_activity WHERE backend_type = 'datachecksumsworker worker';"); +is($result, 'ChecksumEnableFinishCondition', 'test for correct wait event'); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . + "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '1', 'doublecheck that there is a single table left before restarting'); + +$node->stop; +$node->start; + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'inprogress-on', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . 
+ "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'no temporary tables this time around'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are turned on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until('postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +$result = $node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are turned off'); + +done_testing(); diff --git a/src/test/checksum/t/003_standby_checksum.pl b/src/test/checksum/t/003_standby_checksum.pl new file mode 100644 index 0000000000..99c283e0b1 --- /dev/null +++ b/src/test/checksum/t/003_standby_checksum.pl @@ -0,0 +1,116 @@ +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +# Initialize primary node +my $node_primary = get_new_node('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby_1 = get_new_node('standby_1'); +$node_standby_1->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby_1->start; + +# Create some content on the primary to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t 
AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby_1, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +my $result = $node_primary->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on primary'); + +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on standby_1'); + +# Enable checksums for the cluster +$node_primary->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Ensure that the primary switches to "inprogress-on" +$result = $node_primary->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + "inprogress-on"); +is($result, 1, 'ensure checksums are in progress on primary'); + +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+$result = $node_standby_1->poll_query_until('postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +cmp_ok($result, '~~', ["inprogress-on", "on"], 'ensure checksums are on, or in progress, on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1, 10000));"); + +# Wait for checksums enabled on the primary +$result = $node_primary->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the primary'); + +# Wait for checksums enabled on the standby +$result = $node_standby_1->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the standby'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, '20000', 'ensure we can safely read all data with checksums'); + +$result = $node_primary->poll_query_until('postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +$node_primary->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +# Wait for checksum disable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); +$result = $node_primary->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure data checksums are disabled on the 
primary 2'); +$result = $node_primary->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", + '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); + +# Ensure that the standby has switched to off +$result = $node_standby_1->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;", + '0'); +is($result, '1', 'ensure no entries in pg_class has checksums recorded'); +$result = $node_standby_1->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are off on standby_1'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, "20000", 'ensure we can safely read all data without checksums'); + +done_testing(); -- 2.21.1 (Apple Git-122.3)