diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 7bfbc87109..108e049a85 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2011,6 +2011,42 @@ include_dir 'conf.d' + + Online Checksumming + + + + checksumhelper_cost_delay (integer) + + checksumhelper_cost_delay configuration parameter + + + + + The length of time, in milliseconds, that the process will sleep when + the cost limit has been exceeded. The default value is zero, which + disables the cost-based checksumming delay feature. Positive values + enable cost-based checksumming. + + + + + + checksumhelper_cost_limit (integer) + + checksumhelper_cost_limit configuration parameter + + + + + The accumulated cost that will cause the checksumming process to sleep. + It is turned off by default. + + + + + + Asynchronous Behavior diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 5dce8ef178..154cf40cd3 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19582,6 +19582,74 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Checksum <acronym>SQL</acronym> Functions + + + + Function + Return Type + Description + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums() + + + void + + + + Initiates data checksums for the cluster. This will switch the data checksums mode + to in progress, but will not initiate the checksumming process. + In order to start checksumming the data pages the database must be restarted. Upon + restart a background worker will start processing all data in the database and enable + checksums for it. When all data pages have had checksums enabled, the cluster will + automatically switch to checksums on. + + + The checksumhelper_cost_delay and + checksumhelper_cost_limit GUCs are used to + throttle the + speed of the process, using the same principles as + Cost-based Vacuum Delay.
+ + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums() + + + void + + + Disables data checksums for the cluster. This takes effect immediately. + + + + +
+ +
+ Database Object Management Functions diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 4489b585c7..be489e78b9 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -214,9 +214,9 @@ PostgreSQL documentation Use checksums on data pages to help detect corruption by the I/O system that would otherwise be silent. Enabling checksums - may incur a noticeable performance penalty. This option can only - be set during initialization, and cannot be changed later. If - set, checksums are calculated for all objects, in all databases. + may incur a noticeable performance penalty. If set, checksums + are calculated for all objects, in all databases. See + for details. diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 8727f3c26b..29a2fdb449 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -230,6 +230,86 @@ + + Data checksums + + checksums + + + + Data pages are not checksum protected by default, but this can optionally be enabled for a cluster. + When enabled, each data page will be assigned a checksum that is updated when the page is + written and verified every time the page is read. Only data pages are protected by checksums, + internal data structures and temporary files are not. + + + + Checksums are normally enabled when the cluster is initialized using + initdb. They + can also be enabled or disabled at runtime. In all cases, checksums are enabled or disabled + at the full cluster level, and cannot be specified individually for databases or tables. + + + + The current state of checksums in the cluster can be verified by viewing the value + of the read-only configuration variable by + issuing the command SHOW data_checksums. + + + + When attempting to recover from corrupt data it may be necessary to bypass the checksum + protection in order to recover data. To do this, temporarily set the configuration parameter + . 
+ + + + On-line enabling of checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + Disabling of checksums takes effect immediately when the function is called. + + + + Enabling checksums will put the cluster in inprogress mode, + but will not start the checksumming of data. In order to start checksumming + the data in the cluster, a restart is needed. When the cluster is restarted, + checksums will be written but not verified. In addition to + this, a background worker process is started that enables checksums on all + existing data in the cluster. Once this worker has completed processing all + databases in the cluster, the checksum mode will automatically switch to + on. + + + + The process will track all created databases so that it can be certain that + no database has been created from a non-checksummed template database. The + process won't set the checksum mode to on until no database + can be created from a non-checksummed template. If an application repeatedly + creates databases it may be necessary to terminate this application to allow + the process to complete. + + + + If the cluster is stopped while in inprogress mode, for + any reason, then this process must be started over. When the cluster is + restarted, the checksum process will again checksum all data in the cluster. + It is not possible to resume the work; the process has to start over and + re-process the cluster. + + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL.
+ + + + + + Write-Ahead Logging (<acronym>WAL</acronym>) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 00741c7b09..a31f8b806a 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -17,6 +17,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -137,6 +138,18 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, timestamptz_to_str(xlrec.end_time)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + appendStringInfo(buf, "on"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_VERSION) + appendStringInfo(buf, "inprogress"); + else + appendStringInfo(buf, "off"); + } } const char * @@ -182,6 +195,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1a419aa49b..ccf8032847 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -856,6 +856,7 @@ static void SetLatestXTime(TimestampTz xtime); static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); +static void XlogChecksums(uint32 new_type); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI); static void LocalSetXLogInsertAllowed(void); @@ -4686,10 +4687,6 @@ ReadControlFile(void) (SizeOfXLogLongPHD - SizeOfXLogShortPHD); CalculateCheckpointSegments(); - - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? 
"yes" : "no", - PGC_INTERNAL, PGC_S_OVERRIDE); } void @@ -4761,12 +4758,110 @@ GetMockAuthenticationNonce(void) * Are checksums enabled for data pages? */ bool -DataChecksumsEnabled(void) +DataChecksumsNeedWrite(void) { Assert(ControlFile != NULL); return (ControlFile->data_checksum_version > 0); } +bool +DataChecksumsNeedVerify(void) +{ + Assert(ControlFile != NULL); + + /* + * Only verify checksums if they are fully enabled in the cluster. In + * inprogress state they are only updated, not verified. + */ + return (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION); +} + +bool +DataChecksumsNeedVerifyLocked(void) +{ + bool ret; + + Assert(ControlFile != NULL); + + /* + * Only verify checksums if they are fully enabled in the cluster. In + * inprogress state they are only updated, not verified. + * Make the check while holding the ControlFileLock, to make sure we are + * looking at the latest version. + */ + LWLockAcquire(ControlFileLock, LW_SHARED); + ret = (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION); + LWLockRelease(ControlFileLock); + + return ret; +} + +bool +DataChecksumsInProgress(void) +{ + Assert(ControlFile != NULL); + return (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION); +} + +void +SetDataChecksumsInProgress(void) +{ + Assert(ControlFile != NULL); + if (ControlFile->data_checksum_version > 0) + return; + + XlogChecksums(PG_DATA_CHECKSUM_INPROGRESS_VERSION); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); +} + +void +SetDataChecksumsOn(void) +{ + Assert(ControlFile != NULL); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_VERSION) + { + LWLockRelease(ControlFileLock); + elog(ERROR, "Checksums not in inprogress mode"); + } + + ControlFile->data_checksum_version = 
PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + XlogChecksums(PG_DATA_CHECKSUM_VERSION); +} + +void +SetDataChecksumsOff(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + ControlFile->data_checksum_version = 0; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + XlogChecksums(0); +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION) + return "inprogress"; + else + return "off"; +} + /* * Returns a fake LSN for unlogged relations. * @@ -9555,6 +9650,22 @@ XLogReportParameters(void) } /* + * Log the new state of checksums + */ +static void +XlogChecksums(uint32 new_type) +{ + xl_checksum_state xlrec; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); +} + +/* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. 
* @@ -9991,6 +10102,17 @@ xlog_redo(XLogReaderState *record) /* Keep track of full_page_writes */ lastFullPageWrites = fpw; } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = state.new_checksumtype; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } } #ifdef WAL_DEBUG diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 9731742978..4de8ae7f4d 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -23,6 +23,7 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "postmaster/checksumhelper.h" #include "replication/walreceiver.h" #include "storage/smgr.h" #include "utils/builtins.h" @@ -697,3 +698,65 @@ pg_backup_start_time(PG_FUNCTION_ARGS) PG_RETURN_DATUM(xtime); } + +/* + * Disables checksums for the cluster, unless already disabled. + * + * Has immediate effect - the checksums are set to off right away. + */ +Datum +pg_disable_data_checksums(PG_FUNCTION_ARGS) +{ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("checksum state cannot be changed during recovery."))); + /* + * If we don't need to write new checksums, then clearly they are already + * disabled. + */ + if (!DataChecksumsNeedWrite()) + ereport(ERROR, + (errmsg("data checksums already disabled"))); + + ShutdownChecksumHelperIfRunning(); + + SetDataChecksumsOff(); + + PG_RETURN_VOID(); +} + +/* + * Enables checksums for the cluster, unless already enabled. + * + * This sets the system into a pending state. To initiate the actual + * checksum updates, a restart is required to make sure there can be + * no parallel backends doing things we cannot work with here. 
+ */ +Datum +pg_enable_data_checksums(PG_FUNCTION_ARGS) +{ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("checksum state cannot be changed during recovery."))); + + if (DataChecksumsNeedVerify()) + ereport(ERROR, + (errmsg("data checksums already enabled"))); + + if (DataChecksumsNeedWrite()) + ereport(ERROR, + (errmsg("data checksums already pending"), + errhint("A restart may be required to complete the process"))); + + SetDataChecksumsInProgress(); + + ereport(NOTICE, + (errmsg("data checksums set to pending"), + errhint("To complete the operation, a restart of PostgreSQL is required"))); + + PG_RETURN_VOID(); +} diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 71c23211b2..ee8f8c1cd3 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -12,7 +12,8 @@ subdir = src/backend/postmaster top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o fork_process.o \ - pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o +OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o checksumhelper.o \ + fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o \ + walwriter.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index f651bb49b1..19529d77ad 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -20,6 +20,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/checksumhelper.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" @@ -129,6 +130,12 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "ChecksumHelperLauncherMain", ChecksumHelperLauncherMain + }, + 
{ + "ChecksumHelperWorkerMain", ChecksumHelperWorkerMain } }; diff --git a/src/backend/postmaster/checksumhelper.c b/src/backend/postmaster/checksumhelper.c new file mode 100644 index 0000000000..42f6b8fa69 --- /dev/null +++ b/src/backend/postmaster/checksumhelper.c @@ -0,0 +1,811 @@ +/*------------------------------------------------------------------------- + * + * checksumhelper.c + * Background worker to walk the database and write checksums to pages + * + * When enabling data checksums on a database at initdb time, no extra process + * is required as each page is checksummed, and verified, at accesses. When + * enabling checksums on an already running cluster, which was not initialized + * with checksums, this helper worker will ensure that all pages are + * checksummed before verification of the checksums is turned on. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/checksumhelper.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/pg_database.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/checksumhelper.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/lmgr.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/hsearch.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" + + +typedef enum +{ + SUCCESSFUL = 0, + ABORTED, + FAILED +} ChecksumHelperResult; + +typedef struct ChecksumHelperShmemStruct +{ + ChecksumHelperResult success; + bool process_shared_catalogs; + bool 
abort; +} ChecksumHelperShmemStruct; + +/* Shared memory segment for checksumhelper */ +static ChecksumHelperShmemStruct * ChecksumHelperShmem; + +/* Bookkeeping for work to do */ +typedef struct ChecksumHelperDatabase +{ + Oid dboid; + char *dbname; +} ChecksumHelperDatabase; + +typedef struct ChecksumHelperRelation +{ + Oid reloid; + char relkind; +} ChecksumHelperRelation; + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool include_shared); +static ChecksumHelperResult ProcessDatabase(ChecksumHelperDatabase * db); +static void WaitForAllTransactionsToFinish(void); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void checksumhelper_sighup(SIGNAL_ARGS); + +/* GUCs */ +int checksumhelper_cost_limit; +int checksumhelper_cost_delay; + +/* Flags set by signal handlers */ +static volatile sig_atomic_t got_SIGHUP = false; + +/* + * Main entry point for checksumhelper launcher process. + */ +bool +ChecksumHelperLauncherRegister(void) +{ + BackgroundWorker bgw; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ChecksumHelperLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "checksumhelper launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "checksumhelper launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); + + return true; +} + +/* + * ShutdownChecksumHelperIfRunning + * Request shutdown of the checksumhelper + * + * This does not turn off processing immediately, it signals the checksum + * process to end when done with the current block. 
+ */
+void
+ShutdownChecksumHelperIfRunning(void)
+{
+    ChecksumHelperShmem->abort = true;
+}
+
+/*
+ * ProcessSingleRelationFork
+ *      Enable checksums in a single relation/fork.
+ *
+ * Loops over all existing blocks in this fork and calculates the checksum on them,
+ * and writes them out. For any blocks added by another process extending this
+ * fork while we run, checksums will already be set by the process extending it,
+ * so we don't need to care about those.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
+{
+    BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
+    BlockNumber b;
+    char        activity[NAMEDATALEN * 2 + 128];
+
+    for (b = 0; b < numblocks; b++)
+    {
+        Buffer      buf = ReadBufferExtended(reln, forkNum, b, RBM_NORMAL, strategy);
+
+        /*
+         * Report to pgstat every 100 blocks (so as not to "spam"); block numbers are unsigned, hence %u
+         */
+        if ((b % 100) == 0)
+        {
+            snprintf(activity, sizeof(activity), "processing: %s.%s (%s block %u/%u)",
+                     get_namespace_name(RelationGetNamespace(reln)), RelationGetRelationName(reln),
+                     forkNames[forkNum], b, numblocks);
+            pgstat_report_activity(STATE_RUNNING, activity);
+        }
+
+        /* Need to get an exclusive lock before we can flag as dirty */
+        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+        /*
+         * Mark the buffer as dirty and force a full page write. We have to
+         * re-write the page to WAL even if the checksum hasn't changed,
+         * because if there is a replica it might have a slightly different
+         * version of the page with an invalid checksum, caused by unlogged
+         * changes (e.g. hintbits) on the master happening while checksums
+         * were off. This can happen if there was a valid checksum on the page
+         * at one point in the past, so only when checksums are first on, then
+         * off, and then turned on again. Full page writes should only happen
+         * for relations that are actually logged (not unlogged or temp
+         * tables), but we still need to mark their buffers as dirty so the
+         * local file gets updated.
+         */
+        START_CRIT_SECTION();
+        MarkBufferDirty(buf);
+        if (RelationNeedsWAL(reln))
+            log_newpage_buffer(buf, false);
+        END_CRIT_SECTION();
+
+        UnlockReleaseBuffer(buf);
+
+        /*
+         * This is the only place where we check if we are asked to abort, the
+         * aborting will bubble up from here.
+         */
+        if (ChecksumHelperShmem->abort)
+            return false;
+
+        /*
+         * Update cost based delay parameters if changed, and then initiate
+         * the cost delay point.
+         */
+        if (got_SIGHUP)
+        {
+            got_SIGHUP = false;
+            ProcessConfigFile(PGC_SIGHUP);
+            if (checksumhelper_cost_delay >= 0)
+                VacuumCostDelay = checksumhelper_cost_delay;
+            if (checksumhelper_cost_limit >= 0)
+                VacuumCostLimit = checksumhelper_cost_limit;
+            VacuumCostActive = (VacuumCostDelay > 0);
+        }
+
+        vacuum_delay_point();
+    }
+
+    return true;
+}
+
+/*
+ * ProcessSingleRelationByOid
+ *      Process a single relation based on oid.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual error
+ * is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
+{
+    Relation    rel;
+    ForkNumber  fnum;
+    bool        aborted = false;
+
+    StartTransactionCommand();
+
+    elog(DEBUG2, "Checksumhelper starting to process relation %u", relationId);
+    rel = try_relation_open(relationId, AccessShareLock);
+    if (rel == NULL)
+    {
+        /*
+         * Relation no longer exists. We consider this a success, since there
+         * are no pages in it that need checksums, and thus return true.
+         */
+        elog(DEBUG1, "Checksumhelper skipping relation %u as it no longer exists", relationId);
+        CommitTransactionCommand();
+        pgstat_report_activity(STATE_IDLE, NULL);
+        return true;
+    }
+    RelationOpenSmgr(rel);
+
+    for (fnum = 0; fnum <= MAX_FORKNUM; fnum++)
+    {
+        if (smgrexists(rel->rd_smgr, fnum))
+        {
+            if (!ProcessSingleRelationFork(rel, fnum, strategy))
+            {
+                aborted = true;
+                break;
+            }
+        }
+    }
+    relation_close(rel, AccessShareLock);
+    elog(DEBUG2, "Checksumhelper done with relation %u: %s",
+         relationId, (aborted ? "aborted" : "finished"));
+
+    CommitTransactionCommand();
+
+    pgstat_report_activity(STATE_IDLE, NULL);
+
+    return !aborted;
+}
+
+/*
+ * ProcessDatabase
+ *      Enable checksums in a single database.
+ *
+ * We do this by launching a dynamic background worker into this database, and
+ * waiting for it to finish. We have to do this in a separate worker, since
+ * each process can only be connected to one database during its lifetime.
+ */
+static ChecksumHelperResult
+ProcessDatabase(ChecksumHelperDatabase * db)
+{
+    BackgroundWorker bgw;
+    BackgroundWorkerHandle *bgw_handle;
+    BgwHandleStatus status;
+    pid_t       pid;
+    char        activity[NAMEDATALEN + 64];
+
+    ChecksumHelperShmem->success = FAILED;
+
+    memset(&bgw, 0, sizeof(bgw));
+    bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+    bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+    snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+    snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ChecksumHelperWorkerMain");
+    snprintf(bgw.bgw_name, BGW_MAXLEN, "checksumhelper worker");
+    snprintf(bgw.bgw_type, BGW_MAXLEN, "checksumhelper worker");
+    bgw.bgw_restart_time = BGW_NEVER_RESTART;
+    bgw.bgw_notify_pid = MyProcPid;
+    bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
+
+    if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+    {
+        ereport(LOG,
+                (errmsg("failed to start worker for checksumhelper in \"%s\"",
+                        db->dbname)));
+        return FAILED;
+    }
+
+    status = 
WaitForBackgroundWorkerStartup(bgw_handle, &pid);
+    if (status != BGWH_STARTED)
+    {
+        ereport(LOG,
+                (errmsg("failed to wait for worker startup for checksumhelper in \"%s\"",
+                        db->dbname)));
+        return FAILED;
+    }
+
+    elog(DEBUG1, "started background worker for checksums in \"%s\"",
+         db->dbname);
+
+    snprintf(activity, sizeof(activity),
+             "Waiting for worker in database %s (pid %d)", db->dbname, (int) pid);
+    pgstat_report_activity(STATE_RUNNING, activity);
+
+
+    status = WaitForBackgroundWorkerShutdown(bgw_handle);
+    if (status != BGWH_STOPPED)
+    {
+        ereport(LOG,
+                (errmsg("failed to wait for worker shutdown for checksumhelper in \"%s\"",
+                        db->dbname)));
+        return FAILED;
+    }
+
+    if (ChecksumHelperShmem->success == ABORTED)
+        ereport(LOG,
+                (errmsg("checksumhelper was aborted during processing in \"%s\"",
+                        db->dbname)));
+
+    elog(DEBUG1, "background worker for checksums in \"%s\" completed",
+         db->dbname);
+
+    pgstat_report_activity(STATE_IDLE, NULL);
+
+    return ChecksumHelperShmem->success;
+}
+
+static void
+launcher_exit(int code, Datum arg)
+{
+    ChecksumHelperShmem->abort = false;
+}
+
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+    ChecksumHelperShmem->abort = true;
+}
+
+static void
+checksumhelper_sighup(SIGNAL_ARGS)
+{
+    got_SIGHUP = true;
+}
+
+static void
+WaitForAllTransactionsToFinish(void)
+{
+    TransactionId waitforxid;
+
+    LWLockAcquire(XidGenLock, LW_SHARED);
+    waitforxid = ShmemVariableCache->nextXid;
+    LWLockRelease(XidGenLock);
+
+    while (true)
+    {
+        TransactionId oldestxid = GetOldestActiveTransactionId();
+
+        elog(DEBUG1, "Waiting for old transactions to finish");
+        if (TransactionIdPrecedes(oldestxid, waitforxid))
+        {
+            char        activity[128];  /* 56-char prefix + up to 10 xid digits + ")" overflowed the old 64 */
+
+            /* Oldest running xid is older than us, so wait (xids are unsigned, hence %u) */
+            snprintf(activity, sizeof(activity), "Waiting for current transactions to finish (waiting for %u)", waitforxid);
+            pgstat_report_activity(STATE_RUNNING, activity);
+
+            /* Retry every 5 seconds */
+            ResetLatch(MyLatch);
+            (void) WaitLatch(MyLatch,
+                             WL_LATCH_SET | WL_TIMEOUT,
+                             5000,
+                             WAIT_EVENT_PG_SLEEP);
+        }
+        else
+        {
+            pgstat_report_activity(STATE_IDLE, NULL);
+            return;
+        }
+    }
+}
+
+void
+ChecksumHelperLauncherMain(Datum arg)
+{
+    List       *DatabaseList;
+    HTAB       *ProcessedDatabases = NULL;
+    List       *FailedDatabases = NIL;
+    ListCell   *lc,
+               *lc2;
+    HASHCTL     hash_ctl;
+    bool        found_failed = false;
+
+    if (RecoveryInProgress())
+    {
+        elog(DEBUG1, "not starting checksumhelper launcher, recovery is in progress");
+        return;
+    }
+
+    /*
+     * If a standby was restarted when in pending state, a background worker
+     * was registered to start. If it's later promoted after the master has
+     * completed enabling checksums, we need to terminate immediately and not
+     * do anything. If the cluster is still in pending state when promoted,
+     * the background worker should start to complete the job.
+     */
+    if (DataChecksumsNeedVerifyLocked())
+    {
+        elog(DEBUG1, "not starting checksumhelper launcher, checksums already enabled");
+        return;
+    }
+
+    on_shmem_exit(launcher_exit, 0);
+
+    elog(DEBUG1, "checksumhelper launcher started");
+
+    pqsignal(SIGTERM, die);
+    pqsignal(SIGINT, launcher_cancel_handler);
+
+    BackgroundWorkerUnblockSignals();
+
+    init_ps_display(pgstat_get_backend_desc(B_CHECKSUMHELPER_LAUNCHER), "", "", "");
+
+    memset(&hash_ctl, 0, sizeof(hash_ctl));
+    hash_ctl.keysize = sizeof(Oid);
+    hash_ctl.entrysize = sizeof(Oid);   /* table is used as a set: entries hold only the key */
+    ProcessedDatabases = hash_create("Processed databases",
+                                     64,
+                                     &hash_ctl,
+                                     HASH_ELEM | HASH_BLOBS);   /* Oid keys are binary; without HASH_BLOBS they are hashed/compared as C strings */
+
+    /*
+     * Initialize a connection to shared catalogs only.
+     */
+    BackgroundWorkerInitializeConnection(NULL, NULL, 0);
+
+    /*
+     * Set up so first run processes shared catalogs, but not once in every
+     * db.
+     */
+    ChecksumHelperShmem->process_shared_catalogs = true;
+
+    while (true)
+    {
+        int         processed_databases;
+
+        /*
+         * Get a list of all databases to process. This may include databases
+         * that were created during our runtime.
+ * + * Since a database can be created as a copy of any other database + * (which may not have existed in our last run), we have to repeat + * this loop until no new databases show up in the list. Since we wait + * for all pre-existing transactions finish, this way we can be + * certain that there are no databases left without checksums. + */ + + DatabaseList = BuildDatabaseList(); + + /* + * If there are no databases at all to checksum, we can exit + * immediately as there is no work to do. This probably can never + * happen, but just in case. + */ + if (DatabaseList == NIL || list_length(DatabaseList) == 0) + return; + + processed_databases = 0; + + foreach(lc, DatabaseList) + { + ChecksumHelperDatabase *db = (ChecksumHelperDatabase *) lfirst(lc); + ChecksumHelperResult processing; + + if (hash_search(ProcessedDatabases, (void *) &db->dboid, HASH_FIND, NULL)) + /* This database has already been processed */ + continue; + + processing = ProcessDatabase(db); + hash_search(ProcessedDatabases, (void *) &db->dboid, HASH_ENTER, NULL); + processed_databases++; + + if (processing == SUCCESSFUL) + { + if (ChecksumHelperShmem->process_shared_catalogs) + + /* + * Now that one database has completed shared catalogs, we + * don't have to process them again. + */ + ChecksumHelperShmem->process_shared_catalogs = false; + } + else if (processing == FAILED) + { + /* + * Put failed databases on the list of failures. + */ + FailedDatabases = lappend(FailedDatabases, db); + } + else + /* Abort flag set, so exit the whole process */ + return; + } + + elog(DEBUG1, "Completed one loop of checksum enabling, %i databases processed", processed_databases); + if (processed_databases == 0) + + /* + * No databases processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can + * exist. + */ + break; + } + + /* + * FailedDatabases now has all databases that failed one way or another. 
+ * This can be because they actually failed for some reason, or because + * the database was dropped between us getting the database list and + * trying to process it. Get a fresh list of databases to detect the + * second case where the database was dropped before we had started + * processing it. If a database still exists, but enabling checksums + * failed then we fail the entire checksumming process and exit with an + * error. + */ + DatabaseList = BuildDatabaseList(); + + foreach(lc, FailedDatabases) + { + ChecksumHelperDatabase *db = (ChecksumHelperDatabase *) lfirst(lc); + bool found = false; + + foreach(lc2, DatabaseList) + { + ChecksumHelperDatabase *db2 = (ChecksumHelperDatabase *) lfirst(lc2); + + if (db->dboid == db2->dboid) + { + found = true; + ereport(WARNING, + (errmsg("failed to enable checksums in \"%s\"", + db->dbname))); + break; + } + } + + if (found) + found_failed = true; + else + { + ereport(LOG, + (errmsg("database \"%s\" has been dropped, skipping", + db->dbname))); + } + } + + if (found_failed) + { + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(); + ereport(ERROR, + (errmsg("checksumhelper failed to enable checksums in all databases, aborting"))); + } + + /* + * Force a checkpoint to get everything out to disk. XXX: this should + * probably not be an IMMEDIATE checkpoint, but leave it there for now for + * testing + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_IMMEDIATE); + + /* + * Everything has been processed, so flag checksums enabled. 
+ */ + SetDataChecksumsOn(); + + ereport(LOG, + (errmsg("checksums enabled, checksumhelper launcher shutting down"))); +} + +/* + * ChecksumHelperShmemSize + * Compute required space for checksumhelper-related shared memory + */ +Size +ChecksumHelperShmemSize(void) +{ + Size size; + + size = sizeof(ChecksumHelperShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * ChecksumHelperShmemInit + * Allocate and initialize checksumhelper-related shared memory + */ +void +ChecksumHelperShmemInit(void) +{ + bool found; + + ChecksumHelperShmem = (ChecksumHelperShmemStruct *) + ShmemInitStruct("ChecksumHelper Data", + ChecksumHelperShmemSize(), + &found); + + if (!found) + { + MemSet(ChecksumHelperShmem, 0, ChecksumHelperShmemSize()); + } +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the checksumhelper workers to add + * checksums to. + */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + HeapScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = heap_open(DatabaseRelationId, AccessShareLock); + + /* + * Before we do this, wait for all pending transactions to finish. This + * will ensure there are no concurrently running CREATE DATABASE, which + * could cause us to miss the creation of a database that was copied + * without checksums. 
+ */ + WaitForAllTransactionsToFinish(); + + scan = heap_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + ChecksumHelperDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (ChecksumHelperDatabase *) palloc(sizeof(ChecksumHelperDatabase)); + + db->dboid = HeapTupleGetOid(tup); + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + heap_endscan(scan); + heap_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +/* + * BuildRelationList + * Compile a list of all relations in the database + * + * If include_shared is true, both shared relations and local ones are + * returned, else all non-shared relations are returned. + * Temp tables are not included. + */ +static List * +BuildRelationList(bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + HeapScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = heap_open(RelationRelationId, AccessShareLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + ChecksumHelperRelation *relentry; + + if (pgc->relpersistence == 't') + continue; + + if (pgc->relisshared && !include_shared) + continue; + + /* + * Only include relation types that have local storage. 
+ */ + if (pgc->relkind == RELKIND_VIEW || + pgc->relkind == RELKIND_COMPOSITE_TYPE || + pgc->relkind == RELKIND_FOREIGN_TABLE) + continue; + + oldctx = MemoryContextSwitchTo(ctx); + relentry = (ChecksumHelperRelation *) palloc(sizeof(ChecksumHelperRelation)); + + relentry->reloid = HeapTupleGetOid(tup); + relentry->relkind = pgc->relkind; + + RelationList = lappend(RelationList, relentry); + + MemoryContextSwitchTo(oldctx); + } + + heap_endscan(scan); + heap_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * Main function for enabling checksums in a single database + */ +void +ChecksumHelperWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + ListCell *lc; + BufferAccessStrategy strategy; + bool aborted = false; + + pqsignal(SIGTERM, die); + pqsignal(SIGHUP, checksumhelper_sighup); + + BackgroundWorkerUnblockSignals(); + + init_ps_display(pgstat_get_backend_desc(B_CHECKSUMHELPER_WORKER), "", "", ""); + + elog(DEBUG1, "checksum worker starting for database oid %u", dboid); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, BGWORKER_BYPASS_ALLOWCONN); + + /* + * Enable vacuum cost delay, if any. + */ + if (checksumhelper_cost_delay >= 0) + VacuumCostDelay = checksumhelper_cost_delay; + if (checksumhelper_cost_limit >= 0) + VacuumCostLimit = checksumhelper_cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumPageHit = 0; + VacuumPageMiss = 0; + VacuumPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. 
+ */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(ChecksumHelperShmem->process_shared_catalogs); + foreach(lc, RelationList) + { + ChecksumHelperRelation *rel = (ChecksumHelperRelation *) lfirst(lc); + + if (!ProcessSingleRelationByOid(rel->reloid, strategy)) + { + aborted = true; + break; + } + } + + if (aborted) + { + ChecksumHelperShmem->success = ABORTED; + elog(DEBUG1, "checksum worker aborted in database oid %u", dboid); + return; + } + + ChecksumHelperShmem->success = SUCCESSFUL; + elog(DEBUG1, "checksum worker completed in database oid %u", dboid); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 084573e77c..bcd6086cea 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4132,6 +4132,11 @@ pgstat_get_backend_desc(BackendType backendType) case B_WAL_WRITER: backendDesc = "walwriter"; break; + case B_CHECKSUMHELPER_LAUNCHER: + backendDesc = "checksumhelper launcher"; + break; + case B_CHECKSUMHELPER_WORKER: + backendDesc = "checksumhelper worker"; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index a4b53b33cd..8b8d4d5f49 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -110,6 +110,7 @@ #include "port/pg_bswap.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/checksumhelper.h" #include "postmaster/fork_process.h" #include "postmaster/pgarch.h" #include "postmaster/postmaster.h" @@ -988,6 +989,17 @@ PostmasterMain(int argc, char *argv[]) ApplyLauncherRegister(); /* + * If checksums are set to pending, start the checksum helper launcher + * to start enabling checksums. 
+ */ + if (DataChecksumsInProgress()) + { + ereport(LOG, + (errmsg("data checksums in pending state, starting background worker to enable"))); + ChecksumHelperLauncherRegister(); + } + + /* * process any libraries that should be preloaded at postmaster start */ process_shared_preload_libraries(); diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 3f1eae38a9..2ab6afe99c 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1386,7 +1386,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf _tarWriteHeader(tarfilename, NULL, statbuf, false); - if (!noverify_checksums && DataChecksumsEnabled()) + if (!noverify_checksums && DataChecksumsNeedVerify()) { char *filename; diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 59c003de9c..30d80e7c54 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -199,6 +199,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0c86a581c0..853e1e472f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -27,6 +27,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/checksumhelper.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/slot.h" @@ -261,6 +262,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) WalSndShmemInit(); WalRcvShmemInit(); ApplyLauncherShmemInit(); + ChecksumHelperShmemInit(); /* * Set up other modules that need some shared memory space diff --git 
a/src/backend/storage/page/README b/src/backend/storage/page/README index 5127d98da3..5381016915 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -9,7 +9,10 @@ have a very low measured incidence according to research on large server farms, http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. -Current implementation requires this be enabled system-wide at initdb time. +Checksums can be enabled at initdb time, but can also be turned on and off +using pg_enable_data_checksums()/pg_disable_data_checksums() at runtime. When +enabled via pg_enable_data_checksums() the server must be restarted for the +checksumming to take effect. The checksum is not valid at all times on a data page!! The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index dfbda5458f..3de09f03a1 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -93,12 +93,22 @@ PageIsVerified(Page page, BlockNumber blkno) */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) - checksum_failure = true; + { + /* + * It is possible we get this failure because the user + * has disabled checksums, but we have not yet seen this + * in pg_control and therefore think we should verify it. + * To make sure we have seen any change, make a locked + * access to verify it as well. 
+ */ + if (DataChecksumsNeedVerifyLocked()) + checksum_failure = true; + } } /* @@ -1168,7 +1178,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) static char *pageCopy = NULL; /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return (char *) page; /* @@ -1195,7 +1205,7 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return; ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 859ef931e7..f84b916393 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -32,6 +32,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" @@ -59,6 +60,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/checksumhelper.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" #include "postmaster/walwriter.h" @@ -68,6 +70,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "storage/dsm_impl.h" #include "storage/standby.h" #include "storage/fd.h" @@ -421,6 +424,17 @@ static const struct config_enum_entry password_encryption_options[] = { }; /* + * data_checksum used to be a boolean, but was only set by initdb so there is + * no need to support variants of boolean input. 
+ */ +static const struct config_enum_entry data_checksum_options[] = { + {"on", PG_DATA_CHECKSUM_VERSION, true}, + {"off", 0, true}, + {"inprogress", PG_DATA_CHECKSUM_INPROGRESS_VERSION, true}, + {NULL, 0, false} +}; + +/* * Options for enum values stored in other modules */ extern const struct config_enum_entry wal_level_options[]; @@ -515,7 +529,7 @@ static int max_identifier_length; static int block_size; static int segment_size; static int wal_block_size; -static bool data_checksums; +static int data_checksums_tmp; /* only accessed locally! */ static bool integer_datetimes; static bool assert_enabled; @@ -589,6 +603,8 @@ const char *const config_group_names[] = gettext_noop("Resource Usage / Kernel Resources"), /* RESOURCES_VACUUM_DELAY */ gettext_noop("Resource Usage / Cost-Based Vacuum Delay"), + /* RESOURCES_CHECKSUMHELPER */ + gettext_noop("Resource Usage / Checksumhelper"), /* RESOURCES_BGWRITER */ gettext_noop("Resource Usage / Background Writer"), /* RESOURCES_ASYNCHRONOUS */ @@ -1706,17 +1722,6 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - - { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), NULL @@ -2205,6 +2210,27 @@ static struct config_int ConfigureNamesInt[] = }, { + {"checksumhelper_cost_delay", PGC_SIGHUP, RESOURCES_CHECKSUMHELPER, + gettext_noop("Checksum helper cost delay in milliseconds."), + NULL, + GUC_UNIT_MS + }, + &checksumhelper_cost_delay, + 20, -1, 100, + NULL, NULL, NULL + }, + + { + {"checksumhelper_cost_limit", PGC_SIGHUP, RESOURCES_CHECKSUMHELPER, + gettext_noop("Checksum helper cost amount available before napping."), + NULL + }, + &checksumhelper_cost_limit, + -1, -1, 10000, + NULL, NULL, 
NULL + }, + + { {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, gettext_noop("Sets the maximum number of simultaneously open files for each server process."), NULL @@ -4150,6 +4176,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether data checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &data_checksums_tmp, + 0, data_checksum_options, + NULL, NULL, show_data_checksums + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9e39baf466..9159643634 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -154,6 +154,11 @@ #vacuum_cost_page_dirty = 20 # 0-10000 credits #vacuum_cost_limit = 200 # 1-10000 credits +# - Checksumhelper - + +#checksumhelper_cost_delay = 20 # 0-100 milliseconds, -1 to use vacuum_cost_delay +#checksumhelper_cost_limit = -1 # 0-10000 credits, -1 to use vacuum_cost_limit + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 0fe98a550e..1a82e1ddad 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -11,6 +11,8 @@ #include "pg_upgrade.h" +#include "storage/bufpage.h" + #include /* @@ -591,6 +593,15 @@ check_control_data(ControlData *oldctrl, */ /* + * If checksums have been turned on in the old cluster, but the + * checksumhelper has yet to finish, then disallow upgrading. The user + * should either let the process finish, or turn off checksums, before + * retrying. 
+ */ + if (oldctrl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION) + pg_fatal("transition to data checksums not completed in old cluster\n"); + + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. */ diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 7e5e971294..449a703c47 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -226,7 +226,7 @@ typedef struct uint32 large_object; bool date_is_int; bool float8_pass_by_value; - bool data_checksum_version; + uint32 data_checksum_version; } ControlData; /* diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c index 28c975446e..c2e8b55109 100644 --- a/src/bin/pg_verify_checksums/pg_verify_checksums.c +++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c @@ -314,7 +314,10 @@ main(int argc, char *argv[]) printf(_("Data checksum version: %d\n"), ControlFile->data_checksum_version); printf(_("Files scanned: %" INT64_MODIFIER "d\n"), files); printf(_("Blocks scanned: %" INT64_MODIFIER "d\n"), blocks); - printf(_("Bad checksums: %" INT64_MODIFIER "d\n"), badblocks); + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION) + printf(_("Blocks left in progress: %" INT64_MODIFIER "d\n"), badblocks); + else + printf(_("Bad checksums: %" INT64_MODIFIER "d\n"), badblocks); if (badblocks > 0) return 1; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 421ba6d775..63438ec8f4 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -154,7 +154,7 @@ extern PGDLLIMPORT int wal_level; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. 
*/ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (DataChecksumsNeedWrite() || wal_log_hints) /* Do we need to WAL-log information required only for Hot Standby and logical replication? */ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -257,7 +257,14 @@ extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsNeedVerifyLocked(void); +extern bool DataChecksumsInProgress(void); +extern void SetDataChecksumsInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern const char *show_data_checksums(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 7c766836db..043efc89ed 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilenode.h" @@ -240,6 +241,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when checksum level is changed */ +typedef struct xl_checksum_state +{ + uint32 new_checksumtype; +} xl_checksum_state; + /* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ typedef struct xl_end_of_recovery { diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 773d9e6eba..33c59f9a63 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define 
XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_CHECKSUMS 0xC0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 40d54ed030..c57b1bb436 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10176,6 +10176,13 @@ proargnames => '{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float4_pass_by_value,float8_pass_by_value,data_page_checksum_version}', prosrc => 'pg_control_init' }, +{ oid => '3996', descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', + proparallel => 'u', prorettype => 'void', proargtypes => '', prosrc => 'pg_disable_data_checksums' }, +{ oid => '3998', descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', + proparallel => 'u', prorettype => 'void', proargtypes => '', prosrc => 'pg_enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239b..4ed9ed76cc 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -710,7 +710,9 @@ typedef enum BackendType B_STARTUP, B_WAL_RECEIVER, B_WAL_SENDER, - B_WAL_WRITER + B_WAL_WRITER, + B_CHECKSUMHELPER_LAUNCHER, + B_CHECKSUMHELPER_WORKER } BackendType; diff --git a/src/include/postmaster/checksumhelper.h b/src/include/postmaster/checksumhelper.h new file mode 100644 index 0000000000..a1ff73e31f --- /dev/null +++ b/src/include/postmaster/checksumhelper.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * checksumhelper.h + * header file for checksum helper background worker + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL 
Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/checksumhelper.h + * + *------------------------------------------------------------------------- + */ +#ifndef CHECKSUMHELPER_H +#define CHECKSUMHELPER_H + +/* Shared memory */ +extern Size ChecksumHelperShmemSize(void); +extern void ChecksumHelperShmemInit(void); + +/* Start the background processes for enabling checksums */ +bool ChecksumHelperLauncherRegister(void); + +/* Shutdown the background processes, if any */ +void ShutdownChecksumHelperIfRunning(void); + +/* Background worker entrypoints */ +void ChecksumHelperLauncherMain(Datum arg); +void ChecksumHelperWorkerMain(Datum arg); + +/* GUCs */ +extern int checksumhelper_cost_limit; +extern int checksumhelper_cost_delay; + +#endif /* CHECKSUMHELPER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 85dd10c45a..bd46bf2ce6 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -194,6 +194,7 @@ typedef PageHeaderData *PageHeader; */ #define PG_PAGE_LAYOUT_VERSION 4 #define PG_DATA_CHECKSUM_VERSION 1 +#define PG_DATA_CHECKSUM_INPROGRESS_VERSION 2 /* ---------------------------------------------------------------- * page support macros diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 668d9efd35..4d6bd12581 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -63,6 +63,7 @@ enum config_group RESOURCES_DISK, RESOURCES_KERNEL, RESOURCES_VACUUM_DELAY, + RESOURCES_CHECKSUMHELPER, RESOURCES_BGWRITER, RESOURCES_ASYNCHRONOUS, WAL, diff --git a/src/test/Makefile b/src/test/Makefile index efb206aa75..6469ac94a4 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,8 @@ subdir = src/test top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = perl regress isolation modules authentication recovery subscription +SUBDIRS = perl regress isolation modules authentication recovery subscription \ + checksum # Test suites that are not safe by default but can be run if selected # by the user via the whitespace-separated list in variable diff --git a/src/test/checksum/.gitignore b/src/test/checksum/.gitignore new file mode 100644 index 0000000000..871e943d50 --- /dev/null +++ b/src/test/checksum/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/checksum/Makefile b/src/test/checksum/Makefile new file mode 100644 index 0000000000..f3ad9dfae1 --- /dev/null +++ b/src/test/checksum/Makefile @@ -0,0 +1,24 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/checksum +# +# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/checksum/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/test/checksum +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check + diff --git a/src/test/checksum/README b/src/test/checksum/README new file mode 100644 index 0000000000..e3fbd2bdb5 --- /dev/null +++ b/src/test/checksum/README @@ -0,0 +1,22 @@ +src/test/checksum/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster with streaming replication. + +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they master or standby(s) for the purpose of +the tests. 
+ +NOTE: This requires the --enable-tap-tests argument to configure. diff --git a/src/test/checksum/t/001_standby_checksum.pl b/src/test/checksum/t/001_standby_checksum.pl new file mode 100644 index 0000000000..71f4cb3d7c --- /dev/null +++ b/src/test/checksum/t/001_standby_checksum.pl @@ -0,0 +1,104 @@ +# Test suite for testing enabling data checksums with streaming replication +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 10; + +my $MAX_TRIES = 30; + +# Initialize master node +my $node_master = get_new_node('master'); +$node_master->init(allows_streaming => 1); +$node_master->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_master->backup($backup_name); + +# Create streaming standby linking to master +my $node_standby_1 = get_new_node('standby_1'); +$node_standby_1->init_from_backup($node_master, $backup_name, + has_streaming => 1); +$node_standby_1->start; + +# Create some content on master to have un-checksummed data in the cluster +$node_master->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_master->wait_for_catchup($node_standby_1, 'replay', + $node_master->lsn('insert')); + +# Check that checksums are turned off +my $result = $node_master->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on master'); + +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on standby_1'); + +# Enable checksums for the cluster +$node_master->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Ensure that the master has switched to inprogress immediately +$result = $node_master->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 
"inprogress", 'ensure checksums are in progress on master'); + +# Wait for checksum enable to be replayed +$node_master->wait_for_catchup($node_standby_1, 'replay', $node_master->lsn('insert')); + +# Ensure that the standby has switched to inprogress +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "inprogress", 'ensure checksums are in progress on standby_1'); + +# Restart master to trigger background worker to enable checksums +$node_master->restart(); + +# Insert some more data which should be checksummed on INSERT +$node_master->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1,10000));"); + +# Wait for checksums enabled on the master +for (my $i = 0; $i < $MAX_TRIES; $i++) +{ + $result = $node_master->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); + last if ($result eq 'on'); + sleep(1); +} +is ($result, "on", 'ensure checksums are enabled on master'); + +# Wait for checksums enabled on the standby +for (my $i = 0; $i < $MAX_TRIES; $i++) +{ + $result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); + last if ($result eq 'on'); + sleep(1); +} +is ($result, "on", 'ensure checksums are enabled on standby'); + +$result = $node_master->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, "20000", 'ensure we can safely read all data with checksums'); + +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +$node_master->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +$result = $node_master->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are disabled on master'); + +# Wait for checksum disable to be replayed +$node_master->wait_for_catchup($node_standby_1, 'replay', 
$node_master->lsn('insert')); + +# Ensure that the standby has switched to off +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are disabled on standby_1'); + +$result = $node_master->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, "20000", 'ensure we can safely read all data without checksums');