From 85d251c614bb8ea7692bb1f403a982a2c6c007da Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Tue, 6 Apr 2021 14:23:46 -0400 Subject: [PATCH] cfe-10-hint_over_cfe-09-test squash commit --- doc/src/sgml/config.sgml | 7 +- src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/transam/xlog.c | 4 + src/backend/access/transam/xloginsert.c | 24 +++++ src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 128 +++++++++++++++++------ src/include/access/xlog.h | 14 +-- src/include/access/xloginsert.h | 2 + src/include/catalog/pg_control.h | 1 + 9 files changed, 143 insertions(+), 44 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index d7b6e368d0..6bcd9c64df 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3093,9 +3093,10 @@ include_dir 'conf.d' - If data checksums are enabled, hint bit updates are always WAL-logged - and this setting is ignored. You can use this setting to test how much - extra WAL-logging would occur if your database had data checksums + If data checksums or cluster file encryption is enabled, + hint bit updates are always WAL-logged and this setting is + ignored. You can use this setting to test how much extra + WAL-logging would occur if your database had data checksums enabled. diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index e6090a9dad..d5474990d6 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -80,7 +80,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) appendStringInfoString(buf, xlrec->rp_name); } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + info == XLOG_ENCRYPTION_LSN) { /* no further information to print */ } @@ -184,6 +185,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_ENCRYPTION_LSN: + id = "ENCRYPTION_LSN"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 51165f6593..dba690bc5e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10353,6 +10353,10 @@ xlog_redo(XLogReaderState *record) UnlockReleaseBuffer(buffer); } } + else if (info == XLOG_ENCRYPTION_LSN) + { + /* nothing to do here */ + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 7052dc245e..c76ed1f881 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -980,6 +980,30 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) return recptr; } +/* + * This function returns either a WAL or fake LSN, for use by encryption. + */ +XLogRecPtr +LSNForEncryption(bool use_wal_lsn) +{ + if (use_wal_lsn) + { + int dummy = 0; + + Assert(FileEncryptionEnabled); + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_XLOG_ID, XLOG_ENCRYPTION_LSN); + } + else + return GetFakeLSNForUnloggedRel(); +} + /* * Write a WAL record containing a full image of a page. Caller is responsible * for writing the page to disk after calling this routine. diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 97be4b0f23..48400fde2a 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -224,6 +224,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_ENCRYPTION_LSN: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 852138f9c9..367fed5e85 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3766,11 +3766,12 @@ IncrBufferRefCount(Buffer buffer) * This is essentially the same as MarkBufferDirty, except: * * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * to write an XLOG_FPI_FOR_HINT record to protect against torn pages, or + * XLOG_ENCRYPTION_LSN to generate a new LSN for the page. * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. + * buffer's content lock. * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * (due to a race condition), so it cannot be used for important changes. */ void MarkBufferDirtyHint(Buffer buffer, bool buffer_std) @@ -3816,51 +3817,110 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * If we need to protect hint bit updates from torn writes, WAL-log a * full page image of the page. This full page image is only necessary * if the hint bit update is the first change to the page since the - * last checkpoint. + * last checkpoint. If cluster file encryption is enabled, we also + * need to generate new page LSNs for all other cases of page writes. * * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. */ - if (XLogHintBitIsNeeded() && - (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) + if (XLogHintBitIsNeeded()) { /* - * If we must not write WAL, due to a relfilenode-specific - * condition or being in recovery, don't dirty the page. We can - * set the hint, just not dirty the page as a result so the hint - * is lost when we evict the page or shutdown. + * If we must not write WAL during recovery so don't dirty the page. + * We can set the hint, just not dirty the page as a result so the + * hint is lost when we evict the page or shutdown. * * See src/backend/storage/page/README for longer discussion. + * XXX Can this be improved? */ - if (RecoveryInProgress() || - RelFileNodeSkippingWAL(bufHdr->tag.rnode)) + if (RecoveryInProgress() || + (RelFileNodeSkippingWAL(bufHdr->tag.rnode) && + !FileEncryptionEnabled)) return; /* - * If the block is already dirty because we either made a change - * or set a hint already, then we don't need to write a full page - * image. Note that aggressive cleaning of blocks dirtied by hint - * bit setting would increase the call rate. Bulk setting of hint - * bits would reduce the call rate... - * - * We must issue the WAL record before we mark the buffer dirty. - * Otherwise we might write the page before we write the WAL. That - * causes a race condition, since a checkpoint might occur between - * writing the WAL record and marking the buffer dirty. We solve - * that with a kluge, but one that is already in use during - * transaction commit to prevent race conditions. Basically, we - * simply prevent the checkpoint WAL record from being written - * until we have marked the buffer dirty. We don't start the - * checkpoint flush until we have marked dirty, so our checkpoint - * must flush the change to disk successfully or the checkpoint - * never gets written, so crash recovery will fix. + * Non-BM_PERMANENT objects don't need full page images because + * they are not restored. WAL-skipped relfilenodes should never + * have full page images generated. + */ + if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT && + !RelFileNodeSkippingWAL(bufHdr->tag.rnode)) + { + /* + * If the block is already dirty because we either made a change + * or set a hint already, then we don't need to write a full + * page image. Note that aggressive cleaning of blocks dirtied + * by hint bit setting would increase the call rate. Bulk + * setting of hint bits would reduce the call rate... + * + * We must issue the WAL record before we mark the buffer + * dirty. Otherwise we might write the page before we write + * the WAL. That causes a race condition, since a checkpoint + * might occur between writing the WAL record and marking the + * buffer dirty. We solve that with a kluge, but one that is + * already in use during transaction commit to prevent race + * conditions. Basically, we simply prevent the checkpoint WAL + * record from being written until we have marked the buffer + * dirty. We don't start the checkpoint flush until we have + * marked dirty, so our checkpoint must flush the change to disk + * successfully or the checkpoint never gets written, so crash + * recovery will fix. + * + * It's possible we may enter here without an xid, so it + * is essential that CreateCheckpoint waits for virtual + * transactions rather than full transactionids. + */ + MyProc->delayChkpt = delayChkpt = true; + lsn = XLogSaveBufferForHint(buffer, buffer_std); + } + + /* + * Above, for hint bit changes, we might have generated a new page + * LSN and a full-page WAL record for a page's first-clean-to-dirty + * during a checkpoint for permanent, non-WAL-skipped relfilenodes. + * If we didn't (the lsn variable is invalid), and we are + * doing cluster file encryption, we must generate a new + * page LSN here for either non-permanent relations or page + * non-first-clean-to-dirty during a checkpoint. (Cluster file + * encryption does not support WAL-skip relfilenodes.) We must + * update the page LSN even if the page with the hint bit change is + * later overwritten in the file system with an earlier version of + * the page during crash recovery. * - * It's possible we may enter here without an xid, so it is - * essential that CreateCheckpoint waits for virtual transactions - * rather than full transactionids. + * XXX Can we rely on the full page write above with no lock being + * held to avoid torn pages? Above, the LSN and page image are + * tied together, but here is just the page LSN update. */ - MyProc->delayChkpt = delayChkpt = true; - lsn = XLogSaveBufferForHint(buffer, buffer_std); + if (XLogRecPtrIsInvalid(lsn) && FileEncryptionEnabled) + { + /* + * For cluster file encryption we need a new page LSN because + * the LSN is used, with the page number and permanent flag, as + * part of the nonce, and the nonce must be unique for every + * page write. If we reencrypt a page with hint bit changes + * using the same nonce as previous writes, it would expose the + * hint bit change locations. To avoid this, we write a simple + * WAL record to advance the lsn, which can then be assigned to + * the page below. + * + * Above we are relying on the full page writes to revert + * any partial pages writes caused by this LSN change for + * permanent, non-WAL-skip relfilenodes. For non-permanent + * relations, they crash recover as empty. For WAL-skip + * relfilenodes, they recover with their original contents, so + * that works too. + */ + /* XXX Do we need the checkpoint delay here? */ + MyProc->delayChkpt = delayChkpt = true; + /* + * XXX We probably don't need to replay this WAL on the primary + * since the full page image is restored, but do we have + * to replay this on the repicas (for relations that are + * replicated)? + */ + lsn = LSNForEncryption( + pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT); + } } buf_state = LockBufHdr(bufHdr); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index ab66a660a5..52f5a42f50 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -203,13 +203,15 @@ extern PGDLLIMPORT int wal_level; /* * Is a full-page image needed for hint bit updates? * - * Normally, we don't WAL-log hint bit updates, but if checksums are enabled, - * we have to protect them against torn page writes. When you only set - * individual bits on a page, it's still consistent no matter what combination - * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log - * them if forced by wal_log_hints=on. + * Normally, we don't WAL-log hint bit updates, but if checksums or encryption + * is enabled, we have to protect them against torn page writes. When you + * only set individual bits on a page, it's still consistent no matter what + * combination of the bits make it to disk, but the checksum wouldn't match. + * Cluster file encryption requires a new LSN for hint bit changes, and can't + * tolerate torn pages. Also WAL-log them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() \ + (DataChecksumsEnabled() || FileEncryptionEnabled || wal_log_hints) /* Do we need to WAL-log information required only for Hot Standby and logical replication? */ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index f1d8c39edf..e7ee674dd4 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -61,6 +61,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forkNum, BlockNumber startblk, BlockNumber endblk, bool page_std); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); +extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); + extern void InitXLogInsert(void); #endif /* XLOGINSERT_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 6216d1faf5..e23693da4f 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_ENCRYPTION_LSN 0xC0 /* -- 2.20.1