From ec1d0780fdf5cce01235528701a4dc353d902995 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 25 Jun 2021 16:57:03 -0400 Subject: [PATCH 10/12] cfe-10-hint_over_cfe-09-test squash commit --- doc/src/sgml/config.sgml | 7 +- src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/transam/xlog.c | 4 + src/backend/access/transam/xloginsert.c | 24 +++++ src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 131 ++++++++++++++++------- src/bin/pg_alterckey/pg_alterckey.c | 76 +++++++++++-- src/include/access/xlog.h | 14 +-- src/include/access/xloginsert.h | 2 + src/include/catalog/pg_control.h | 1 + 10 files changed, 214 insertions(+), 52 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 69ee55a568..3da17c76f2 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3164,9 +3164,10 @@ include_dir 'conf.d' - If data checksums are enabled, hint bit updates are always WAL-logged - and this setting is ignored. You can use this setting to test how much - extra WAL-logging would occur if your database had data checksums + If data checksums or cluster file encryption is enabled, + hint bit updates are always WAL-logged and this setting is + ignored. You can use this setting to test how much extra + WAL-logging would occur if your database had data checksums enabled. diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 3fd7185f21..e08c364d03 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -80,7 +80,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) appendStringInfoString(buf, xlrec->rp_name); } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + info == XLOG_ENCRYPTION_LSN) { /* no further information to print */ } @@ -196,6 +197,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_ENCRYPTION_LSN: + id = "ENCRYPTION_LSN"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index de92e88d81..43429647b5 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8012,6 +8012,10 @@ xlog_redo(XLogReaderState *record) UnlockReleaseBuffer(buffer); } } + else if (info == XLOG_ENCRYPTION_LSN) + { + /* nothing to do here */ + } else if (info == XLOG_BACKUP_END) { /* nothing to do here, handled in xlogrecovery_redo() */ diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index f811523448..b4bbacae26 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1081,6 +1081,30 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) return recptr; } +/* + * This function returns either a WAL or fake LSN, for use by encryption. + */ +XLogRecPtr +LSNForEncryption(bool use_wal_lsn) +{ + if (use_wal_lsn) + { + int dummy = 0; + + Assert(FileEncryptionEnabled); + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_XLOG_ID, XLOG_ENCRYPTION_LSN); + } + else + return GetFakeLSNForUnloggedRel(); +} + /* * Write a WAL record containing a full image of a page. Caller is responsible * for writing the page to disk after calling this routine. diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 2cc0ac9eb0..6922f6c0e2 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -162,6 +162,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPI_FOR_HINT: case XLOG_FPI: case XLOG_OVERWRITE_CONTRECORD: + case XLOG_ENCRYPTION_LSN: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6b95381481..5ec3e405ab 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3991,11 +3991,12 @@ IncrBufferRefCount(Buffer buffer) * This is essentially the same as MarkBufferDirty, except: * * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * to write an XLOG_FPI_FOR_HINT record to protect against torn pages, or + * XLOG_ENCRYPTION_LSN to generate a new LSN for the page. * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. + * buffer's content lock. * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * (due to a race condition), so it cannot be used for important changes. */ void MarkBufferDirtyHint(Buffer buffer, bool buffer_std) @@ -4041,53 +4042,111 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * If we need to protect hint bit updates from torn writes, WAL-log a * full page image of the page. This full page image is only necessary * if the hint bit update is the first change to the page since the - * last checkpoint. + * last checkpoint. If cluster file encryption is enabled, we also + * need to generate new page LSNs for all other cases of page writes. * * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. */ - if (XLogHintBitIsNeeded() && - (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) + if (XLogHintBitIsNeeded()) { /* - * If we must not write WAL, due to a relfilelocator-specific - * condition or being in recovery, don't dirty the page. We can - * set the hint, just not dirty the page as a result so the hint - * is lost when we evict the page or shutdown. + * If we must not write WAL during recovery so don't dirty the page. + * We can set the hint, just not dirty the page as a result so the + * hint is lost when we evict the page or shutdown. * * See src/backend/storage/page/README for longer discussion. */ if (RecoveryInProgress() || - RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag))) + (RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)) && + !FileEncryptionEnabled)) return; /* - * If the block is already dirty because we either made a change - * or set a hint already, then we don't need to write a full page - * image. Note that aggressive cleaning of blocks dirtied by hint - * bit setting would increase the call rate. Bulk setting of hint - * bits would reduce the call rate... - * - * We must issue the WAL record before we mark the buffer dirty. - * Otherwise we might write the page before we write the WAL. That - * causes a race condition, since a checkpoint might occur between - * writing the WAL record and marking the buffer dirty. We solve - * that with a kluge, but one that is already in use during - * transaction commit to prevent race conditions. Basically, we - * simply prevent the checkpoint WAL record from being written - * until we have marked the buffer dirty. We don't start the - * checkpoint flush until we have marked dirty, so our checkpoint - * must flush the change to disk successfully or the checkpoint - * never gets written, so crash recovery will fix. - * - * It's possible we may enter here without an xid, so it is - * essential that CreateCheckPoint waits for virtual transactions - * rather than full transactionids. + * Non-BM_PERMANENT objects don't need full page images because + * they are not restored. WAL-skipped relfilenodes should never + * have full page images generated. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; - delayChkptFlags = true; - lsn = XLogSaveBufferForHint(buffer, buffer_std); + if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT && + !RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag))) + { + /* + * If the block is already dirty because we either made a change + * or set a hint already, then we don't need to write a full + * page image. Note that aggressive cleaning of blocks dirtied + * by hint bit setting would increase the call rate. Bulk + * setting of hint bits would reduce the call rate... + * + * We must issue the WAL record before we mark the buffer + * dirty. Otherwise we might write the page before we write + * the WAL. That causes a race condition, since a checkpoint + * might occur between writing the WAL record and marking the + * buffer dirty. We solve that with a kluge, but one that is + * already in use during transaction commit to prevent race + * conditions. Basically, we simply prevent the checkpoint WAL + * record from being written until we have marked the buffer + * dirty. We don't start the checkpoint flush until we have + * marked dirty, so our checkpoint must flush the change to disk + * successfully or the checkpoint never gets written, so crash + * recovery will fix. + * + * It's possible we may enter here without an xid, so it + * is essential that CreateCheckpoint waits for virtual + * transactions rather than full transactionids. + */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + delayChkptFlags = true; + lsn = XLogSaveBufferForHint(buffer, buffer_std); + } + + /* + * Above, for hint bit changes, we might have generated a new page + * LSN and a full-page WAL record for a page's first-clean-to-dirty + * during a checkpoint for permanent, non-WAL-skipped relfilenodes. + * If we didn't (the lsn variable is invalid), and we are + * doing cluster file encryption, we must generate a new + * page LSN here for either non-permanent relations or page + * non-first-clean-to-dirty during a checkpoint. (Cluster file + * encryption does not support WAL-skip relfilenodes.) We must + * update the page LSN even if the page with the hint bit change is + * later overwritten in the file system with an earlier version of + * the page during crash recovery. + * + * XXX Can we rely on the full page write above with no lock being + * held to avoid torn pages? Above, the LSN and page image are + * tied together, but here is just the page LSN update. + */ + if (XLogRecPtrIsInvalid(lsn) && FileEncryptionEnabled) + { + /* + * For cluster file encryption we need a new page LSN because + * the LSN is used, with the page number and permanent flag, as + * part of the nonce, and the nonce must be unique for every + * page write. If we reencrypt a page with hint bit changes + * using the same nonce as previous writes, it would expose the + * hint bit change locations. To avoid this, we write a simple + * WAL record to advance the lsn, which can then be assigned to + * the page below. + * + * Above we are relying on the full page writes to revert + * any partial pages writes caused by this LSN change for + * permanent, non-WAL-skip relfilenodes. For non-permanent + * relations, they crash recover as empty. For WAL-skip + * relfilenodes, they recover with their original contents, so + * that works too. + */ + /* XXX Do we need the checkpoint delay here? */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + delayChkptFlags = true; + /* + * XXX We probably don't need to replay this WAL on the primary + * since the full page image is restored, but do we have + * to replay this on the repicas (for relations that are + * replicated)? + */ + lsn = LSNForEncryption( + pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT); + } } buf_state = LockBufHdr(bufHdr); diff --git a/src/bin/pg_alterckey/pg_alterckey.c b/src/bin/pg_alterckey/pg_alterckey.c index 7e3d4ad915..c699c59761 100644 --- a/src/bin/pg_alterckey/pg_alterckey.c +++ b/src/bin/pg_alterckey/pg_alterckey.c @@ -38,10 +38,9 @@ #include "common/file_perm.h" #include "common/file_utils.h" -#include "common/hex.h" #include "common/restricted_token.h" -#include "crypto/kmgr.h" #include "common/logging.h" +#include "crypto/kmgr.h" #include "getopt_long.h" #include "pg_getopt.h" @@ -84,6 +83,9 @@ static void bzero_keys_and_exit(exit_action action); static void reencrypt_data_keys(void); static void install_new_keys(void); +static uint64 hex_decode(const char *src, size_t len, char *dst); + + static void usage(const char *progname) { @@ -511,8 +513,8 @@ retrieve_cluster_keys(void) (char *) cluster_key_hex, ALLOC_KMGR_CLUSTER_KEY_LEN, live_path, terminal_fd); - if (pg_hex_decode(cluster_key_hex, cluster_key_len, - (char *) old_cluster_key, KMGR_CLUSTER_KEY_LEN) != + if (hex_decode(cluster_key_hex, cluster_key_len, + (char *) old_cluster_key) != KMGR_CLUSTER_KEY_LEN) { pg_log_error("cluster key must be at %d hex bytes", KMGR_CLUSTER_KEY_LEN); @@ -534,8 +536,8 @@ retrieve_cluster_keys(void) (char *) cluster_key_hex, ALLOC_KMGR_CLUSTER_KEY_LEN, new_path, terminal_fd); - if (pg_hex_decode(cluster_key_hex, cluster_key_len, - (char *) new_cluster_key, KMGR_CLUSTER_KEY_LEN) != + if (hex_decode(cluster_key_hex, cluster_key_len, + (char *) new_cluster_key) != KMGR_CLUSTER_KEY_LEN) { pg_log_error("cluster key must be at %d hex bytes", KMGR_CLUSTER_KEY_LEN); @@ -722,3 +724,65 @@ bzero_keys_and_exit(exit_action action) /* return 0 or 1 */ exit(action != SUCCESS_EXIT); } + +/* + * HEX + */ + +static const int8 hexlookup[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static inline char +get_hex(const char *cp) +{ + unsigned char c = (unsigned char) *cp; + int res = -1; + + if (c < 127) + res = hexlookup[c]; + + if (res < 0) + pg_fatal("invalid hexadecimal digit: \"%s\"",cp); + + return (char) res; +} + +static uint64 +hex_decode(const char *src, size_t len, char *dst) +{ + const char *s, + *srcend; + char v1, + v2, + *p; + + srcend = src + len; + s = src; + p = dst; + while (s < srcend) + { + if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r') + { + s++; + continue; + } + v1 = get_hex(s) << 4; + s++; + if (s >= srcend) + pg_fatal("invalid hexadecimal data: odd number of digits"); + + v2 = get_hex(s); + s++; + *p++ = v1 | v2; + } + + return p - dst; +} diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 8f219fbacc..78def5753a 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -107,13 +107,15 @@ extern PGDLLIMPORT int wal_level; /* * Is a full-page image needed for hint bit updates? * - * Normally, we don't WAL-log hint bit updates, but if checksums are enabled, - * we have to protect them against torn page writes. When you only set - * individual bits on a page, it's still consistent no matter what combination - * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log - * them if forced by wal_log_hints=on. + * Normally, we don't WAL-log hint bit updates, but if checksums or encryption + * is enabled, we have to protect them against torn page writes. When you + * only set individual bits on a page, it's still consistent no matter what + * combination of the bits make it to disk, but the checksum wouldn't match. + * Cluster file encryption requires a new LSN for hint bit changes, and can't + * tolerate torn pages. Also WAL-log them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() \ + (DataChecksumsEnabled() || FileEncryptionEnabled || wal_log_hints) /* Do we need to WAL-log information required only for Hot Standby and logical replication? */ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 001ff2f521..dfb501a7c3 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -61,6 +61,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); +extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); + extern void InitXLogInsert(void); #endif /* XLOGINSERT_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index db7de77287..1c97ffd2cd 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -78,6 +78,7 @@ typedef struct CheckPoint #define XLOG_FPI 0xB0 /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 +#define XLOG_ENCRYPTION_LSN 0xE0 /* -- 2.37.0 (Apple Git-136)