From ec1d0780fdf5cce01235528701a4dc353d902995 Mon Sep 17 00:00:00 2001
From: Bruce Momjian <bruce@momjian.us>
Date: Fri, 25 Jun 2021 16:57:03 -0400
Subject: [PATCH 10/12] cfe-10-hint_over_cfe-09-test squash commit

---
 doc/src/sgml/config.sgml                 |   7 +-
 src/backend/access/rmgrdesc/xlogdesc.c   |   6 +-
 src/backend/access/transam/xlog.c        |   4 +
 src/backend/access/transam/xloginsert.c  |  24 +++++
 src/backend/replication/logical/decode.c |   1 +
 src/backend/storage/buffer/bufmgr.c      | 131 ++++++++++++++++-------
 src/bin/pg_alterckey/pg_alterckey.c      |  76 +++++++++++--
 src/include/access/xlog.h                |  14 +--
 src/include/access/xloginsert.h          |   2 +
 src/include/catalog/pg_control.h         |   1 +
 10 files changed, 214 insertions(+), 52 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 69ee55a568..3da17c76f2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3164,9 +3164,10 @@ include_dir 'conf.d'
        </para>
 
        <para>
-        If data checksums are enabled, hint bit updates are always WAL-logged
-        and this setting is ignored. You can use this setting to test how much
-        extra WAL-logging would occur if your database had data checksums
+        If data checksums or cluster file encryption is enabled,
+        hint bit updates are always WAL-logged and this setting is
+        ignored. You can use this setting to test how much extra
+        WAL-logging would occur if your database had data checksums
         enabled.
        </para>
 
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 3fd7185f21..e08c364d03 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -80,7 +80,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 
 		appendStringInfoString(buf, xlrec->rp_name);
 	}
-	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+			 info == XLOG_ENCRYPTION_LSN)
 	{
 		/* no further information to print */
 	}
@@ -196,6 +197,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_ENCRYPTION_LSN:
+			id = "ENCRYPTION_LSN";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index de92e88d81..43429647b5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8012,6 +8012,10 @@ xlog_redo(XLogReaderState *record)
 			UnlockReleaseBuffer(buffer);
 		}
 	}
+	else if (info == XLOG_ENCRYPTION_LSN)
+	{
+		/* nothing to do here */
+	}
 	else if (info == XLOG_BACKUP_END)
 	{
 		/* nothing to do here, handled in xlogrecovery_redo() */
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index f811523448..b4bbacae26 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -1081,6 +1081,30 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 	return recptr;
 }
 
+/*
+ * This function returns either a WAL or fake LSN, for use by encryption.
+ */
+XLogRecPtr
+LSNForEncryption(bool use_wal_lsn)
+{
+	if (use_wal_lsn)
+	{
+		int			dummy = 0;
+
+		Assert(FileEncryptionEnabled);
+		/*
+		 * Records other than SWITCH_WAL must have content. We use an integer 0 to
+		 * follow the restriction.
+		 */
+		XLogBeginInsert();
+		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		return XLogInsert(RM_XLOG_ID, XLOG_ENCRYPTION_LSN);
+	}
+	else
+		return GetFakeLSNForUnloggedRel();
+}
+
 /*
  * Write a WAL record containing a full image of a page. Caller is responsible
  * for writing the page to disk after calling this routine.
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 2cc0ac9eb0..6922f6c0e2 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -162,6 +162,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_ENCRYPTION_LSN:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6b95381481..5ec3e405ab 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3991,11 +3991,12 @@ IncrBufferRefCount(Buffer buffer)
  * This is essentially the same as MarkBufferDirty, except:
  *
  * 1. The caller does not write WAL; so if checksums are enabled, we may need
- *	  to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
+ *    to write an XLOG_FPI_FOR_HINT record to protect against torn pages, or
+ *    XLOG_ENCRYPTION_LSN to generate a new LSN for the page.
  * 2. The caller might have only share-lock instead of exclusive-lock on the
- *	  buffer's content lock.
+ *    buffer's content lock.
  * 3. This function does not guarantee that the buffer is always marked dirty
- *	  (due to a race condition), so it cannot be used for important changes.
+ *    (due to a race condition), so it cannot be used for important changes.
  */
 void
 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
@@ -4041,53 +4042,111 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		 * If we need to protect hint bit updates from torn writes, WAL-log a
 		 * full page image of the page. This full page image is only necessary
 		 * if the hint bit update is the first change to the page since the
-		 * last checkpoint.
+		 * last checkpoint.  If cluster file encryption is enabled, we also
+		 * need to generate new page LSNs for all other cases of page writes.
 		 *
 		 * We don't check full_page_writes here because that logic is included
 		 * when we call XLogInsert() since the value changes dynamically.
 		 */
-		if (XLogHintBitIsNeeded() &&
-			(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
+		if (XLogHintBitIsNeeded())
 		{
 			/*
-			 * If we must not write WAL, due to a relfilelocator-specific
-			 * condition or being in recovery, don't dirty the page.  We can
-			 * set the hint, just not dirty the page as a result so the hint
-			 * is lost when we evict the page or shutdown.
+ 			 * If we must not write WAL during recovery so don't dirty the page.
+ 			 * We can set the hint, just not dirty the page as a result so the
+ 			 * hint is lost when we evict the page or shutdown.
 			 *
 			 * See src/backend/storage/page/README for longer discussion.
 			 */
 			if (RecoveryInProgress() ||
-				RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
+				(RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)) &&
+				 !FileEncryptionEnabled))
 				return;
 
 			/*
-			 * If the block is already dirty because we either made a change
-			 * or set a hint already, then we don't need to write a full page
-			 * image.  Note that aggressive cleaning of blocks dirtied by hint
-			 * bit setting would increase the call rate. Bulk setting of hint
-			 * bits would reduce the call rate...
-			 *
-			 * We must issue the WAL record before we mark the buffer dirty.
-			 * Otherwise we might write the page before we write the WAL. That
-			 * causes a race condition, since a checkpoint might occur between
-			 * writing the WAL record and marking the buffer dirty. We solve
-			 * that with a kluge, but one that is already in use during
-			 * transaction commit to prevent race conditions. Basically, we
-			 * simply prevent the checkpoint WAL record from being written
-			 * until we have marked the buffer dirty. We don't start the
-			 * checkpoint flush until we have marked dirty, so our checkpoint
-			 * must flush the change to disk successfully or the checkpoint
-			 * never gets written, so crash recovery will fix.
-			 *
-			 * It's possible we may enter here without an xid, so it is
-			 * essential that CreateCheckPoint waits for virtual transactions
-			 * rather than full transactionids.
+			 * Non-BM_PERMANENT objects don't need full page images because
+			 * they are not restored.  WAL-skipped relfilenodes should never
+			 * have full page images generated.
 			 */
-			Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
-			MyProc->delayChkptFlags |= DELAY_CHKPT_START;
-			delayChkptFlags = true;
-			lsn = XLogSaveBufferForHint(buffer, buffer_std);
+			if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT &&
+				!RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
+			{
+				/*
+				 * If the block is already dirty because we either made a change
+				 * or set a hint already, then we don't need to write a full
+				 * page image.  Note that aggressive cleaning of blocks dirtied
+				 * by hint bit setting would increase the call rate. Bulk
+				 * setting of hint bits would reduce the call rate...
+				 *
+				 * We must issue the WAL record before we mark the buffer
+				 * dirty.  Otherwise we might write the page before we write
+				 * the WAL. That causes a race condition, since a checkpoint
+				 * might occur between writing the WAL record and marking the
+				 * buffer dirty. We solve that with a kluge, but one that is
+				 * already in use during transaction commit to prevent race
+				 * conditions. Basically, we simply prevent the checkpoint WAL
+				 * record from being written until we have marked the buffer
+				 * dirty. We don't start the checkpoint flush until we have
+				 * marked dirty, so our checkpoint must flush the change to disk
+				 * successfully or the checkpoint never gets written, so crash
+				 * recovery will fix.
+				 *
+				 * It's possible we may enter here without an xid, so it
+				 * is essential that CreateCheckpoint waits for virtual
+				 * transactions rather than full transactionids.
+				 */
+				MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+				delayChkptFlags = true;
+				lsn = XLogSaveBufferForHint(buffer, buffer_std);
+			}
+
+			/*
+			 * Above, for hint bit changes, we might have generated a new page
+			 * LSN and a full-page WAL record for a page's first-clean-to-dirty
+			 * during a checkpoint for permanent, non-WAL-skipped relfilenodes.
+			 * If we didn't (the lsn variable is invalid), and we are
+			 * doing cluster file encryption, we must generate a new
+			 * page LSN here for either non-permanent relations or page
+			 * non-first-clean-to-dirty during a checkpoint.  (Cluster file
+			 * encryption does not support WAL-skip relfilenodes.)  We must
+			 * update the page LSN even if the page with the hint bit change is
+			 * later overwritten in the file system with an earlier version of
+			 * the page during crash recovery.
+  			 *
+			 * XXX Can we rely on the full page write above with no lock being
+			 * held to avoid torn pages?  Above, the LSN and page image are
+			 * tied together, but here is just the page LSN update.
+  			 */
+			if (XLogRecPtrIsInvalid(lsn) && FileEncryptionEnabled)
+			{
+				/*
+				 * For cluster file encryption we need a new page LSN because
+				 * the LSN is used, with the page number and permanent flag, as
+				 * part of the nonce, and the nonce must be unique for every
+				 * page write.  If we reencrypt a page with hint bit changes
+				 * using the same nonce as previous writes, it would expose the
+				 * hint bit change locations.  To avoid this, we write a simple
+				 * WAL record to advance the lsn, which can then be assigned to
+				 * the page below.
+				 *
+				 * Above we are relying on the full page writes to revert
+				 * any partial pages writes caused by this LSN change for
+				 * permanent, non-WAL-skip relfilenodes.  For non-permanent
+				 * relations, they crash recover as empty.  For WAL-skip
+				 * relfilenodes, they recover with their original contents, so
+				 * that works too.
+				 */
+				/* XXX Do we need the checkpoint delay here? */
+				MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+				delayChkptFlags = true;
+				/*
+				 * XXX We probably don't need to replay this WAL on the primary
+				 * since the full page image is restored, but do we have
+				 * to replay this on the repicas (for relations that are
+				 * replicated)?
+				 */
+				lsn = LSNForEncryption(
+						pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT);
+			}
 		}
 
 		buf_state = LockBufHdr(bufHdr);
diff --git a/src/bin/pg_alterckey/pg_alterckey.c b/src/bin/pg_alterckey/pg_alterckey.c
index 7e3d4ad915..c699c59761 100644
--- a/src/bin/pg_alterckey/pg_alterckey.c
+++ b/src/bin/pg_alterckey/pg_alterckey.c
@@ -38,10 +38,9 @@
 
 #include "common/file_perm.h"
 #include "common/file_utils.h"
-#include "common/hex.h"
 #include "common/restricted_token.h"
-#include "crypto/kmgr.h"
 #include "common/logging.h"
+#include "crypto/kmgr.h"
 #include "getopt_long.h"
 #include "pg_getopt.h"
 
@@ -84,6 +83,9 @@ static void bzero_keys_and_exit(exit_action action);
 static void reencrypt_data_keys(void);
 static void install_new_keys(void);
 
+static uint64 hex_decode(const char *src, size_t len, char *dst);
+
+
 static void
 usage(const char *progname)
 {
@@ -511,8 +513,8 @@ retrieve_cluster_keys(void)
 												   (char *) cluster_key_hex,
 												   ALLOC_KMGR_CLUSTER_KEY_LEN,
 												   live_path, terminal_fd);
-	if (pg_hex_decode(cluster_key_hex, cluster_key_len,
-					  (char *) old_cluster_key, KMGR_CLUSTER_KEY_LEN) !=
+	if (hex_decode(cluster_key_hex, cluster_key_len,
+					  (char *) old_cluster_key) !=
 		KMGR_CLUSTER_KEY_LEN)
 	{
 		pg_log_error("cluster key must be at %d hex bytes", KMGR_CLUSTER_KEY_LEN);
@@ -534,8 +536,8 @@ retrieve_cluster_keys(void)
 												   (char *) cluster_key_hex,
 												   ALLOC_KMGR_CLUSTER_KEY_LEN,
 												   new_path, terminal_fd);
-	if (pg_hex_decode(cluster_key_hex, cluster_key_len,
-					  (char *) new_cluster_key, KMGR_CLUSTER_KEY_LEN) !=
+	if (hex_decode(cluster_key_hex, cluster_key_len,
+					  (char *) new_cluster_key) !=
 		KMGR_CLUSTER_KEY_LEN)
 	{
 		pg_log_error("cluster key must be at %d hex bytes", KMGR_CLUSTER_KEY_LEN);
@@ -722,3 +724,65 @@ bzero_keys_and_exit(exit_action action)
 	/* return 0 or 1 */
 	exit(action != SUCCESS_EXIT);
 }
+
+/*
+ * HEX
+ */
+
+static const int8 hexlookup[128] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+static inline char
+get_hex(const char *cp)
+{
+	unsigned char c = (unsigned char) *cp;
+	int			res = -1;
+
+	if (c < 127)
+		res = hexlookup[c];
+
+	if (res < 0)
+		pg_fatal("invalid hexadecimal digit: \"%s\"",cp);
+
+	return (char) res;
+}
+
+static uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+	const char *s,
+			   *srcend;
+	char		v1,
+				v2,
+			   *p;
+
+	srcend = src + len;
+	s = src;
+	p = dst;
+	while (s < srcend)
+	{
+		if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
+		{
+			s++;
+			continue;
+		}
+		v1 = get_hex(s) << 4;
+		s++;
+		if (s >= srcend)
+			pg_fatal("invalid hexadecimal data: odd number of digits");
+
+		v2 = get_hex(s);
+		s++;
+		*p++ = v1 | v2;
+	}
+
+	return p - dst;
+}
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 8f219fbacc..78def5753a 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -107,13 +107,15 @@ extern PGDLLIMPORT int wal_level;
 /*
  * Is a full-page image needed for hint bit updates?
  *
- * Normally, we don't WAL-log hint bit updates, but if checksums are enabled,
- * we have to protect them against torn page writes.  When you only set
- * individual bits on a page, it's still consistent no matter what combination
- * of the bits make it to disk, but the checksum wouldn't match.  Also WAL-log
- * them if forced by wal_log_hints=on.
+ * Normally, we don't WAL-log hint bit updates, but if checksums or encryption
+ * is enabled, we have to protect them against torn page writes.  When you
+ * only set individual bits on a page, it's still consistent no matter what
+ * combination of the bits make it to disk, but the checksum wouldn't match.
+ * Cluster file encryption requires a new LSN for hint bit changes, and can't
+ * tolerate torn pages.  Also WAL-log them if forced by wal_log_hints=on.
  */
-#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+#define XLogHintBitIsNeeded() \
+		(DataChecksumsEnabled() || FileEncryptionEnabled || wal_log_hints)
 
 /* Do we need to WAL-log information required only for Hot Standby and logical replication? */
 #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index 001ff2f521..dfb501a7c3 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -61,6 +61,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forknum,
 							  BlockNumber startblk, BlockNumber endblk, bool page_std);
 extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
 
+extern XLogRecPtr LSNForEncryption(bool use_wal_lsn);
+
 extern void InitXLogInsert(void);
 
 #endif							/* XLOGINSERT_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index db7de77287..1c97ffd2cd 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xB0
 /* 0xC0 is used in Postgres 9.5-11 */
 #define XLOG_OVERWRITE_CONTRECORD		0xD0
+#define XLOG_ENCRYPTION_LSN				0xE0
 
 
 /*
-- 
2.37.0 (Apple Git-136)