From e33633ba036ff521482fb24e8984b5865c8515c8 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 21 Jan 2024 15:33:22 +0700
Subject: [PATCH v15 4/4] WIP: comment edits

Clarify the comments on detecting zero bytes when hashing aligned C
strings, and rename zero_bytes_le to zero_byte_low to match.

Discussion: https://postgr.es/m/48e8f8bbe0be9c789f98776c7438244ab7a7cc63.camel%40j-davis.com
---
 src/include/common/hashfn_unstable.h | 38 ++++++++++++++++------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 8e829297fd..8c42e876be 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -209,26 +209,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 {
 	const char *const start = str;
 	int			remainder;
-	uint64		zero_bytes_le;
+	uint64		zero_byte_low;
 
 	Assert(PointerIsAligned(start, uint64));
+
+	/*
+	 * For every chunk of input, check for zero bytes before mixing into the
+	 * hash. The first chunk that has a zero byte must contain the NUL
+	 * terminator. We arrange for zero_byte_low to tell us not only that a
+	 * zero exists but also where it is, so we can hash the remaining bytes.
+	 *
+	 * The haszero64 calculation will set bits corresponding to the lowest
+	 * byte where a zero exists, so that suffices for little-endian machines.
+	 * For big-endian machines, we would need bits set for the highest zero
+	 * byte in the chunk, since the trailing junk past the terminator could
+	 * contain additional zeros. haszero64 does not give us that, so we
+	 * byteswap the chunk first.
+	 */
 	for (;;)
 	{
 		uint64		chunk = *(uint64 *) str;
 
-		/*
-		 * With little-endian representation, we can use this calculation,
-		 * which sets bits in the first byte in the result word that
-		 * corresponds to a zero byte in the original word. The rest of the
-		 * bytes are indeterminate, so cannot be used on big-endian machines
-		 * without either swapping or a bytewise check.
-		 */
 #ifdef WORDS_BIGENDIAN
-		zero_bytes_le = haszero64(pg_bswap64(chunk));
+		zero_byte_low = haszero64(pg_bswap64(chunk));
 #else
-		zero_bytes_le = haszero64(chunk);
+		zero_byte_low = haszero64(chunk);
 #endif
-		if (zero_bytes_le)
+		if (zero_byte_low)
 			break;
 
 		hs->accum = chunk;
@@ -237,12 +244,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	}
 
 	/*
-	 * For the last word, only use bytes up to the NUL for the hash. Bytes
-	 * with set bits will be 0x80, so calculate the first occurrence of a zero
-	 * byte within the input word by counting the number of trailing (because
-	 * little-endian) zeros and dividing the result by 8.
+	 * Bytes with set bits will be 0x80, so the position of the lowest set
+	 * bit will be one of 7, 15, ..., 63. Dividing that by BITS_PER_BYTE gives
+	 * the number of bytes in this last chunk that precede the zero byte.
 	 */
-	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+	remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;
 	fasthash_accum(hs, str, remainder);
 	str += remainder;
 
-- 
2.43.0