From e33633ba036ff521482fb24e8984b5865c8515c8 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 21 Jan 2024 15:33:22 +0700 Subject: [PATCH v15 4/4] WIP: comment edits Clarify detection of zero bytes when hashing aligned C strings Discussion: https://postgr.es/m/48e8f8bbe0be9c789f98776c7438244ab7a7cc63.camel%40j-davis.com --- src/include/common/hashfn_unstable.h | 38 ++++++++++++++++------------ 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h index 8e829297fd..8c42e876be 100644 --- a/src/include/common/hashfn_unstable.h +++ b/src/include/common/hashfn_unstable.h @@ -209,26 +209,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) { const char *const start = str; int remainder; - uint64 zero_bytes_le; + uint64 zero_byte_low; Assert(PointerIsAligned(start, uint64)); + + /* + * For every chunk of input, check for zero bytes before mixing into the + * hash. The chunk with zeros must contain the NUL terminator. We arrange + * so that zero_byte_low tells us not only that a zero exists, but also + * where it is, so we can hash the remainder of the string. + * + * The haszero64 calculation will set bits corresponding to the lowest + * byte where a zero exists, so that suffices for little-endian machines. + * For big-endian machines, we would need bits set for the highest zero + * byte in the chunk, since the trailing junk past the terminator could + * contain additional zeros. haszero64 does not give us that, so we + * byteswap the chunk first. + */ for (;;) { uint64 chunk = *(uint64 *) str; - /* - * With little-endian representation, we can use this calculation, - * which sets bits in the first byte in the result word that - * corresponds to a zero byte in the original word. The rest of the - * bytes are indeterminate, so cannot be used on big-endian machines - * without either swapping or a bytewise check. 
- */ #ifdef WORDS_BIGENDIAN - zero_bytes_le = haszero64(pg_bswap64(chunk)); + zero_byte_low = haszero64(pg_bswap64(chunk)); #else - zero_bytes_le = haszero64(chunk); + zero_byte_low = haszero64(chunk); #endif - if (zero_bytes_le) + if (zero_byte_low) break; hs->accum = chunk; @@ -237,12 +244,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) } /* - * For the last word, only use bytes up to the NUL for the hash. Bytes - * with set bits will be 0x80, so calculate the first occurrence of a zero - * byte within the input word by counting the number of trailing (because - * little-endian) zeros and dividing the result by 8. + * Bytes with set bits will be 0x80, so the number of trailing zeros will + * be one of 7, 15, ..., 63. We turn this into the byte position by + * dividing by 8. */ - remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE; + remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE; fasthash_accum(hs, str, remainder); str += remainder; -- 2.43.0