From f246bba2d9090116f8914c20114ecc3f4b9daeea Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Tue, 18 Mar 2025 12:56:32 +0700
Subject: [PATCH v10 2/3] Use template file for parallel CRC computation

---
 src/port/pg_crc32c_armv8.c    | 65 +++++++++++++--------------
 src/port/pg_crc32c_parallel.h | 82 +++++++++++++++++++++++++++++++++++
 src/port/pg_crc32c_sb8.c      |  2 +
 src/port/pg_crc32c_sse42.c    | 65 ++++++---------------------
 4 files changed, 128 insertions(+), 86 deletions(-)
 create mode 100644 src/port/pg_crc32c_parallel.h

diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 0265a2a13d7..4767406bef3 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -18,13 +18,42 @@
 
 #include "port/pg_crc32c.h"
 
+#define DEBUG_CRC				/* XXX not for commit */
+
+static pg_crc32c pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len);
+
+
 pg_crc32c
 pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	pg_crc32c	crc0 = crc;
+
+#ifdef DEBUG_CRC
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+#endif
+
+/* min size to compute multiple segments in parallel */
+#define MIN_PARALLEL_LENGTH 600
+
+#define PG_CRC32C_1B(c, w) __crc32cb(c, w)
+#define PG_CRC32C_8B(c, w) __crc32cd(c, w)
+#include "pg_crc32c_parallel.h"
+
+	crc0 = pg_comp_crc32c_armv8_tail(crc0, p, len);
+
+#ifdef DEBUG_CRC
+	Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
+#endif
+
+	return crc0;
+}
+
+static pg_crc32c
+pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
-	const size_t min_blocklen = 42; /* Min size to consider interleaving */
-	const pg_crc32c orig_crc = crc; // XXX not for commit
 
 	/*
 	 * ARMv8 doesn't require alignment, but aligned memory access is
@@ -50,36 +79,6 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		p += 4;
 	}
 
-	/* See pg_crc32c_sse42.c for explanation */
-	while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend)
-	{
-		const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER);
-		const uint64 *in64 = (const uint64 *) (p);
-		pg_crc32c	crc0 = crc,
-					crc1 = 0,
-					crc2 = 0;
-		uint64		mul0,
-					mul1,
-					precompute;
-
-		for (int i = 0; i < block_len; i++, in64++)
-		{
-			crc0 = __crc32cd(crc0, *(in64));
-			crc1 = __crc32cd(crc1, *(in64 + block_len));
-			crc2 = __crc32cd(crc2, *(in64 + block_len * 2));
-		}
-
-		precompute = combine_crc_lookup[block_len - 1];
-		mul0 = pg_clmul(crc0, (uint32) precompute);
-		mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
-
-		crc0 = __crc32cd(0, mul0);
-		crc1 = __crc32cd(0, mul1);
-		crc = crc0 ^ crc1 ^ crc2;
-
-		p += block_len * CRC_BYTES_PER_ITER;
-	}
-
 	/* Process eight bytes at a time, as far as we can. */
 	while (p + 8 <= pend)
 	{
@@ -103,7 +102,5 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		crc = __crc32cb(crc, *p);
 	}
 
-	// XXX not for commit
-	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len));
 	return crc;
 }
diff --git a/src/port/pg_crc32c_parallel.h b/src/port/pg_crc32c_parallel.h
new file mode 100644
index 00000000000..caee564726e
--- /dev/null
+++ b/src/port/pg_crc32c_parallel.h
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_parallel.h
+ *	  Hardware-independent template for parallel CRC computation.
+ *
+ * This file is a code template, not an ordinary header: it expands to
+ * statements and is meant to be #included inside a function body, so it
+ * deliberately has no include guard.
+ *
+ * Before including it, define these macros:
+ *
+ *	  MIN_PARALLEL_LENGTH - min input length for taking the parallel path
+ *	  PG_CRC32C_1B(c, w) - fold the byte "w" into the CRC "c"
+ *	  PG_CRC32C_8B(c, w) - fold the 8-byte word "w" into the CRC "c"
+ *
+ * and have these local variables in scope, all of which are updated:
+ *
+ *	  p - pointer to the next unprocessed input byte
+ *	  len - number of input bytes remaining at "p"
+ *	  crc0 - the running CRC value
+ *
+ * CRC_MAX_BLOCK_LEN, CRC_BYTES_PER_ITER, combine_crc_lookup and pg_clmul()
+ * must also be visible at the point of inclusion.  Input left over
+ * afterwards (fewer than MIN_PARALLEL_LENGTH bytes) is the includer's job.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * src/port/pg_crc32c_parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+if (unlikely(len >= MIN_PARALLEL_LENGTH))
+{
+	/*
+	 * Align pointer regardless of architecture to avoid straddling cacheline
+	 * boundaries, since we issue three loads per loop iteration below.
+	 */
+	for (; (uintptr_t) p & 7; len--)
+		crc0 = PG_CRC32C_1B(crc0, *p++);
+
+	/*
+	 * A CRC instruction can be issued every cycle on many architectures, but
+	 * the latency of its result will take several cycles. We can take
+	 * advantage of this by dividing the input into 3 equal blocks and
+	 * computing the CRC of each independently.
+	 */
+	while (len >= MIN_PARALLEL_LENGTH)
+	{
+		const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
+									 len / CRC_BYTES_PER_ITER);
+		const uint64 *in64 = (const uint64 *) (p);
+		pg_crc32c	crc1 = 0,
+					crc2 = 0;
+		uint64		mul0,
+					mul1,
+					precompute;
+
+		for (int i = 0; i < block_len; i++, in64++)
+		{
+			crc0 = PG_CRC32C_8B(crc0, *(in64));
+			crc1 = PG_CRC32C_8B(crc1, *(in64 + block_len));
+			crc2 = PG_CRC32C_8B(crc2, *(in64 + block_len * 2));
+		}
+
+		/*
+		 * Combine the partial CRCs using carryless multiplication on
+		 * pre-computed length-specific constants.
+		 */
+		precompute = combine_crc_lookup[block_len - 1];
+		mul0 = pg_clmul(crc0, (uint32) precompute);
+		mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
+		crc0 = PG_CRC32C_8B(0, mul0);
+		crc0 ^= PG_CRC32C_8B(0, mul1);
+		crc0 ^= crc2;
+
+		p += block_len * CRC_BYTES_PER_ITER;
+		len -= block_len * CRC_BYTES_PER_ITER;
+	}
+}
diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c
index 004fe92d70b..d0ec8c5bed0 100644
--- a/src/port/pg_crc32c_sb8.c
+++ b/src/port/pg_crc32c_sb8.c
@@ -1169,6 +1169,8 @@ static const uint32 pg_crc32c_table[8][256] = {
 };
 
 
+/* platform-independent infrastructure for parallel CRC computation */
+
 /*
  * Carryless multiplication in software
  */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index f674d3f71d7..3fe8716601f 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -18,8 +18,7 @@
 
 #include "port/pg_crc32c.h"
 
-/* min size to compute multiple segments in parallel */
-#define MIN_PARALLEL_LENGTH 600
+#define DEBUG_CRC				/* XXX not for commit */
 
 static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len);
 
@@ -31,64 +30,26 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	const unsigned char *p = data;
 	pg_crc32c	crc0 = crc;
 
-	/* XXX not for commit */
+#ifdef DEBUG_CRC
 	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+#endif
 
 #if SIZEOF_VOID_P >= 8
-	if (unlikely(len >= MIN_PARALLEL_LENGTH))
-	{
-		/*
-		 * Align pointer to avoid straddling cacheline boundaries, since we
-		 * issue three loads per loop iteration below.
-		 */
-		for (; (uintptr_t) p & 7; len--)
-			crc0 = _mm_crc32_u8(crc0, *p++);
-
-		/*
-		 * A CRC instruction can be issued every cycle but the latency of its
-		 * result will take several cycles. We can take advantage of this by
-		 * dividing the input into 3 equal blocks and computing the CRC of
-		 * each independently.
-		 */
-		while (len >= MIN_PARALLEL_LENGTH)
-		{
-			const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
-										 len / CRC_BYTES_PER_ITER);
-			const uint64 *in64 = (const uint64 *) (p);
-			pg_crc32c	crc1 = 0,
-						crc2 = 0;
-			uint64		mul0,
-						mul1,
-						precompute;
-
-			for (int i = 0; i < block_len; i++, in64++)
-			{
-				crc0 = _mm_crc32_u64(crc0, *(in64));
-				crc1 = _mm_crc32_u64(crc1, *(in64 + block_len));
-				crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2));
-			}
-
-			/*
-			 * Combine the partial CRCs using carryless multiplication on
-			 * pre-computed length-specific constants.
-			 */
-			precompute = combine_crc_lookup[block_len - 1];
-			mul0 = pg_clmul(crc0, (uint32) precompute);
-			mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
-			crc0 = _mm_crc32_u64(0, mul0);
-			crc0 ^= _mm_crc32_u64(0, mul1);
-			crc0 ^= crc2;
-
-			p += block_len * CRC_BYTES_PER_ITER;
-			len -= block_len * CRC_BYTES_PER_ITER;
-		}
-	}
+
+/* min size to compute multiple segments in parallel */
+#define MIN_PARALLEL_LENGTH 600
+
+#define PG_CRC32C_1B(c, w) _mm_crc32_u8(c, w)
+#define PG_CRC32C_8B(c, w) _mm_crc32_u64(c, w)
+#include "pg_crc32c_parallel.h"
+
 #endif
 
 	crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len);
 
-	/* XXX not for commit */
+#ifdef DEBUG_CRC
 	Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
+#endif
 
 	return crc0;
 }
-- 
2.48.1