From ebbd072d558574f78bd4489c3431a13fd831f254 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Fri, 28 Feb 2025 16:27:30 +0700
Subject: [PATCH v12 3/6] Inline CRC computation for small fixed-length input

---
 src/include/port/pg_crc32c.h            | 29 ++++++++-
 src/include/port/pg_crc32c_sse42_impl.h | 83 +++++++++++++++++++++++++
 src/port/pg_crc32c_sse42.c              | 47 +-------------
 3 files changed, 114 insertions(+), 45 deletions(-)
 create mode 100644 src/include/port/pg_crc32c_sse42_impl.h

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..5ccc79295c0 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -43,12 +43,39 @@ typedef uint32 pg_crc32c;
 
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
+
+#include "port/pg_crc32c_sse42_impl.h"
+
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+/*
+ * pg_comp_crc32c_dispatch
+ *		Choose between the inlined and the out-of-line SSE 4.2 CRC-32C code.
+ *
+ * Feeds the "len" bytes at "data" into the running checksum "crc" and
+ * returns the updated value.  Both branches perform the same computation;
+ * only where the code is emitted differs.
+ */
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	if (__builtin_constant_p(len) && len < 64)
+	{
+		/*
+		 * For small constant inputs, inline the computation.  This allows
+		 * the compiler to unroll loops.
+		 */
+		return pg_comp_crc32c_sse42_inline(crc, data, len);
+	}
+	else
+		return pg_comp_crc32c_sse42(crc, data, len);
+}
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
diff --git a/src/include/port/pg_crc32c_sse42_impl.h b/src/include/port/pg_crc32c_sse42_impl.h
new file mode 100644
index 00000000000..e10ad777618
--- /dev/null
+++ b/src/include/port/pg_crc32c_sse42_impl.h
@@ -0,0 +1,83 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_sse42_impl.h
+ *	  Inline implementation of CRC computation using SSE 4.2
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_crc32c_sse42_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CRC32C_SSE42_IMPL_H
+#define PG_CRC32C_SSE42_IMPL_H
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+/*
+ * pg_comp_crc32c_sse42_inline
+ *		Update a running CRC-32C with the SSE 4.2 CRC32 instructions.
+ *
+ * Feeds the "len" bytes starting at "data" into "crc" and returns the
+ * updated value.  The input is consumed in the widest units available:
+ * eight bytes at a time on x86-64, then four bytes, then single bytes.
+ * "data" need not be aligned.
+ */
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
+static inline
+pg_crc32c
+pg_comp_crc32c_sse42_inline(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+#ifdef __x86_64__
+	while (p + 8 <= pend)
+	{
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#else
+
+	/*
+	 * Process four bytes at a time. (The eight byte instruction is not
+	 * available on the 32-bit x86 architecture).
+	 */
+	while (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#endif							/* __x86_64__ */
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+#endif							/* PG_CRC32C_SSE42_IMPL_H */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..6a35f7fdc67 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -17,54 +17,13 @@
 #include <nmmintrin.h>
 
 #include "port/pg_crc32c.h"
+#include "port/pg_crc32c_sse42_impl.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
-	const unsigned char *p = data;
-	const unsigned char *pend = p + len;
-
-	/*
-	 * Process eight bytes of data at a time.
-	 *
-	 * NB: We do unaligned accesses here. The Intel architecture allows that,
-	 * and performance testing didn't show any performance gain from aligning
-	 * the begin address.
-	 */
-#ifdef __x86_64__
-	while (p + 8 <= pend)
-	{
-		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
-		p += 8;
-	}
-
-	/* Process remaining full four bytes if any */
-	if (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#else
-
-	/*
-	 * Process four bytes at a time. (The eight byte instruction is not
-	 * available on the 32-bit x86 architecture).
-	 */
-	while (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#endif							/* __x86_64__ */
-
-	/* Process any remaining bytes one at a time. */
-	while (p < pend)
-	{
-		crc = _mm_crc32_u8(crc, *p);
-		p++;
-	}
-
-	return crc;
+	/* Out-of-line entry point; the work is done in pg_crc32c_sse42_impl.h. */
+	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
-- 
2.48.1