From ebbd072d558574f78bd4489c3431a13fd831f254 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 28 Feb 2025 16:27:30 +0700
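Subject: [PATCH v12 3/6] Inline CRC computation for small fixed-length input

When the length passed to COMP_CRC32C is a compile-time constant below
64 bytes, compute the CRC inline so that the compiler can unroll the
loops. All other inputs still call pg_comp_crc32c_sse42(). The loop
itself moves to the new header pg_crc32c_sse42_impl.h, which both paths
share.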
---
 src/include/port/pg_crc32c.h            | 21 ++++++-
 src/include/port/pg_crc32c_sse42_impl.h | 74 +++++++++++++++++++++++++
 src/port/pg_crc32c_sse42.c              | 46 +--------------
 3 files changed, 96 insertions(+), 45 deletions(-)
 create mode 100644 src/include/port/pg_crc32c_sse42_impl.h

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..5ccc79295c0 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -43,12 +43,31 @@ typedef uint32 pg_crc32c;
 
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
+
+#include "pg_crc32c_sse42_impl.h"
+
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+static inline pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	/*
+	 * Only a small length known at compile time is expanded inline, since
+	 * that is what allows the compiler to unroll loops. Anything else,
+	 * including every length that is not a compile-time constant, goes
+	 * through the out-of-line function.
+	 */
+	if (!__builtin_constant_p(len) || len >= 64)
+	{
+		return pg_comp_crc32c_sse42(crc, data, len);
+	}
+	return pg_comp_crc32c_sse42_inline(crc, data, len);
+}
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
diff --git a/src/include/port/pg_crc32c_sse42_impl.h b/src/include/port/pg_crc32c_sse42_impl.h
new file mode 100644
index 00000000000..e10ad777618
--- /dev/null
+++ b/src/include/port/pg_crc32c_sse42_impl.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_sse42_impl.h
+ *	  Inline implementation of CRC computation using SSE 4.2
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_crc32c_sse42_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CRC32C_SSE42_IMPL_H
+#define PG_CRC32C_SSE42_IMPL_H
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+/* Fold the len bytes at data into crc using SSE 4.2; returns the new crc. */
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
+static inline pg_crc32c
+pg_comp_crc32c_sse42_inline(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Process eight bytes of data at a time. Bounds are tested as
+	 * "pend - p" so that no pointer past the end of the buffer is formed.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+#ifdef __x86_64__
+	while (pend - p >= 8)
+	{
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (pend - p >= 4)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#else
+	/*
+	 * Process four bytes at a time. (The eight byte instruction is not
+	 * available on the 32-bit x86 architecture).
+	 */
+	while (pend - p >= 4)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#endif							/* __x86_64__ */
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+#endif							/* PG_CRC32C_SSE42_IMPL_H */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..6a35f7fdc67 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -17,54 +17,12 @@
 #include <nmmintrin.h>
 
 #include "port/pg_crc32c.h"
+#include "port/pg_crc32c_sse42_impl.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
-	const unsigned char *p = data;
-	const unsigned char *pend = p + len;
-
-	/*
-	 * Process eight bytes of data at a time.
-	 *
-	 * NB: We do unaligned accesses here. The Intel architecture allows that,
-	 * and performance testing didn't show any performance gain from aligning
-	 * the begin address.
-	 */
-#ifdef __x86_64__
-	while (p + 8 <= pend)
-	{
-		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
-		p += 8;
-	}
-
-	/* Process remaining full four bytes if any */
-	if (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#else
-
-	/*
-	 * Process four bytes at a time. (The eight byte instruction is not
-	 * available on the 32-bit x86 architecture).
-	 */
-	while (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#endif							/* __x86_64__ */
-
-	/* Process any remaining bytes one at a time. */
-	while (p < pend)
-	{
-		crc = _mm_crc32_u8(crc, *p);
-		p++;
-	}
-
-	return crc;
+	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
-- 
2.48.1