From 61be097b9484204f0c6c5af64e6767e0b42649e1 Mon Sep 17 00:00:00 2001
From: "xiang.gao"
Date: Wed, 13 Sep 2023 15:13:37 +0800
Subject: [PATCH] PostgreSQL: CRC32C optimization

Crc32c Parallel computation optimization
Algorithm comes from Intel whitepaper: crc-iscsi-polynomial-crc32-instruction-paper
Input data is divided into three equal-sized blocks.
Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes.
One Block: 42(BLK_LEN) * 8 bytes

Crc32c unittest: https://gist.github.com/gaoxyt/138fd53ca1eead8102eeb9204067f7e4
Crc32c benchmark: https://gist.github.com/gaoxyt/4506c10fc06b3501445e32c4257113e9
It gets ~2x speedup compared to linear Arm crc32c instructions.

Signed-off-by: xiang.gao
Change-Id: If876bbca5bbc3940946a7d72e14fe9fdf54682c1
---
 config/c-compiler.m4              | 25 ++++++++
 configure                         | 59 ++++++++++++++++++-
 configure.ac                      | 22 +++++++-
 meson.build                       | 24 ++++++++
 src/include/pg_config.h.in        |  3 +
 src/include/port/pg_crc32c.h      | 19 ++++---
 src/port/meson.build              |  2 +
 src/port/pg_crc32c_armv8.c        | 94 +++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c | 49 +++++++++++++++-
 9 files changed, 285 insertions(+), 12 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5db02b2ab7..483d4724d1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -662,6 +662,31 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
 
+# PGAC_ARMV8_VMULL_INTRINSICS
+# ----------------------------
+# Check if the compiler supports the vmull_p64
+# intrinsic functions. These instructions
+# were first introduced in ARMv8 crypto Extension.
+#
+# An optional compiler flag can be passed as argument (e.g.
+# -march=armv8-a+crypto). If the intrinsics are supported, sets
+# pgac_armv8_vmull_intrinsics, and CFLAGS_VMULL.
+AC_DEFUN([PGAC_ARMV8_VMULL_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_armv8_vmull_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for vmull_p64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>],
+  [return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_armv8_vmull_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARMV8_VMULL_INTRINSICS
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
diff --git a/configure b/configure
index cfd968235f..9b6118164d 100755
--- a/configure
+++ b/configure
@@ -18038,6 +18038,44 @@ fi
 
 
 
+# Check for ARMv8 VMULL intrinsics to do polynomial multiplication
+#
+# Check if vmull_p64 intrinsics can be used with the compiler
+# flag -march=armv8-a+crypto.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto" >&5
+$as_echo_n "checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto... " >&6; }
+if ${pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -march=armv8-a+crypto"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+int
+main ()
+{
+return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=yes
+else
+  pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&5
+$as_echo "$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&6; }
+if test x"$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" = x"yes"; then
+  pgac_armv8_vmull_intrinsics=yes
+fi
+
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -18089,6 +18127,13 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
   fi
 fi
 
+# Use ARM VMULL if available and ARM CRC32C intrinsic is available too.
+if test x"$USE_ARMV8_VMULL" = x"" && (test x"$USE_ARMV8_CRC32C" = x"1" || test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"); then
+  if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then
+    USE_ARMV8_VMULL=1
+  fi
+fi
+
 # Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5
 $as_echo_n "checking which CRC-32C implementation to use... " >&6; }
@@ -18112,7 +18157,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+    PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
   else
@@ -18145,6 +18190,18 @@ $as_echo "slicing-by-8" >&6; }
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use ARM VMULL intrinsic" >&5
+$as_echo_n "checking whether to use ARM VMULL intrinsic... " >&6; }
+if test x"$USE_ARMV8_VMULL" = x"1"; then
+
+$as_echo "#define USE_ARMV8_VMULL 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index f220b379b3..71a84bb151 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2107,6 +2107,12 @@ PGAC_LOONGARCH_CRC32C_INTRINSICS()
 
 AC_SUBST(CFLAGS_CRC)
 
+# Check for ARMv8 VMULL intrinsics to do polynomial multiplication
+#
+# Check if vmull_p64 intrinsics can be used with the compiler
+# flag -march=armv8-a+crypto.
+PGAC_ARMV8_VMULL_INTRINSICS([-march=armv8-a+crypto])
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -2158,6 +2164,13 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
   fi
 fi
 
+# Use ARM VMULL if available and ARM CRC32C intrinsic is available too.
+if test x"$USE_ARMV8_VMULL" = x"" && (test x"$USE_ARMV8_CRC32C" = x"1" || test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"); then
+  if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then
+    USE_ARMV8_VMULL=1
+  fi
+fi
+
 # Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
 AC_MSG_CHECKING([which CRC-32C implementation to use])
 if test x"$USE_SSE42_CRC32C" = x"1"; then
@@ -2172,7 +2185,7 @@ else
     else
       if test x"$USE_ARMV8_CRC32C" = x"1"; then
         AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions)
       else
         if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2195,6 +2208,13 @@ else
 fi
 AC_SUBST(PG_CRC32C_OBJS)
 
+AC_MSG_CHECKING([whether to use ARM VMULL intrinsic])
+if test x"$USE_ARMV8_VMULL" = x"1"; then
+  AC_DEFINE(USE_ARMV8_VMULL, 1, [Define to 1 to use ARMv8 VMULL Extension.])
+  AC_MSG_RESULT(yes)
+else
+  AC_MSG_RESULT(no)
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/meson.build b/meson.build
index 2d516c8f37..f1615c3549 100644
--- a/meson.build
+++ b/meson.build
@@ -2101,6 +2101,30 @@ endif
 
 
 
+###############################################################
+# Check for ARMv8 VMULL intrinsics to do polynomial multiplication
+###############################################################
+
+if (host_cpu == 'arm' or host_cpu == 'aarch64')
+
+  prog = '''
+#include <arm_neon.h>
+
+int main(void)
+{
+  return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);
+}
+'''
+
+  if cc.links(prog, name: 'vmull_p64 with -march=armv8-a+crypto',
+      args: test_c_args + ['-march=armv8-a+crypto'])
+    # Use ARM VMULL Extension unconditionally
+    cdata.set('USE_ARMV8_VMULL', 1)
+  endif
+endif
+
+
+
 ###############################################################
 # Other CPU specific stuff
 ###############################################################
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8a2985567..65cd43e156 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -689,6 +689,9 @@
 /* Define to 1 to use ARMv8 CRC Extension with a runtime check. */
 #undef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use ARMv8 VMULL Extension. */
+#undef USE_ARMV8_VMULL
+
 /* Define to 1 to build with assertion checks. (--enable-cassert) */
 #undef USE_ASSERT_CHECKING
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index d085f1dc00..35eb689a3b 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,14 +49,20 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_ARMV8_CRC32C)
+#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 /* Use ARMv8 CRC Extension instructions. */
-
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+
+#if defined(USE_ARMV8_VMULL)
+#include <arm_neon.h>
+extern pg_crc32c pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -67,10 +73,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
+ * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
@@ -83,9 +89,6 @@ extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len)
 #ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 #endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index a0d0a9583a..35e347de59 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -89,7 +89,9 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
   # loongarch
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index d8fae510cf..672a4e417b 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -2,6 +2,7 @@
 *
 * pg_crc32c_armv8.c
 *	  Compute CRC-32C checksum using ARMv8 CRC Extension instructions
+*	  with ARMv8 VMULL Extension instructions or not
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -18,6 +19,99 @@
 
 #include "port/pg_crc32c.h"
 
+#if defined(USE_ARMV8_VMULL)
+#include <arm_neon.h>
+__attribute__((target("+crypto")))
+pg_crc32c
+pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * ARMv8 doesn't require alignment, but aligned memory access is
+	 * significantly faster. Process leading bytes so that the loop below
+	 * starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __crc32cb(crc, *p);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __crc32ch(crc, *(uint16 *) p);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __crc32cw(crc, *(uint32 *) p);
+		p += 4;
+	}
+
+/*
+ * Crc32c parallel computation Input data is divided into three
+ * equal-sized blocks. Block length : 42 words(42 * 8 bytes).
+ * CRC0: 0 ~ 41 * 8,
+ * CRC1: 42 * 8 ~ (42 * 2 - 1) * 8,
+ * CRC2: 42 * 2 * 8 ~ (42 * 3 - 1) * 8.
+ */
+	while (p + 1024 <= pend)
+	{
+#define BLOCK_LEN 42
+		const uint64_t *in64 = (const uint64_t *) (p);
+		uint32_t	crc0 = crc,
+					crc1 = 0,
+					crc2 = 0;
+
+		for (int i = 0; i < BLOCK_LEN; i++, in64++)
+		{
+			crc0 = __crc32cd(crc0, *(in64));
+			crc1 = __crc32cd(crc1, *(in64 + BLOCK_LEN));
+			crc2 = __crc32cd(crc2, *(in64 + BLOCK_LEN * 2));
+		}
+		in64 += BLOCK_LEN * 2;
+		crc0 = __crc32cd(0, vmull_p64(crc0, 0xcec3662e));
+		crc1 = __crc32cd(0, vmull_p64(crc1, 0xa60ce07b));
+		crc = crc0 ^ crc1 ^ crc2;
+
+		crc = __crc32cd(crc, *in64++);
+		crc = __crc32cd(crc, *in64++);
+
+		p += 1024;
+#undef BLOCK_LEN
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __crc32cd(crc, *(uint64 *) p);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __crc32cw(crc, *(uint32 *) p);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __crc32ch(crc, *(uint16 *) p);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __crc32cb(crc, *p);
+	}
+
+	return crc;
+}
+#endif
+
 pg_crc32c
 pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 {
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index 0fdddccaf7..2a3b8ba907 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -4,8 +4,8 @@
 *	  Choose between ARMv8 and software CRC-32C implementation.
 *
 * On first call, checks if the CPU we're running on supports the ARMv8
-* CRC Extension. If it does, use the special instructions for CRC-32C
-* computation. Otherwise, fall back to the pure software implementation
+* CRC Extension and VMULL Extension. If it does, use the special instructions
+* for CRC-32C computation. Otherwise, fall back to the pure software implementation
 * (slicing-by-8).
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
@@ -77,6 +77,36 @@ pg_crc32c_armv8_available(void)
 	return (result > 0);
 }
 
+#if defined(USE_ARMV8_VMULL)
+__attribute__((target("+crypto")))
+static bool
+pg_vmull_armv8_available(void)
+{
+	int			result;
+
+	pqsignal(SIGILL, illegal_instruction_handler);
+	if (sigsetjmp(illegal_instruction_jump, 1) == 0)
+	{
+		result = ((uint64_t) vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);
+	}
+	else
+	{
+		/* We got the SIGILL trap */
+		result = -1;
+	}
+	pqsignal(SIGILL, SIG_DFL);
+
+#ifndef FRONTEND
+	/* We don't expect this case, so complain loudly */
+	if (result == 0)
+		elog(ERROR, "vmull_p64 hardware results error");
+
+	elog(DEBUG1, "using armv8 vmull_p64 hardware = %d", (result > 0));
+#endif
+	return (result > 0);
+}
+#endif
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -85,9 +115,24 @@ static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
 	if (pg_crc32c_armv8_available())
+	{
+#if defined(USE_ARMV8_VMULL)
+		if (pg_vmull_armv8_available())
+		{
+			pg_comp_crc32c = pg_comp_crc32c_with_vmull_armv8;
+		}
+		else
+		{
+			pg_comp_crc32c = pg_comp_crc32c_armv8;
+		}
+#else
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
+#endif
+	}
 	else
+	{
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
+	}
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.34.1