From 57668e64b861b9a722e30d452967376231841e59 Mon Sep 17 00:00:00 2001 From: Chiranmoy Bhattacharya Date: Wed, 19 Feb 2025 12:30:47 +0530 Subject: [PATCH v4] SVE support for popcount and popcount masked --- config/c-compiler.m4 | 36 ++++++++++ configure | 56 +++++++++++++++ configure.ac | 9 +++ meson.build | 33 +++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 13 ++++ src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_bitutils.c | 44 +++++++++++- src/port/pg_popcount_sve.c | 123 +++++++++++++++++++++++++++++++++ 10 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 src/port/pg_popcount_sve.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c13..c3c2d6fe29d 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -704,3 +704,39 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_ARM_SVE_POPCNT_INTRINSICS +# ------------------------------ +# Check if the compiler supports the ARM SVE popcount instructions using the +# svdup_u64, svwhilelt_b8, svcntb, svaddv, svadd_x, svcnt_x, svld1, +# svptrue_b64 and svand_x intrinsic functions. +# +# If the intrinsics are supported, sets pgac_arm_sve_popcnt_intrinsics. 
+AC_DEFUN([PGAC_ARM_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x and other intrinsics], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>
+  #if defined(__has_attribute) && __has_attribute(target)
+  __attribute__((target("arch=armv8-a+sve")))
+  #endif
+  static int sve_popcount_test(void)
+  {
+    int popcnt = 0;
+    const char buf@<:@sizeof(uint64_t)@:>@;
+    svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64();
+    svuint64_t accum = svdup_u64(0), vec;
+    if (svcntb() > 0)
+      popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf)));
+    vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0);
+    accum = svadd_x(pred64, accum, svcnt_x(pred64, vec));
+    popcnt += svaddv(pred64, accum);
+    return popcnt;
+  }],
+  [return sve_popcount_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_arm_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_SVE_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 0ffcaeb4367..b227b826092 100755
--- a/configure
+++ b/configure
@@ -17049,6 +17049,62 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
 fi
 fi
 
+# Check for ARM SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x and other intrinsics" >&5
+$as_echo_n "checking for svcnt_x and other intrinsics... " >&6; }
+if ${pgac_cv_arm_sve_popcnt_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+  #if defined(__has_attribute) && __has_attribute(target)
+  __attribute__((target("arch=armv8-a+sve")))
+  #endif
+  static int sve_popcount_test(void)
+  {
+    int popcnt = 0;
+    const char buf[sizeof(uint64_t)];
+    svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64();
+    svuint64_t accum = svdup_u64(0), vec;
+    if (svcntb() > 0)
+      popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf)));
+    vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0);
+    accum = svadd_x(pred64, accum, svcnt_x(pred64, vec));
+    popcnt += svaddv(pred64, accum);
+    return popcnt;
+  }
+int
+main ()
+{
+return sve_popcount_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_sve_popcnt_intrinsics=yes
+else
+  pgac_cv_arm_sve_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_arm_sve_popcnt_intrinsics" = x"yes"; then
+  pgac_arm_sve_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index f56681e0d91..5c649870550 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2016,6 +2016,15 @@ if test x"$host_cpu" = x"x86_64"; then
 fi
 fi
 
+# Check for ARM SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_ARM_SVE_POPCNT_INTRINSICS()
+  if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then
+    AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM popcount instructions.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
# PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 7dd7110318d..db3120442b5 100644
--- a/meson.build
+++ b/meson.build
@@ -2195,6 +2195,39 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of ARM SVE popcount intrinsics.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+#if defined(__has_attribute) && __has_attribute(target)
+  __attribute__((target("arch=armv8-a+sve")))
+#endif
+int main ()
+{
+  int popcnt = 0;
+  const char buf[sizeof(uint64_t)];
+  svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64();
+  svuint64_t accum = svdup_u64(0), vec;
+  if (svcntb() > 0)
+    popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf)));
+  vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0);
+  accum = svadd_x(pred64, accum, svcnt_x(pred64, vec));
+  popcnt += svaddv(pred64, accum);
+  return popcnt;
+}
+'''
+
+  if cc.links(prog, name: 'ARM SVE popcount', args: test_c_args)
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798abd..29c32bbbbe3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -648,6 +648,9 @@
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE popcount instructions with a runtime check. */
+#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support.
(--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 62554ce685a..b73feb30da5 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -315,6 +315,19 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +/* + * On AArch64, try using SVE popcount instructions, but only if + * we can verify that the CPU supports it via a runtime check. + */ +#elif USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); +extern bool pg_popcount_sve_available(void); +extern uint64 pg_popcount_sve(const char *buf, int bytes); +extern uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); + #else /* Use a portable implementation -- no need for a function pointer. 
*/ extern int pg_popcount32(uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..61a8bcec15c 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -45,6 +45,7 @@ OBJS = \ path.o \ pg_bitutils.o \ pg_popcount_avx512.o \ + pg_popcount_sve.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..4a3429c21a9 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,6 +8,7 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcount_avx512.c', + 'pg_popcount_sve.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 5677525693d..f0d92bf317b 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -339,6 +339,47 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) #endif /* TRY_POPCNT_FAST */ +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +static uint64 pg_popcount_choose_aarch64(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose_aarch64(const char *buf, int bytes, bits8 mask); +uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose_aarch64; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose_aarch64; + +/* + * On AArch64 these functions are invoked on the first call to pg_popcount and + * pg_popcount_masked. They detect whether we can use the SVE implementations, + * and replace the function pointers so that subsequent calls are routed + * directly to the chosen implementation. 
+ */ +static inline void +choose_popcount_functions_aarch64(void) +{ + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } + else + { + pg_popcount_optimized = pg_popcount_slow; + pg_popcount_masked_optimized = pg_popcount_masked_slow; + } +} + +static uint64 +pg_popcount_choose_aarch64(const char *buf, int bytes) +{ + choose_popcount_functions_aarch64(); + return pg_popcount_optimized(buf, bytes); +} + +static uint64 +pg_popcount_masked_choose_aarch64(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions_aarch64(); + return pg_popcount_masked(buf, bytes, mask); +} +#endif /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ /* * pg_popcount32_slow @@ -507,6 +548,7 @@ pg_popcount64(uint64 word) return pg_popcount64_slow(word); } +#ifndef USE_SVE_POPCNT_WITH_RUNTIME_CHECK /* * pg_popcount_optimized * Returns the number of 1-bits in buf @@ -526,5 +568,5 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) { return pg_popcount_masked_slow(buf, bytes, mask); } - +#endif /* !USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ #endif /* !TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c new file mode 100644 index 00000000000..060193eec11 --- /dev/null +++ b/src/port/pg_popcount_sve.c @@ -0,0 +1,123 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_sve.c + * Holds the SVE pg_popcount() implementation. 
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_sve.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+#include "port/pg_bitutils.h"
+#include <arm_sve.h>
+
+#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+#endif
+
+/*
+ * Returns true if the CPU supports the instructions required for the SVE
+ * pg_popcount() implementation.  Detection relies on the kernel-reported
+ * hardware capability bits (HWCAP_SVE); on platforms without an auxv
+ * interface we conservatively report false.
+ */
+bool
+pg_popcount_sve_available(void)
+{
+#if defined(HAVE_ELF_AUX_INFO) && defined(__aarch64__)	/* FreeBSD */
+	unsigned long hwcap;
+
+	return elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0 &&
+		(hwcap & HWCAP_SVE) != 0;
+#elif defined(HAVE_GETAUXVAL) && defined(__aarch64__)	/* Linux */
+	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+	return false;
+#endif
+}
+
+/*
+ * pg_popcount_sve
+ *		Returns the number of 1-bits in buf
+ */
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+	svbool_t	pred = svptrue_b64();
+	svuint64_t	vec64,
+				accum1 = svdup_u64(0),
+				accum2 = svdup_u64(0);
+	uint32		i = 0,
+				vec_len = svcntb(),	/* SVE vector length in bytes */
+				loop_bytes = bytes & ~(vec_len * 2 - 1);
+	uint64		popcnt = 0;
+
+	/*
+	 * Process 2 complete vectors per iteration, using two independent
+	 * accumulators to hide the latency of the CNT/ADD chain.
+	 */
+	for (; i < loop_bytes; i += vec_len * 2)
+	{
+		vec64 = svld1(pred, (const uint64 *) (buf + i));
+		accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64));
+		vec64 = svld1(pred, (const uint64 *) (buf + i + vec_len));
+		accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64));
+	}
+
+	/* Reduce the accumulators */
+	popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+	/*
+	 * Process the remaining tail one (possibly partial) vector at a time,
+	 * using a WHILELT predicate so we never read past the buffer.
+	 */
+	for (; i < bytes; i += vec_len)
+	{
+		pred = svwhilelt_b8(i, (uint32) bytes);
+		popcnt += svaddv(pred, svcnt_x(pred, svld1(pred, (const uint8 *) (buf + i))));
+	}
+
+	return popcnt;
+}
+
+/*
+ * pg_popcount_masked_sve
+ *		Returns the number of 1-bits in buf after applying the mask
+ */
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+	svbool_t	pred = svptrue_b64();
+	svuint8_t	vec8;
+	svuint64_t	vec64,
+				accum1 = svdup_u64(0),
+				accum2 = svdup_u64(0);
+	uint32		i = 0,
+				vec_len = svcntb(),	/* SVE vector length in bytes */
+				loop_bytes = bytes & ~(vec_len * 2 - 1);
+	uint64		popcnt = 0,
+	/* ~0/0xFF == 0x0101...01, so this replicates the mask to every byte */
+				mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+	/*
+	 * Process 2 complete vectors per iteration, masking each 64-bit lane
+	 * with the byte-replicated mask before counting.
+	 */
+	for (; i < loop_bytes; i += vec_len * 2)
+	{
+		vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i)), mask64);
+		accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64));
+		vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i + vec_len)), mask64);
+		accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64));
+	}
+
+	/* Reduce the accumulators */
+	popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+	/*
+	 * Process the remaining tail one (possibly partial) vector at a time,
+	 * byte-wise, using a WHILELT predicate so we never read past the buffer.
+	 */
+	for (; i < bytes; i += vec_len)
+	{
+		pred = svwhilelt_b8(i, (uint32) bytes);
+		vec8 = svand_x(pred, svld1(pred, (const uint8 *) (buf + i)), mask);
+		popcnt += svaddv(pred, svcnt_x(pred, vec8));
+	}
+
+	return popcnt;
+}
+
+#endif							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
-- 
2.34.1