diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5db02b2ab7..63af1331bf 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,38 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using +# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ. +# +# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq). +# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and +# CFLAGS_AVX512_POPCNT are set. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include +#include ], + [ + __m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + ])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_AVX512_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS diff --git a/configure b/configure index 2a1ee251f2..96c8f39254 100755 --- a/configure +++ b/configure @@ -647,6 +647,7 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +CFLAGS_AVX512_POPCNT LIBOBJS OPENSSL ZSTD @@ -15209,7 +15210,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15255,7 +15256,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15279,7 +15280,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15324,7 +15325,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15348,7 +15349,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -17736,6 +17737,100 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ + + __m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_AVX512_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512f" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ + + __m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" = x"yes"; then + CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq -mavx512f" + pgac_avx512_popcnt_intrinsics=yes +fi + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 52fd7af446..d5fe701c9c 100644 --- a/configure.ac +++ b/configure.ac @@ -2078,6 +2078,14 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +PGAC_AVX512_POPCNT_INTRINSICS([]) +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512f]) +fi +AC_SUBST(CFLAGS_AVX512_POPCNT) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..089f49b7f3 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..6a01a7d89a 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -87,6 +87,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) +# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024) +pg_bitutils.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_bitutils_srv.o:CFLAGS+=$(CFLAGS_AVX512_POPCNT) + # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 640a89561a..7db3cd44ce 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -19,6 +19,8 @@ #include #endif +#include + #include "port/pg_bitutils.h" @@ -110,12 +112,16 @@ static int pg_popcount64_slow(uint64 word); static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); +static int pg_popcount512_choose(const char* buf, int bytes); static int pg_popcount32_fast(uint32 word); static int pg_popcount64_fast(uint64 word); +static uint64 pg_popcount512_fast(const char* buf, int bytes); +static uint64 pg_popcount512_slow(const char* buf, int bytes); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; -#endif /* TRY_POPCNT_FAST */ +uint64 (*pg_popcount512) (const char* buf, int bytes) = pg_popcount512_choose; +#endif /* TRY_POPCNT_FAST */ #ifdef TRY_POPCNT_FAST @@ -138,6 +144,36 @@ pg_popcount_available(void) return (exx[2] & (1 << 23)) != 0; /* POPCNT */ } +/* + * Return true if CPUID indicates that the AVX512_POPCNT instruction is available. This is + * simular to the method above see this URL. + * + * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + * + * Finally we make sure the xgetbv result is conistent with the CPUID results. + */ +static bool +pg_popcount512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; +#if defined(HAVE__GET_CPUID) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0) // Check for AVX512VPOPCNTDQ and AVX512F + { + uint64 xcr = 0; + uint32 high; + uint32 low; + __asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr)); + return (low & 0xE0) != 0; + } /* POPCNT 512 */ + return false; +} + /* * These functions get called on the first call to pg_popcount32 etc. * They detect whether we can use the asm implementations, and replace @@ -178,6 +214,19 @@ pg_popcount64_choose(uint64 word) return pg_popcount64(word); } +static int +pg_popcount512_choose(const char* buf, int bytes) { + if (pg_popcount512_available()) + { + pg_popcount512 = pg_popcount512_fast; + } + else + { + pg_popcount512 = pg_popcount512_slow; + } + return pg_popcount512(buf, bytes); +} + /* * pg_popcount32_fast * Return the number of 1 bits set in word @@ -212,6 +261,30 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); #endif } +static uint64 +pg_popcount512_fast(const char *buf, int bytes) +{ + uint64 popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + while (bytes >= 64) + { + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + accumulator = _mm512_add_epi64(accumulator, p); + bytes -= 64; + buf += 64; + } + + popcnt = _mm512_reduce_add_epi64(accumulator); + bytes = bytes % 64; + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char)*buf++]; + + return popcnt; +} + #endif /* TRY_POPCNT_FAST */ @@ -265,6 +338,29 @@ pg_popcount64_slow(uint64 word) #endif /* HAVE__BUILTIN_POPCOUNT */ } +static uint64 +pg_popcount512_slow(const char* buf, int bytes) { + uint64 popcnt = 0; + if (buf == (const char *)TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64(*words++); + bytes -= 8; + } + + buf = (const char *) words; + } + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + #ifndef TRY_POPCNT_FAST /* @@ -286,6 +382,13 @@ pg_popcount64(uint64 word) return pg_popcount64_slow(word); } +static uint64 pg_popcount512_slow(const char *buf, int bytes); +inline uint64 +pg_popcount512(const char *buf, int bytes) +{ + return pg_popcount512_slow(buf, bytes); +} + #endif /* !TRY_POPCNT_FAST */ /* @@ -298,22 +401,10 @@ pg_popcount(const char *buf, int bytes) uint64 popcnt = 0; #if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) - { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64(*words++); - bytes -= 8; - } - - buf = (const char *) words; - } + return pg_popcount512(buf, bytes); #else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) + /* Process in 32-bit chunks if the buffer is aligned. */ + if (buf == (const char *)TYPEALIGN(4, buf)) { const uint32 *words = (const uint32 *) buf;