diff --git a/src/port/Makefile b/src/port/Makefile index 4320dee0d1..1f6cbe362f 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -93,6 +93,7 @@ pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +pg_bitutils.o: CFLAGS+=$(CFLAGS_AVX512) # # Shared library versions of object files diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 1f3dea2d4b..443b8b63ce 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -21,6 +21,21 @@ #include "port/pg_bitutils.h" +#if (defined(__linux__) || defined(__linux) || defined(linux)) +#if defined(__x86_64) && defined(AVX512_POPCNT) +/* Set macro for AVX-512 inclusion in the binary. */ +#define NEED_AVX512_POPCNTDQ 1 + +#include + +/* Forward ref for AVX-512 private implementation */ +uint64 popcount_512_impl_unaligned(const char *buf, int bytes); +#endif /* Platform and Flag for AVX-512 */ +#endif /* Linux */ + +/* Forward refs for private refactor of 64-bit implementation */ +uint64 popcount_64_impl(const char *buf, int bytes); +uint64 popcount_impl(const char *buf, int bytes); /* * Array giving the position of the left-most set bit for each possible @@ -288,48 +303,99 @@ pg_popcount64(uint64 word) #endif /* !TRY_POPCNT_FAST */ +inline uint64 +pg_popcnt_software(const char *buf, int bytes) +{ + uint64 popcnt = 0; + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char)*buf++]; + return popcnt; +} + /* * pg_popcount * Returns the number of 1-bits in buf */ -uint64 +inline uint64 pg_popcount(const char *buf, int bytes) -{ - uint64 popcnt = 0; - +{ /* Refatored for reuse in AVX-512 implementaitons. */ #if SIZEOF_VOID_P >= 8 /* Process in 64-bit chunks if the buffer is aligned. */ if (buf == (const char *) TYPEALIGN(8, buf)) - { - const uint64 *words = (const uint64 *) buf; + return popcount_impl(buf, bytes); + else /* If not aligned use software only */ + return pg_popcnt_software(buf, bytes); +#else + return pg_popcnt_software(buf, bytes); +#endif +} - while (bytes >= 8) - { - popcnt += pg_popcount64(*words++); - bytes -= 8; - } +/* + * Refatored 64-bit algorithm using the refactored software + * algorithm for trailing bytes. + */ +inline uint64 +popcount_64_impl(const char *buf, int bytes) +{ + uint64 popcnt = 0; - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) + while (bytes >= sizeof(uint64)) { - const uint32 *words = (const uint32 *) buf; + popcnt += pg_popcount64(*((const uint64 *)buf)); + buf += sizeof(uint64); + bytes -= sizeof(uint64); + } + + /* Process remaining bytes... */ + popcnt += pg_popcnt_software(buf, bytes); + return popcnt; +} - while (bytes >= 4) - { - popcnt += pg_popcount32(*words++); - bytes -= 4; - } +#if defined(NEED_AVX512_POPCNTDQ) - buf = (const char *) words; +#define LINE_SIZE_LOCAL 8192 +/* + * AVX-512 implementation for popcount using 64-bit algorithm + * for 512-bit unaligned leading and trailing portions. + */ +inline uint64 +popcount_512_impl_unaligned(const char *buf, int bytes) +{ + uint64 popcnt = 0; + uint64 remainder = ((uint64)buf) % 64; + popcnt += popcount_64_impl(buf, remainder); + bytes -= remainder; + buf += remainder; + + __m512i *vectors = (__m512i *)buf; + while (bytes >= 64) { + popcnt += (uint64)_mm512_reduce_add_epi64( + _mm512_popcnt_epi64(*(vectors++))); + bytes -= 64; } -#endif - - /* Process any remaining bytes */ - while (bytes--) - popcnt += pg_number_of_ones[(unsigned char) *buf++]; + buf = (const char *)vectors; + popcnt += popcount_64_impl(buf, bytes); return popcnt; } +#endif + +/* + * Called by pg_popcount when architecture is 64-bit and aligned. + * Will default to the original 64-bit algorithm if conditions for AVX-512 + * are not met. + */ +inline uint64 +popcount_impl(const char *buf, int bytes) +{ +#if defined(NEED_AVX512_POPCNTDQ) + if(bytes >= 25165824) /* 24MiB */ + /* After testing, this is the threshhold where benefits for AVX-512 + starts. */ + return popcount_512_impl_unaligned(buf, bytes); + else + return popcount_64_impl(buf, bytes); +#else + return popcount_64_impl(buf, bytes); +#endif +}