From 8ea529dd315723ca3e8ad4243853148da23f1202 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Sun, 31 Mar 2024 22:22:15 -0500 Subject: [PATCH v19 4/4] optimize visibilitymap_count() with AVX512 --- src/backend/access/heap/visibilitymap.c | 25 ++---- src/include/port/pg_bitutils.h | 6 +- src/port/pg_bitutils.c | 113 ++++++++++++++++++++++++ src/port/pg_popcount_avx512.c | 21 +++++ 4 files changed, 144 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 1ab6c865e3..8b24e7bc33 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -119,10 +119,8 @@ #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) /* Masks for counting subsets of bits in the visibility map. */ -#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each - * bit pair */ -#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each - * bit pair */ +#define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */ +#define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */ /* prototypes for internal routines */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); @@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro { Buffer mapBuffer; uint64 *map; - int i; /* * Read till we fall off the end of the map. We assume that any extra @@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro */ map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); - StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, - "unsupported MAPSIZE"); - if (all_frozen == NULL) - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - } - else - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - { - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); - } - } + nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8); + if (all_frozen) + nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8); ReleaseBuffer(mapBuffer); } diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 1a92c56bcd..16145c746e 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num) extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask); -/* Export pg_popcount_fast() for use in the AVX512 implementation. */ +/* Exported for use in the AVX512 implementation. */ extern uint64 pg_popcount_fast(const char *buf, int bytes); +extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask); /* * We can also try to use the AVX512 popcount instruction on some systems. @@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes); #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK extern bool pg_popcount_avx512_available(void); extern uint64 pg_popcount_avx512(const char *buf, int bytes); +extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif #else @@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); extern uint64 pg_popcount(const char *buf, int bytes); +extern uint64 pg_popcount_masked(const char *buf, int bytes); #endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 177509518f..902ecdebbf 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -106,18 +106,21 @@ const uint8 pg_number_of_ones[256] = { static inline int pg_popcount32_slow(uint32 word); static inline int pg_popcount64_slow(uint64 word); static uint64 pg_popcount_slow(const char *buf, int bytes); +static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask); #ifdef TRY_POPCNT_FAST static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); static inline int pg_popcount32_fast(uint32 word); static inline int pg_popcount64_fast(uint64 word); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose; +uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; #endif /* TRY_POPCNT_FAST */ #ifdef TRY_POPCNT_FAST @@ -155,9 +158,13 @@ choose_popcount_functions(void) pg_popcount32 = pg_popcount32_fast; pg_popcount64 = pg_popcount64_fast; pg_popcount = pg_popcount_fast; + pg_popcount_masked = pg_popcount_masked_fast; #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK if (pg_popcount_avx512_available()) + { pg_popcount = pg_popcount_avx512; + pg_popcount_masked = pg_popcount_masked_avx512; + } #endif } else @@ -165,6 +172,7 @@ choose_popcount_functions(void) pg_popcount32 = pg_popcount32_slow; pg_popcount64 = pg_popcount64_slow; pg_popcount = pg_popcount_slow; + pg_popcount_masked = pg_popcount_masked_slow; } } @@ -189,6 +197,13 @@ pg_popcount_choose(const char *buf, int bytes) return pg_popcount(buf, bytes); } +static uint64 +pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions(); + return pg_popcount_masked(buf, bytes, mask); +} + /* * pg_popcount32_fast * Return the number of 1 bits set in word @@ -269,6 +284,52 @@ pg_popcount_fast(const char *buf, int bytes) return popcnt; } +uint64 +pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if the buffer is aligned */ + uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64_fast(*words++ & maskv); + bytes -= 8; + } + + buf = (const char *) words; + } +#else + /* Process in 32-bit chunks if the buffer is aligned. */ + uint32 maskv = ~0 / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(4, buf)) + { + const uint32 *words = (const uint32 *) buf; + + while (bytes >= 4) + { + popcnt += pg_popcount32_fast(*words++ & maskv); + bytes -= 4; + } + + buf = (const char *) words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv]; + + return popcnt; +} + #endif /* TRY_POPCNT_FAST */ @@ -368,6 +429,52 @@ pg_popcount_slow(const char *buf, int bytes) return popcnt; } +static uint64 +pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if the buffer is aligned */ + uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64_slow(*words++ & maskv); + bytes -= 8; + } + + buf = (const char *) words; + } +#else + /* Process in 32-bit chunks if the buffer is aligned. */ + uint32 maskv = ~0 / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(4, buf)) + { + const uint32 *words = (const uint32 *) buf; + + while (bytes >= 4) + { + popcnt += pg_popcount32_slow(*words++ & maskv); + bytes -= 4; + } + + buf = (const char *) words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv]; + + return popcnt; +} + #ifndef TRY_POPCNT_FAST /* @@ -399,4 +506,10 @@ pg_popcount(const char *buf, int bytes) return pg_popcount_slow(buf, bytes); } +uint64 +pg_popcount_masked(const char *buf, int bytes, bits8 mask) +{ + return pg_popcount_masked_slow(buf, bytes, mask); +} + #endif /* !TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c index f86558d1ee..8965a8d530 100644 --- a/src/port/pg_popcount_avx512.c +++ b/src/port/pg_popcount_avx512.c @@ -46,4 +46,25 @@ pg_popcount_avx512(const char *buf, int bytes) return popcnt + pg_popcount_fast(buf, bytes); } +uint64 +pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt; + __m512i accum = _mm512_setzero_si512(); + const __m512i maskv = _mm512_set1_epi8(mask); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i vmasked = _mm512_and_si512(val, maskv); + const __m512i cnt = _mm512_popcnt_epi64(vmasked); + + accum = _mm512_add_epi64(accum, cnt); + buf += sizeof(__m512i); + } + + popcnt = _mm512_reduce_add_epi64(accum); + return popcnt + pg_popcount_masked_fast(buf, bytes, mask); +} + #endif /* TRY_POPCNT_FAST */ -- 2.25.1