diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index f86558d1ee5..7fb2ada16c9 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -30,20 +30,47 @@
 uint64
 pg_popcount_avx512(const char *buf, int bytes)
 {
-	uint64 popcnt;
+	__m512i val;
+	__m512i cnt;
+	__mmask64 remaining_mask;
 	__m512i accum = _mm512_setzero_si512();
 
-	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	/*
+	 * Bail out on an empty buffer.  With bytes == 0 the tail mask below
+	 * would be computed with a shift by 64, which is undefined behavior and
+	 * could turn into a full 64-byte read past the end of buf.
+	 */
+	if (bytes <= 0)
+		return 0;
+
+	/*
+	 * Consume full 512-bit chunks while more than one chunk's worth remains.
+	 * The strict '>' leaves 1..64 bytes for the masked tail load below.
+	 */
+#if defined(__clang__)
+#pragma clang loop unroll(disable)
+#endif
+	for (; bytes > sizeof(__m512i); bytes -= sizeof(__m512i))
 	{
-		const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
-		const __m512i cnt = _mm512_popcnt_epi64(val);
+		val = _mm512_loadu_si512((const __m512i *) buf);
+		cnt = _mm512_popcnt_epi64(val);
 
 		accum = _mm512_add_epi64(accum, cnt);
 		buf += sizeof(__m512i);
 	}
 
-	popcnt = _mm512_reduce_add_epi64(accum);
-	return popcnt + pg_popcount_fast(buf, bytes);
+	/*
+	 * Here 1 <= bytes <= 64.  Set the low 'bytes' bits of the mask so that
+	 * only the remaining bytes are loaded; the other lanes are zeroed and
+	 * therefore add nothing to the count.
+	 */
+	remaining_mask = ~0ULL >> (sizeof(__m512i) - bytes);
+	val = _mm512_maskz_loadu_epi8(remaining_mask, (const __m512i *) buf);
+	cnt = _mm512_popcnt_epi64(val);
+
+	accum = _mm512_add_epi64(accum, cnt);
+
+	return _mm512_reduce_add_epi64(accum);
 }
 
 #endif /* TRY_POPCNT_FAST */