From 41882bbf78f2d8a1fe817a0cbac70f221a0debf4 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 18 Mar 2024 11:02:05 -0500 Subject: [PATCH v4 3/3] Add support for AVX2 in simd.h. Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13 --- src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/src/include/port/simd.h b/src/include/port/simd.h index 597496f2fb..f06b21876b 100644 --- a/src/include/port/simd.h +++ b/src/include/port/simd.h @@ -18,7 +18,18 @@ #ifndef SIMD_H #define SIMD_H -#if (defined(__x86_64__) || defined(_M_AMD64)) +#if defined(__AVX2__) + +/* + * XXX: Need to add a big comment here. + */ +#include <immintrin.h> +#define USE_AVX2 +typedef __m256i Vector8; +typedef __m256i Vector32; + +#elif (defined(__x86_64__) || defined(_M_AMD64)) + /* * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume * that compilers targeting this architecture understand SSE2 intrinsics. @@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2); static inline void vector8_load(Vector8 *v, const uint8 *s) { -#if defined(USE_SSE2) +#if defined(USE_AVX2) + *v = _mm256_loadu_si256((const __m256i *) s); +#elif defined(USE_SSE2) *v = _mm_loadu_si128((const __m128i *) s); #elif defined(USE_NEON) *v = vld1q_u8(s); @@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s) static inline void vector32_load(Vector32 *v, const uint32 *s) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + *v = _mm256_loadu_si256((const __m256i *) s); +#elif defined(USE_SSE2) *v = _mm_loadu_si128((const __m128i *) s); #elif defined(USE_NEON) *v = vld1q_u32(s); @@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s) static inline Vector8 vector8_broadcast(const uint8 c) { -#if defined(USE_SSE2) +#if defined(USE_AVX2) + return _mm256_set1_epi8(c); +#elif defined(USE_SSE2) return _mm_set1_epi8(c); #elif defined(USE_NEON) return vdupq_n_u8(c); @@ -147,7 +164,9 @@ 
vector8_broadcast(const uint8 c) static inline Vector32 vector32_broadcast(const uint32 c) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_set1_epi32(c); +#elif defined(USE_SSE2) return _mm_set1_epi32(c); #elif defined(USE_NEON) return vdupq_n_u32(c); @@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c) static inline bool vector8_is_highbit_set(const Vector8 v) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_movemask_epi8(v) != 0; +#elif defined(USE_SSE2) return _mm_movemask_epi8(v) != 0; #elif defined(USE_NEON) return vmaxvq_u8(v) > 0x7F; @@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v) static inline uint32 vector8_highbit_mask(const Vector8 v) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return (uint32) _mm256_movemask_epi8(v); +#elif defined(USE_SSE2) return (uint32) _mm_movemask_epi8(v); #elif defined(USE_NEON) /* @@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v) static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_or_si256(v1, v2); +#elif defined(USE_SSE2) return _mm_or_si128(v1, v2); #elif defined(USE_NEON) return vorrq_u8(v1, v2); @@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2) static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_or_si256(v1, v2); +#elif defined(USE_SSE2) return _mm_or_si128(v1, v2); #elif defined(USE_NEON) return vorrq_u32(v1, v2); @@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2) static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_subs_epu8(v1, v2); +#elif defined(USE_SSE2) return _mm_subs_epu8(v1, v2); #elif defined(USE_NEON) return vqsubq_u8(v1, v2); @@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2) static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if 
defined(USE_AVX2) + return _mm256_cmpeq_epi8(v1, v2); +#elif defined(USE_SSE2) return _mm_cmpeq_epi8(v1, v2); #elif defined(USE_NEON) return vceqq_u8(v1, v2); @@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2) static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_cmpeq_epi32(v1, v2); +#elif defined(USE_SSE2) return _mm_cmpeq_epi32(v1, v2); #elif defined(USE_NEON) return vceqq_u32(v1, v2); @@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2) static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_min_epu8(v1, v2); +#elif defined(USE_SSE2) return _mm_min_epu8(v1, v2); #elif defined(USE_NEON) return vminq_u8(v1, v2); -- 2.25.1