From 89d17ba8a669b53814551284f8f8c82192eb1402 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 3 Aug 2022 09:49:04 -0700 Subject: [PATCH v5 2/3] Introduce optimized routine for linear searches through an array of integers. If SSE2 is available, this function uses it to speed up the search. Otherwise, it uses a simple 'for' loop. This is a prerequisite for a follow-up commit that will use this function to optimize [sub]xip lookups in XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such an optimization. It might be worthwhile to add an ARM-specific code path to this function in the future. Author: Nathan Bossart Reviewed by: Andres Freund, John Naylor Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13 --- src/include/utils/linearsearch.h | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/include/utils/linearsearch.h diff --git a/src/include/utils/linearsearch.h b/src/include/utils/linearsearch.h new file mode 100644 index 0000000000..51298b4355 --- /dev/null +++ b/src/include/utils/linearsearch.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * linearsearch.h + * Optimized linear search routines. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/include/utils/linearsearch.h + * + *------------------------------------------------------------------------- + */ +#ifndef LINEARSEARCH_H +#define LINEARSEARCH_H + +#include "port/simd.h" + +/* + * pg_linearsearch_uint32 + * + * Returns true if there is an element in 'base' that equals 'key'. Otherwise, + * returns false. + * + * Since pg_attribute_no_sanitize_alignment() is only intended for x86-specific + * code, we surround it with an SSE2 check. + */ +#ifdef USE_SSE2 +pg_attribute_no_sanitize_alignment() +#endif +static inline bool +pg_linearsearch_uint32(uint32 key, uint32 *base, uint32 nelem) +{ + uint32 i = 0; + + /* If possible, use SSE2 intrinsics to speed up the search. */ +#ifdef USE_SSE2 + __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */ + uint32 its = nelem & ~0xF; /* round down to multiple of 16 */ + + for (; i < its; i += 16) + { + /* load the next 16 values into __m128i variables */ + __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]); + __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]); + __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]); + __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]); + + /* perform the comparisons */ + __m128i result1 = _mm_cmpeq_epi32(keys, vals1); + __m128i result2 = _mm_cmpeq_epi32(keys, vals2); + __m128i result3 = _mm_cmpeq_epi32(keys, vals3); + __m128i result4 = _mm_cmpeq_epi32(keys, vals4); + + /* shrink the results into a single variable */ + __m128i tmp1 = _mm_packs_epi32(result1, result2); + __m128i tmp2 = _mm_packs_epi32(result3, result4); + __m128i result = _mm_packs_epi16(tmp1, tmp2); + + /* see if there was a match */ + if (_mm_movemask_epi8(result) != 0) + return true; + } +#endif + + /* Process the remaining elements the slow way. */ + for (; i < nelem; i++) + { + if (key == base[i]) + return true; + } + + return false; +} + +#endif /* LINEARSEARCH_H */ -- 2.25.1