From 077f88bcbff74d29b64459fcdac3096a28d07b72 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 26 Dec 2019 18:28:50 -0500 Subject: [PATCH] Use the CLZ instruction in AllocSetFreeIndex() In commit ab5b4e2f9ed, we optimized AllocSetFreeIndex() using a lookup table. At the time, using CLZ was rejected because compiler/platform support was not widespread enough to justify it. Since 02a6a54ecd6, we test for availability of __builtin_clz(), so use that instead. This is about 20% faster on Intel platforms, but perhaps more importantly reduces cache pollution caused by the lookup table approach. In addition, for the open-coded case, use the general-purpose lookup table added by 02a6a54ecd6, rather than a single-purpose one. This allows platforms without CLZ to reduce cache pollution as well. --- src/backend/utils/mmgr/aset.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index f729d9b6de..137c0b8ee5 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -46,6 +46,7 @@ #include "postgres.h" +#include "port/pg_bitutils.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -297,18 +298,6 @@ static const MemoryContextMethods AllocSetMethods = { #endif }; -/* - * Table for AllocSetFreeIndex - */ -#define LT16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -static const unsigned char LogTable256[256] = -{ - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - LT16(5), LT16(6), LT16(6), LT16(7), LT16(7), LT16(7), LT16(7), - LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8) -}; - /* ---------- * Debug macros * ---------- @@ -337,8 +326,7 @@ static inline int AllocSetFreeIndex(Size size) { int idx; - unsigned int t, - tsize; + unsigned int tsize; if (size > (1 << ALLOC_MINBITS)) { @@ -346,15 +334,20 @@ AllocSetFreeIndex(Size size) /* * At this point we need to obtain log2(tsize)+1, ie, the number of - 
not-all-zero bits at the right. We used to do this with a - * shift-and-count loop, but this function is enough of a hotspot to - * justify micro-optimization effort. The best approach seems to be - * to use a lookup table. Note that this code assumes that - * ALLOCSET_NUM_FREELISTS <= 17, since we only cope with two bytes of - * the tsize value. + * not-all-zero bits at the right. We don't use the utility function + * pg_leftmost_one_pos32() here because if CLZ is not available, + * determining the correct shift has a performance penalty. + * By assuming that ALLOCSET_NUM_FREELISTS <= 17, we only need to + * cope with two bytes of the tsize value. */ +#ifdef HAVE__BUILTIN_CLZ + idx = 32 - __builtin_clz((uint32) tsize); +#else + unsigned int t; t = tsize >> 8; - idx = t ? LogTable256[t] + 8 : LogTable256[tsize]; + idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize]; + idx += 1; +#endif Assert(idx < ALLOCSET_NUM_FREELISTS); } -- 2.22.0