From 91aa5ef1034b763e279b4a8970101355e4a79600 Mon Sep 17 00:00:00 2001 From: Amit Khandekar Date: Tue, 9 Jun 2020 16:35:23 +0530 Subject: [PATCH] Auto-vectorize loop to speed up large-precision numeric product A 'for' loop in mul_var() runs backwards by decrementing two variables. This prevents the gcc compiler from auto-vectorizing the for loop. So make it a forward loop with a single variable. This gives performance benefits for products of numeric types with large precision, with speedups becoming noticeable from values with precisions starting from 20-40. Typical pattern of benefit is: precision 50: 5%; precision 60: 11%; 120: 50%; 240: 2.2x; and so on. On some CPU architectures, the speedup starts from precision 20 onwards. With the precisions used in the numeric_big regression test, the multiplication speeds up by 2.5 to 2.7 times. Auto-vectorization happens with the -O3 flag or -ftree-loop-vectorize. So this benefit will be seen when built with gcc -O3. --- src/backend/utils/adt/numeric.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index f3a725271e..4243242ad9 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -7226,6 +7226,7 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result, int res_weight; int maxdigits; int *dig; + int *digptr; int carry; int maxdig; int newdig; @@ -7362,10 +7363,14 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result, * * As above, digits of var2 can be ignored if they don't contribute, * so we only include digits for which i1+i2+2 <= res_ndigits - 1. + * + * For large precisions, this can become a bottleneck; so keep this for + * loop simple so that it can be auto-vectorized. 
*/ - for (i2 = Min(var2ndigits - 1, res_ndigits - i1 - 3), i = i1 + i2 + 2; - i2 >= 0; i2--) - dig[i--] += var1digit * var2digits[i2]; + i2 = Min(var2ndigits - 1, res_ndigits - i1 - 3); + digptr = &dig[i1 + 2]; + for (i = 0; i <= i2; i++) + digptr[i] += var1digit * var2digits[i]; } /* -- 2.17.1