>From c968f05c26ecfa9344a8a9c9209bd755fa4ddf7b Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 26 Jan 2016 18:14:33 +0100 Subject: [PATCH 8/9] change how we apply selectivity to number of groups estimate Instead of simply multiplying the ndistinct estimate by the selectivity, we use the formula for the expected number of distinct values observed in 'k' rows when there are 'd' distinct values in the bin: d * (1 - ((d - 1) / d)^k). This is sampling 'with replacement', which seems appropriate for this use, and it mostly assumes a uniform distribution of the distinct values. So if the distribution is not uniform (e.g. there are very frequent groups), this may be less accurate than the current algorithm in some cases, giving over-estimates. But that's probably better than running out of memory (OOM). --- src/backend/utils/adt/selfuncs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f8d39aa..76be0e3 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3464,9 +3464,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, reldistinct = clamp; /* - * Multiply by restriction selectivity. + * Estimate the number of distinct values observed in rel->rows. */ - reldistinct *= rel->rows / rel->tuples; + reldistinct *= (1 - powl(1 - rel->rows/rel->tuples, rel->tuples/reldistinct)); /* * Update estimate of total distinct groups. -- 2.5.0