diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c new file mode 100644 index a6555e9..99f5f7c --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3439,9 +3439,51 @@ estimate_num_groups(PlannerInfo *root, L reldistinct = clamp; /* - * Multiply by restriction selectivity. + * Update the estimate based on the restriction selectivity, + * guarding against division by zero when reldistinct is zero. + * Also skip this if we know that we are returning all rows. */ - reldistinct *= rel->rows / rel->tuples; + if (reldistinct > 0 && rel->rows < rel->tuples) + { + /* + * Given a table containing N rows with n distinct values in a + * uniform distribution, if we select p rows at random then + * the expected number of distinct values selected is + * + * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1)) + * + * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!) + * + * See "Approximating block accesses in database + * organizations", S. B. Yao, Communications of the ACM, + * Volume 20 Issue 4, April 1977 Pages 260-261. + * + * Alternatively, re-arranging the terms from the factorials, + * this may be written as + * + * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1)) + * + * This form of the formula is more efficient to compute in + * the common case where p is larger than N/n. Additionally, + * as pointed out by Dell'Era, if i << N for all terms in the + * product, it can be approximated by + * + * n * (1 - ((N-p)/N)^(N/n)) + * + * See "Expected distinct values when selecting from a bag + * without replacement", Alberto Dell'Era, + * http://www.adellera.it/investigations/distinct_balls/. + * + * The condition i << N is equivalent to n >> 1, so this is a + * good approximation when the number of distinct values in + * the table is large. It turns out that this formula also + * works well even when n is small. + */ + reldistinct *= + (1 - pow((rel->tuples - rel->rows) / rel->tuples, + rel->tuples / reldistinct)); + } + reldistinct = clamp_row_est(reldistinct); /* * Update estimate of total distinct groups. diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out new file mode 100644 index de64ca7..0fc93d9 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -807,27 +807,24 @@ select * from int4_tbl where explain (verbose, costs off) select * from int4_tbl o where (f1, f1) in (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1); - QUERY PLAN ----------------------------------------------------------------------- - Hash Join + QUERY PLAN +---------------------------------------------------------------- + Hash Semi Join Output: o.f1 Hash Cond: (o.f1 = "ANY_subquery".f1) -> Seq Scan on public.int4_tbl o Output: o.f1 -> Hash Output: "ANY_subquery".f1, "ANY_subquery".g - -> HashAggregate + -> Subquery Scan on "ANY_subquery" Output: "ANY_subquery".f1, "ANY_subquery".g - Group Key: "ANY_subquery".f1, "ANY_subquery".g - -> Subquery Scan on "ANY_subquery" - Output: "ANY_subquery".f1, "ANY_subquery".g - Filter: ("ANY_subquery".f1 = "ANY_subquery".g) - -> HashAggregate - Output: i.f1, (generate_series(1, 2) / 10) - Group Key: i.f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 -(18 rows) + Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + -> HashAggregate + Output: i.f1, (generate_series(1, 2) / 10) + Group Key: i.f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 +(15 rows) select * from int4_tbl o where (f1, f1) in (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);