From 73069a8b275911b3dedb813e6c9981e957a9af94 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 24 Mar 2021 00:36:29 +0100 Subject: [PATCH 2/3] prefer expression matches --- src/backend/utils/adt/selfuncs.c | 76 ++++++++++++------------- src/test/regress/expected/stats_ext.out | 2 +- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 612b4db1c8..29fc218149 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3304,14 +3304,17 @@ typedef struct } GroupExprInfo; static List * -add_unique_group_expr(PlannerInfo *root, List *exprinfos, - Node *expr, List *vars) +add_unique_group_expr(PlannerInfo *root, List *exprinfos, Node *expr, + List *vars, VariableStatData *vardata) { GroupExprInfo *exprinfo; ListCell *lc; Bitmapset *varnos; Index varno; + /* can't get both vars and vardata for the expression */ + Assert(!(vars && vardata)); + foreach(lc, exprinfos) { exprinfo = (GroupExprInfo *) lfirst(lc); @@ -3342,31 +3345,26 @@ add_unique_group_expr(PlannerInfo *root, List *exprinfos, /* Track vars for this expression. */ foreach(lc, vars) { - VariableStatData vardata; + VariableStatData tmp; Node *var = (Node *) lfirst(lc); /* can we get no vardata for the variable? */ - examine_variable(root, var, 0, &vardata); + examine_variable(root, var, 0, &tmp); exprinfo->varinfos - = add_unique_group_var(root, exprinfo->varinfos, var, &vardata); + = add_unique_group_var(root, exprinfo->varinfos, var, &tmp); - ReleaseVariableStats(vardata); + ReleaseVariableStats(tmp); } /* without a list of variables, use the expression itself */ if (vars == NIL) { - VariableStatData vardata; - - /* can we get no vardata for the variable? */ - examine_variable(root, expr, 0, &vardata); + Assert(vardata); exprinfo->varinfos = add_unique_group_var(root, exprinfo->varinfos, - expr, &vardata); - - ReleaseVariableStats(vardata); + expr, vardata); } return lappend(exprinfos, exprinfo); @@ -3512,12 +3510,21 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, * If examine_variable is able to deduce anything about the GROUP BY * expression, treat it as a single variable even if it's really more * complicated. + * + * XXX This has the consequence that if there's a statistics on the + * expression, we don't split it into individual Vars. This affects + * our selection of statistics in estimate_multivariate_ndistinct, + * because it's probably better to use more accurate estimate for + * each expression and treat them as independent, than to combine + * estimates for the extracted variables when we don't know how that + * relates to the expressions. */ examine_variable(root, groupexpr, 0, &vardata); if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique) { exprinfos = add_unique_group_expr(root, exprinfos, - groupexpr, NIL); + groupexpr, NIL, + &vardata); ReleaseVariableStats(vardata); continue; @@ -3557,7 +3564,8 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, { exprinfos = add_unique_group_expr(root, exprinfos, groupexpr, - varshere); + varshere, + NULL); continue; } @@ -3568,7 +3576,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, { Node *var = (Node *) lfirst(l2); - exprinfos = add_unique_group_expr(root, exprinfos, var, NIL); + examine_variable(root, var, 0, &vardata); + exprinfos = add_unique_group_expr(root, exprinfos, var, NIL, &vardata); + ReleaseVariableStats(vardata); } } @@ -4013,10 +4023,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, * exact match first, and if we don't find a match we try to search * for smaller "partial" expressions extracted from it. So for example * given GROUP BY (a+b) we search for statistics defined on (a+b) - * first, and then maybe for one on (a) and (b). The trouble here is - * that with the current coding, the one matching (a) and (b) might - * win, because we're comparing the counts. We should probably give - * some preference to exact matches of the expressions. + * first, and then maybe for one on the extracted vars (a) and (b). + * There might be two statistics, one of (a+b) and the other one on + * (a,b), and both of them match the exprinfos in some way. However, + * estimate_num_groups currently does not split the expression into + * parts if there's a statistics with exact match of the expression. + * So the expression has either exact match (and we're guaranteed to + * estimate using the matching statistics), or it has to be matched + * by parts. */ foreach(lc2, *exprinfos) { @@ -4068,20 +4082,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, if (found) continue; - /* - * Inspect the individual Vars extracted from the expression. - * - * XXX Maybe this should not use nshared_vars, but a separate - * variable, so that we can give preference to "exact" matches - * over partial ones? Consider for example two statistics [a,b,c] - * and [(a+b), c], and query with - * - * GROUP BY (a+b), c - * - * Then the first statistics matches no expressions and 3 vars, - * while the second statistics matches one expression and 1 var. - * Currently the first statistics wins, which seems silly. - */ + /* Inspect the individual Vars extracted from the expression. */ foreach(lc3, exprinfo->varinfos) { GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc3); @@ -4110,12 +4111,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, * * XXX This should break ties using name of the object, or something * like that, to make the outcome stable. - * - * XXX Maybe this should consider the vars in the opposite way, i.e. - * expression matches should be more important. */ - if ((nshared_vars > nmatches_vars) || - ((nshared_vars == nmatches_vars) && (nshared_exprs > nmatches_exprs))) + if ((nshared_exprs > nmatches_exprs) || + (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars))) { statOid = info->statOid; nmatches_vars = nshared_vars; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index abfb6d9f3c..cf9c6b6ca4 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -1187,7 +1187,7 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c, d'); estimated | actual -----------+-------- - 540 | 180 + 180 | 180 (1 row) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10), (d-1)'); -- 2.30.2