From 73069a8b275911b3dedb813e6c9981e957a9af94 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Wed, 24 Mar 2021 00:36:29 +0100
Subject: [PATCH 2/3] prefer expression matches

---
 src/backend/utils/adt/selfuncs.c        | 76 ++++++++++++-------------
 src/test/regress/expected/stats_ext.out |  2 +-
 2 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 612b4db1c8..29fc218149 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3304,14 +3304,17 @@ typedef struct
 } GroupExprInfo;
 
 static List *
-add_unique_group_expr(PlannerInfo *root, List *exprinfos,
-					  Node *expr, List *vars)
+add_unique_group_expr(PlannerInfo *root, List *exprinfos, Node *expr,
+					  List *vars, VariableStatData *vardata)
 {
 	GroupExprInfo *exprinfo;
 	ListCell   *lc;
 	Bitmapset  *varnos;
 	Index		varno;
 
+	/* can't get both vars and vardata for the expression */
+	Assert(!(vars && vardata));
+
 	foreach(lc, exprinfos)
 	{
 		exprinfo = (GroupExprInfo *) lfirst(lc);
@@ -3342,31 +3345,26 @@ add_unique_group_expr(PlannerInfo *root, List *exprinfos,
 	/* Track vars for this expression. */
 	foreach(lc, vars)
 	{
-		VariableStatData vardata;
+		VariableStatData tmp;
 		Node	   *var = (Node *) lfirst(lc);
 
 		/* can we get no vardata for the variable? */
-		examine_variable(root, var, 0, &vardata);
+		examine_variable(root, var, 0, &tmp);
 
 		exprinfo->varinfos
-			= add_unique_group_var(root, exprinfo->varinfos, var, &vardata);
+			= add_unique_group_var(root, exprinfo->varinfos, var, &tmp);
 
-		ReleaseVariableStats(vardata);
+		ReleaseVariableStats(tmp);
 	}
 
 	/* without a list of variables, use the expression itself */
 	if (vars == NIL)
 	{
-		VariableStatData vardata;
-
-		/* can we get no vardata for the variable? */
-		examine_variable(root, expr, 0, &vardata);
+		Assert(vardata);
 
 		exprinfo->varinfos
 			= add_unique_group_var(root, exprinfo->varinfos,
-								   expr, &vardata);
-
-		ReleaseVariableStats(vardata);
+								   expr, vardata);
 	}
 
 	return lappend(exprinfos, exprinfo);
@@ -3512,12 +3510,21 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		 * If examine_variable is able to deduce anything about the GROUP BY
 		 * expression, treat it as a single variable even if it's really more
 		 * complicated.
+		 *
+		 * XXX This has the consequence that if there's a statistics on the
+		 * expression, we don't split it into individual Vars. This affects
+		 * our selection of statistics in estimate_multivariate_ndistinct,
+		 * because it's probably better to use more accurate estimate for
+		 * each expression and treat them as independent, than to combine
+		 * estimates for the extracted variables when we don't know how that
+		 * relates to the expressions.
 		 */
 		examine_variable(root, groupexpr, 0, &vardata);
 		if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
 		{
 			exprinfos = add_unique_group_expr(root, exprinfos,
-											  groupexpr, NIL);
+											  groupexpr, NIL,
+											  &vardata);
 
 			ReleaseVariableStats(vardata);
 			continue;
@@ -3557,7 +3564,8 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		{
 			exprinfos = add_unique_group_expr(root, exprinfos,
 											  groupexpr,
-											  varshere);
+											  varshere,
+											  NULL);
 			continue;
 		}
 
@@ -3568,7 +3576,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		{
 			Node	   *var = (Node *) lfirst(l2);
 
-			exprinfos = add_unique_group_expr(root, exprinfos, var, NIL);
+			examine_variable(root, var, 0, &vardata);
+			exprinfos = add_unique_group_expr(root, exprinfos, var, NIL, &vardata);
+			ReleaseVariableStats(vardata);
 		}
 	}
 
@@ -4013,10 +4023,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 		 * exact match first, and if we don't find a match we try to search
 		 * for smaller "partial" expressions extracted from it. So for example
 		 * given GROUP BY (a+b) we search for statistics defined on (a+b)
-		 * first, and then maybe for one on (a) and (b). The trouble here is
-		 * that with the current coding, the one matching (a) and (b) might
-		 * win, because we're comparing the counts. We should probably give
-		 * some preference to exact matches of the expressions.
+		 * first, and then maybe for one on the extracted vars (a) and (b).
+		 * There might be two statistics, one of (a+b) and the other one on
+		 * (a,b), and both of them match the exprinfos in some way. However,
+		 * estimate_num_groups currently does not split the expression into
+		 * parts if there's a statistics with exact match of the expression.
+		 * So the expression has either exact match (and we're guaranteed to
+		 * estimate using the matching statistics), or it has to be matched
+		 * by parts.
 		 */
 		foreach(lc2, *exprinfos)
 		{
@@ -4068,20 +4082,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 			if (found)
 				continue;
 
-			/*
-			 * Inspect the individual Vars extracted from the expression.
-			 *
-			 * XXX Maybe this should not use nshared_vars, but a separate
-			 * variable, so that we can give preference to "exact" matches
-			 * over partial ones? Consider for example two statistics [a,b,c]
-			 * and [(a+b), c], and query with
-			 *
-			 * GROUP BY (a+b), c
-			 *
-			 * Then the first statistics matches no expressions and 3 vars,
-			 * while the second statistics matches one expression and 1 var.
-			 * Currently the first statistics wins, which seems silly.
-			 */
+			/* Inspect the individual Vars extracted from the expression. */
 			foreach(lc3, exprinfo->varinfos)
 			{
 				GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc3);
@@ -4110,12 +4111,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 		 *
 		 * XXX This should break ties using name of the object, or something
 		 * like that, to make the outcome stable.
-		 *
-		 * XXX Maybe this should consider the vars in the opposite way, i.e.
-		 * expression matches should be more important.
 		 */
-		if ((nshared_vars > nmatches_vars) ||
-			((nshared_vars == nmatches_vars) && (nshared_exprs > nmatches_exprs)))
+		if ((nshared_exprs > nmatches_exprs) ||
+			(((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars)))
 		{
 			statOid = info->statOid;
 			nmatches_vars = nshared_vars;
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index abfb6d9f3c..cf9c6b6ca4 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1187,7 +1187,7 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
 SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c, d');
  estimated | actual 
 -----------+--------
-       540 |    180
+       180 |    180
 (1 row)
 
 SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10), (d-1)');
-- 
2.30.2