From 22e956fd80c076c0e836674c49447afb3cd84bfc Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 17 Jun 2024 14:10:49 +0200 Subject: [PATCH v20240617 02/56] review --- src/backend/optimizer/path/clausesel.c | 29 ++++---- src/backend/statistics/extended_stats.c | 72 +++++++++++++------ .../statistics/extended_stats_internal.h | 2 +- 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index bedf76edaec..24e3c9729a3 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -130,22 +130,26 @@ clauselist_selectivity_ext(PlannerInfo *root, RangeQueryClause *rqlist = NULL; ListCell *l; int listidx; + + /* skip expensive processing when estimating a single clause */ bool single_clause_optimization = true; /* - * The optimization of skipping to clause_selectivity_ext for single - * clauses means we can't improve join estimates with a single join - * clause but additional baserel restrictions. So we disable it when - * estimating joins. + * Disable the single-clause optimization when estimating a join clause. + * + * The optimization skips clause_selectivity_ext when estimating a single + * clause, but for join clauses it would mean we can't consider both the + * join clause and the baserel restrictions. So we disable the optimization + * when estimating a join clause. * - * XXX Not sure if this is the right way to do it, but more elaborate - * checks would mostly negate the whole point of the optimization. - * The (Var op Var) patch has the same issue. + * XXX Not sure if this is the best way to deal with the optimization. We + * could make it more elaborate in various ways, but increasing the cost + * of the checks might negate the whole point of the optimization. * - * XXX An alternative might be making clause_selectivity_ext smarter - * and make it use the join extended stats there. But that seems kinda - * against the whole point of the optimization (skipping expensive - * stuff) and it's making other parts more complex. + * XXX Alternatively we could make clause_selectivity_ext smarter and + * combine the join clauses and baserel restrictions there. But that seems + * somewhat against the whole point of the optimization (skipping expensive + * stuff) and it'd making other parts more complex. * * XXX Maybe this should check if there are at least some restrictions * on some base relations, which seems important. But then again, that @@ -164,6 +168,7 @@ clauselist_selectivity_ext(PlannerInfo *root, clause = (Node *) rinfo->clause; } + /* disable optimization for join clauses */ single_clause_optimization = !treat_as_join_clause(root, clause, rinfo, varRelid, sjinfo); } @@ -209,7 +214,7 @@ clauselist_selectivity_ext(PlannerInfo *root, */ if (use_extended_stats && statext_try_join_estimates(root, clauses, varRelid, jointype, sjinfo, - estimatedclauses)) + estimatedclauses)) { s1 *= statext_clauselist_join_selectivity(root, clauses, varRelid, jointype, sjinfo, diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 80872cc7daa..d3e1dde73d1 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -2643,23 +2643,28 @@ make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, * statext_find_matching_mcv * Search for a MCV covering all the attributes and expressions. * - * We pick the statistics to use for join estimation. The statistics object has - * to have MCV, and we require it to match all the join conditions, because it - * makes the estimation simpler. + * Picks the extended statistics object to estimate join clause. The statistics + * object has to have a MCV, and we require it to match all the join conditions + * (be it plain attribute or an expression), as it makes the estimation simpler. * - * If there are multiple candidate statistics objects (matching all join clauses), - * we pick the smallest one, and we also consider additional conditions on - * the base relations to restrict the MCV items used for estimation (using - * conditional probability). + * If there are multiple applicable candidate statistics objects (matching all + * join clauses), picks the narrowest one. But we also consider additional + * restrictions on base relations that we can use to filter the MCV items + * (to calculate conditional probability). + * + * XXX How exactly this balances the "narrowest" and "additional conditions"? + * Which criteria we prefer? * * XXX The requirement that all the attributes need to be covered might be * too strong. We could relax this and and require fewer matches (at least two, * if counting the additional conditions), and we might even apply multiple * statistics etc. But that would require matching statistics on both sides of - * the join, while now we simply know the statistics match. We don't really - * expect many candidate MCVs, so this simple approach seems sufficient. And - * the joins usually use only one or two columns, so there's not much room - * for applying multiple statistics anyway. + * the join (using a statistics on a subset of conditions one one side means + * we need a matching statistics on the other side too). While now we simply + * know the statistics will match. We don't really expect many candidate MCVs, + * so this simple approach seems sufficient. And the joins usually use only one + * or two columns, so there's not much room for applying multiple statistics + * anyway. */ StatisticExtInfo * statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, @@ -2679,12 +2684,7 @@ statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, if (stat->kind != STATS_EXT_MCV) continue; - /* - * Ignore MCVs not covering all the attributes/expressions. - * - * XXX Maybe we shouldn't be so strict and consider only partial - * matches for join clauses too? - */ + /* Ignore MCVs not covering all the attributes/expressions. */ if (!bms_is_subset(attnums, stat->keys) || !stat_covers_expressions(stat, exprs, NULL)) continue; @@ -2697,8 +2697,8 @@ statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, } /* - * OK, we have two candidate statistics objects and we need to decide - * which one to keep. We'll use two simple heuristics: + * We have two candidate statistics objects and we need to decide which + * one to keep. We'll use two simple heuristics: * * (a) We prefer smaller statistics (fewer columns), on the assumption * that it represents a larger fraction of the data (due to having fewer @@ -2723,6 +2723,23 @@ statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, * as well pick regular statistics for the column/expression, but it's * not clear if that actually exists (so we might reject the stats here * and then fail to find something simpler/better). + * + * XXX I'm not sure about the preceding comment. Why would we find a MCV + * list for a single condition here, but not for the single attribute? + * Would the "partial" extended MCV even be useful? + */ + + /* + * Match additional baserel conditions for the two statistics. + * + * XXX Shouldn't we keep this too? If there are more than 2 candidates, + * we'll end up recalculating the conditions for the statistics we kept + * from the preceding loop. Perhaps we could/should even pass the + * conditions to the caller? + * + * XXX Or maybe we should simply "count" the restrictions here, instead + * of constructing a list? Probably not a meaningful difference in CPU + * costs or a memory leak. */ conditions1 = statext_determine_join_restrictions(root, rel, stat); conditions2 = statext_determine_join_restrictions(root, rel, mcv); @@ -2734,7 +2751,10 @@ statext_find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, continue; } - /* The statistics seem about equal, so just use the smaller one. */ + /* The statistics seem about equal, so just use the narrower one. + * + * XXX Maybe we should have a function/macro to count the keys/exprs? + */ if (bms_num_members(mcv->keys) + list_length(mcv->exprs) > bms_num_members(stat->keys) + list_length(stat->exprs)) { @@ -2803,7 +2823,7 @@ statext_determine_join_restrictions(PlannerInfo *root, RelOptInfo *rel, * relation for now. * * Similar to treat_as_join_clause, but we place additional restrictions - * on the conditions. + * on the conditions, to make sure it can be estimated using extended stats. */ static bool statext_is_supported_join_clause(PlannerInfo *root, Node *clause, @@ -2900,6 +2920,8 @@ statext_is_supported_join_clause(PlannerInfo *root, Node *clause, * * This is supposed to be a quick/cheap check to decide whether to expend * more effort on applying extended statistics to join clauses. + * + * XXX Probably should document arguments, see statext_mcv_clauselist_selectivity. */ bool statext_try_join_estimates(PlannerInfo *root, List *clauses, int varRelid, @@ -3098,6 +3120,10 @@ statext_build_join_pairs(PlannerInfo *root, List *clauses, int varRelid, * * XXX Can we have cases with indexes above 1? Probably for clauses mixing * vars from 3 relations, but statext_is_supported_join_clause rejects those. + * + * XXX Name should probably start with statext_ too. + * + * XXX The 0/1 index seems a bit weird. Is there a better way to do this? */ static RelOptInfo * extract_relation_info(PlannerInfo *root, JoinPairInfo *info, int index, @@ -3196,6 +3222,8 @@ extract_relation_info(PlannerInfo *root, JoinPairInfo *info, int index, * * XXX This should probably return a flag identifying whether it's the * left or right argument. + * + * XXX Name should probably start with statext_ too. */ static Node * get_expression_for_rel(PlannerInfo *root, RelOptInfo *rel, Node *clause) @@ -3241,6 +3269,8 @@ get_expression_for_rel(PlannerInfo *root, RelOptInfo *rel, Node *clause) * But currently that does not happen, because clauselist_selectivity_ext * treats a single clause as a special case (and we don't apply extended * statistics in that case yet). + * + * XXX Isn't the preceding comment stale? We skip the optimization, no? */ Selectivity statext_clauselist_join_selectivity(PlannerInfo *root, List *clauses, int varRelid, diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index a85f896d53a..f156fda555e 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -15,7 +15,7 @@ #define EXTENDED_STATS_INTERNAL_H #include "statistics/statistics.h" -#include "utils/lsyscache.h" +#include "utils/lsyscache.h" /* XXX is this needed? */ #include "utils/sortsupport.h" typedef struct -- 2.45.2