From 59d6fa8fd1dbccd60cf4a72663157fc531dbd5f9 Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Fri, 29 Mar 2019 15:20:22 +0100
Subject: [PATCH v12 3/3] Reorder by values distribution

---
 src/backend/optimizer/path/pathkeys.c    | 218 +++++++++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  33 ++++-
 src/backend/utils/misc/guc.c             |  21 ++-
 src/include/optimizer/paths.h            |  10 ++
 src/test/regress/expected/aggregates.out |  16 +--
 5 files changed, 288 insertions(+), 10 deletions(-)

diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 65e53ef854..ac79bc4975 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -24,6 +24,7 @@
 #include "nodes/plannodes.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/pathnode.h"
+#include "optimizer/plancat.h"
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
@@ -332,6 +333,11 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/* Debug switches for GROUP BY key reordering (registered as GUCs in guc.c) */
+bool debug_group_by_reorder_by_pathkeys = true;
+bool debug_cheapest_group_by = true;
+/************************************************************/
+
 /*
  * Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function
  * returns new lists, original GROUP BY lists stay untouched.
@@ -345,6 +351,9 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	ListCell   *key;
 	int			n;
 
+	if (debug_group_by_reorder_by_pathkeys == false)
+		return 0;
+
 	if (pathkeys == NIL || *group_pathkeys == NIL)
 		return 0;
 
@@ -384,6 +393,215 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	return n;
 }
 
+/*
+ * get_width_multiplier
+ *
+ * Returns the relative cost of comparing two values, based on their width.
+ * The idea is that wider values are more expensive to compare.
+ */
+static double
+get_width_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0;	/* negative means "not determined yet" */
+
+	if (IsA(expr, RelabelType))		/* look through one relabel node */
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find the actual width stat in the corresponding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo *rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* No actual stats were found, so use an estimate based on the type */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Every value in pgsql is passed as a Datum, so no operation on a value
+	 * can be cheaper than the same operation on a Datum
+	 */
+	if (width <= sizeof(Datum))
+		return sizeof(Datum);
+
+	return width;
+}
+
+/*
+ * Order the tail of the group pathkeys list by descending uniqueness, which
+ * speeds up sorting. Returns newly allocated lists; old ones stay untouched.
+ * n_preordered is the length of the list head whose order must be kept.
+ */
+void
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List *target_list,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	struct
+	{
+		PathKey			*pathkey;
+		SortGroupClause	*sgc;
+		Node			*pathkeyExpr;
+	}
+		   *keys, tmp;
+	int		nkeys = list_length(*group_pathkeys) - n_preordered;
+	List   *pathkeyExprList = NIL,
+		   *new_group_pathkeys = NIL,
+		   *new_group_clauses = NIL;
+	ListCell *cell;
+	int		i = 0, n_keys_to_est;
+
+	if (!debug_cheapest_group_by)
+		return;
+
+	if (nkeys < 2)
+		return; /* nothing to do */
+
+	/*
+	 * Nothing to do here, since reordering of the group clauses to match
+	 * ORDER BY has already been performed in preprocess_groupclause
+	 */
+	if (n_preordered == 0 && root->sort_pathkeys)
+		return;
+
+	keys = palloc(nkeys * sizeof(*keys));
+
+	/*
+	 * Collect pathkey, clause and expression of every key after the head
+	 */
+	for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered))
+	{
+		PathKey *pathkey = (PathKey *) lfirst(cell);
+
+		keys[i].pathkey = pathkey;
+		keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+											  *group_clauses);
+		keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc,
+													   target_list);
+		i++;
+	}
+
+	/*
+	 * Greedily find the cheapest-to-sort column order: first the column with
+	 * the best score, then the best pair (whose first column was fixed by the
+	 * previous step), then the best triple, and so on.
+	 */
+	for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++)
+	{
+		ListCell *tail_cell;
+		int best_i = 0;
+		double best_est_num_groups = -1;
+
+		/* extend the list of columns and remember its last cell */
+		pathkeyExprList = lappend(pathkeyExprList, NULL);
+		tail_cell = list_tail(pathkeyExprList);
+
+		/*
+		 * Find the best last column, i.e. the one with the highest adjusted
+		 * number of groups; the previous columns are already chosen
+		 */
+		for(i = n_keys_to_est - 1; i < nkeys; i++)
+		{
+			double est_num_groups;
+			Expr *expr = (Expr *) keys[i].pathkeyExpr;
+			PathKey *pathkey = keys[i].pathkey;
+			EquivalenceMember *em = (EquivalenceMember *)
+				linitial(pathkey->pk_eclass->ec_members);
+
+			lfirst(tail_cell) = keys[i].pathkeyExpr;
+			est_num_groups = estimate_num_groups(root, pathkeyExprList,
+												 nrows, NULL);
+			est_num_groups /= get_width_multiplier(root, expr);
+
+			if (em->em_datatype != InvalidOid)
+			{
+				Oid			sortop;
+				QualCost	costs = {0, 0};
+
+				sortop = get_opfamily_member(pathkey->pk_opfamily,
+											 em->em_datatype, em->em_datatype,
+											 pathkey->pk_strategy);
+				if (OidIsValid(sortop))		/* no operator: nothing to cost */
+					add_function_cost(root, get_opcode(sortop), NULL, &costs);
+				if (costs.per_tuple > 0)	/* avoid dividing by zero */
+					est_num_groups /= costs.per_tuple;
+			}
+
+			if (est_num_groups > best_est_num_groups)
+			{
+				best_est_num_groups = est_num_groups;
+				best_i = i;
+			}
+		}
+
+		/* Save the best choice by swapping it into slot n_keys_to_est - 1 */
+		lfirst(tail_cell) = keys[best_i].pathkeyExpr;
+		if (best_i != n_keys_to_est - 1)
+		{
+			tmp = keys[n_keys_to_est - 1];
+			keys[n_keys_to_est - 1] = keys[best_i];
+			keys[best_i] = tmp;
+		}
+	}
+	list_free(pathkeyExprList);
+
+	/*
+	 * Construct the result lists: the first n_preordered entries keep their
+	 * original order, the rest follow the order of the keys array
+	 */
+	i = 0;
+	foreach(cell, *group_pathkeys)
+	{
+		PathKey *pathkey;
+		SortGroupClause *sgc;
+
+		if (i < n_preordered)
+		{
+			pathkey = (PathKey *) lfirst(cell);
+			sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+										  *group_clauses);
+		}
+		else
+		{
+			pathkey = keys[i - n_preordered].pathkey;
+			sgc = keys[i - n_preordered].sgc;
+		}
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+
+		i++;
+	}
+
+	pfree(keys);
+
+	/* Append the GROUP BY clauses that are not in the new list yet */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 247e39d6ff..d480551d3c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6389,7 +6389,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			bool		is_sorted;
 			List	   *group_pathkeys = root->group_pathkeys,
 					   *group_clauses = parse->groupClause;
-			int			n_preordered_groups;
+			int			n_preordered_groups = 0;
 
 			if (parse->groupingSets)
 			{
@@ -6413,11 +6413,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			{
 				/* Sort the cheapest-total path if it isn't already sorted */
 				if (!is_sorted)
+				{
+					if (!parse->groupingSets)
+						get_cheapest_group_keys_order(root,
+													  path->rows,
+													  extra->targetList,
+													  &group_pathkeys,
+													  &group_clauses,
+													  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				/* Now decide what to stick atop it */
 				if (parse->groupingSets)
@@ -6491,6 +6500,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 					{
 						if (path != partially_grouped_rel->cheapest_total_path)
 							continue;
+						get_cheapest_group_keys_order(root,
+													  path->rows,
+													  extra->targetList,
+													  &group_pathkeys,
+													  &group_clauses,
+													  n_preordered_groups);
 						path = (Path *) create_sort_path(root,
 														 grouped_rel,
 														 path,
@@ -6765,11 +6780,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 			{
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_path(partially_grouped_rel, (Path *)
@@ -6816,11 +6839,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f7f726b5ae..f13ea2ab5d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1952,7 +1952,26 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
-
+/*********************************************************/
+	{
+		{"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable reorder GROUP BY by pathkeys"),
+			NULL
+		},
+		&debug_group_by_reorder_by_pathkeys,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("find a cheapest order of columns in GROUP BY."),
+			NULL
+		},
+		&debug_cheapest_group_by,
+		true,
+		NULL, NULL, NULL
+	},
+/********************************************************/
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index faf6449f4d..fe4c067369 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -186,6 +186,16 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2);
 extern int group_keys_reorder_by_pathkeys(List *pathkeys,
 										  List **group_pathkeys,
 										  List **group_clauses);
+/* Debug switches for GROUP BY key reordering; registered as GUCs in guc.c */
+extern bool debug_group_by_reorder_by_pathkeys;
+extern bool debug_cheapest_group_by;
+/* Reorders the not-preordered tail of the GROUP BY keys; see pathkeys.c */
+extern void get_cheapest_group_keys_order(PlannerInfo *root,
+										  double nrows,
+										  List *target_list,
+										  List **group_pathkeys,
+										  List **group_clauses,
+										  int n_preordered);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 							   Relids required_outer,
 							   CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 265c996d5e..9afc1a827d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2228,9 +2228,9 @@ SELECT count(*) FROM btg GROUP BY v, p;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p
+   Group Key: p, v
    ->  Sort
-         Sort Key: v, p
+         Sort Key: p, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2239,9 +2239,9 @@ SELECT count(*) FROM btg GROUP BY v, p, c;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p, c
+   Group Key: p, c, v
    ->  Sort
-         Sort Key: v, p, c
+         Sort Key: p, c, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2261,9 +2261,9 @@ SELECT count(*) FROM btg GROUP BY v, p, d, c;
           QUERY PLAN          
 ------------------------------
  GroupAggregate
-   Group Key: v, p, d, c
+   Group Key: p, d, c, v
    ->  Sort
-         Sort Key: v, p, d, c
+         Sort Key: p, d, c, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2318,9 +2318,9 @@ SELECT count(*) FROM btg GROUP BY p, d, e;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: p, d, e
+   Group Key: p, e, d
    ->  Sort
-         Sort Key: p, d, e
+         Sort Key: p, e, d
          ->  Seq Scan on btg
 (5 rows)
 
-- 
2.16.4