From 1d4fb1214b3f6b216654825985fb3ef2c5e045e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= Date: Mon, 24 Feb 2020 20:16:42 +0800 Subject: [PATCH] Erase the distinctClause if the result is unique by definition For a single relation, we can tell it by any one of the following is true: 1. The pk is in the target list. 2. The uk is in the target list and the columns is not null 3. The columns in group-by clause is also in the target list for relation join, we can tell it by: if every relation in the jointree yield a unique result set,then the final result is unique as well regardless the join method. --- src/backend/nodes/bitmapset.c | 18 ++ src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/plan/analyzejoins.c | 246 +++++++++++++++- src/backend/optimizer/plan/planner.c | 28 ++ src/backend/utils/misc/guc.c | 10 + src/include/nodes/bitmapset.h | 1 + src/include/optimizer/cost.h | 1 + src/include/optimizer/planmain.h | 2 + src/test/regress/expected/aggregates.out | 13 +- src/test/regress/expected/join.out | 16 +- .../regress/expected/select_distinct_2.out | 276 ++++++++++++++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/select_distinct_2.sql | 84 ++++++ 13 files changed, 679 insertions(+), 20 deletions(-) create mode 100644 src/test/regress/expected/select_distinct_2.out create mode 100644 src/test/regress/sql/select_distinct_2.sql diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 648cc1a7eb..5cb6924a29 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -1167,3 +1167,21 @@ bms_hash_value(const Bitmapset *a) return DatumGetUInt32(hash_any((const unsigned char *) a->words, (lastword + 1) * sizeof(bitmapword))); } + +/* + * bms_array_free + * + * free the element in the array one by one, free the array as well at last + */ +void +bms_array_free(Bitmapset **bms_array, int len) +{ + int idx; + if (bms_array == NULL) + return; + for(idx = 0 ; idx < len; idx++) + { + bms_free(bms_array[idx]); + } + pfree(bms_array); +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index b5a0033721..dde16b5d44 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -138,6 +138,7 @@ bool enable_partitionwise_aggregate = false; bool enable_parallel_append = true; bool enable_parallel_hash = true; bool enable_partition_pruning = true; +bool enable_distinct_elimination = true; typedef struct { diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index d0ff660284..a15ba80808 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -22,6 +22,7 @@ */ #include "postgres.h" +#include "access/relation.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/joininfo.h" @@ -30,7 +31,10 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/tlist.h" +#include "parser/parsetree.h" #include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/relcache.h" /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); @@ -801,9 +805,248 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) if (l == NULL) /* had matches for all? */ return true; } + return query_is_distinct_agg(query, colnos, opids); +} + +/* + * scan_non_semi_anti_relids + * + * scan jointree to get non-semi/anti join rtindex. + */ +static void +scan_non_semi_anti_relids(Node* jtnode, Relids* relids) +{ + if (jtnode == NULL) + return; + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + + *relids = bms_add_member(*relids, varno); + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *l; + + foreach(l, f->fromlist) + scan_non_semi_anti_relids(lfirst(l), relids); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + + scan_non_semi_anti_relids(j->larg, relids); + if (j->jointype != JOIN_SEMI && j->jointype != JOIN_ANTI) + { + scan_non_semi_anti_relids(j->rarg, relids); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + +} + +/* + * query_distinct_through_join + * + * check if the result is unique after the join. if every relation + * yields a unique result, the result is unique as well + */ +bool +query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids) +{ + RangeTblEntry *rte; + int rt_index; + Relids non_semi_anti_relids = NULL, tmp = NULL; + ListCell *lc1, *lc2; + + /* be filled with metadata and find_nonnull_var */ + Bitmapset **non_null_var_per_table = NULL; + Query *query = root->parse; + int max_len; + + /* be used for relation_has_unique_for */ + List **non_null_expr_per_table = NULL; + List **non_null_opids_per_table = NULL; + bool ret = true; + int idx = 0; + + if (query->hasTargetSRFs) + return false; + /* remove the relids for right semi/anti */ + scan_non_semi_anti_relids((Node*)query->jointree, &non_semi_anti_relids); + + rt_index = -1; + while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 ) + { + List *indexlist = root->simple_rel_array[rt_index]->indexlist; + bool found = false; + rte = rt_fetch(rt_index, query->rtable); + + /* for the subquery, we just handle some simple case */ + if (rte->rtekind == RTE_SUBQUERY) + { + Query *subquery = rte->subquery; + List *sub_opnos = NIL; + List *sub_opids = NIL; + if (!query_supports_distinctness(subquery)) + { + ret = false; + goto done; + } + + forboth(lc1, colnos, lc2, opids) + { + TargetEntry *tle = get_tle_by_resno(query->targetList, lfirst_int(lc1)); + Var *var; + if (!IsA(tle->expr, Var)) + continue; + var =(Var*)tle->expr; + if (var->varno == rt_index) + { + sub_opnos = lappend_int(sub_opnos, var->varattno); + sub_opids = lappend_oid(sub_opids, lfirst_oid(lc2)); + } + } + + if (query_is_distinct_for(subquery, sub_opnos, sub_opids)) + { + tmp = bms_add_member(tmp, rt_index); + } + else + { + ret = false; + goto done; + } + } + else if (rte->rtekind != RTE_RELATION) + { + ret = false; + goto done; + } + else + { + foreach(lc1, indexlist) + { + IndexOptInfo *ind = lfirst_node(IndexOptInfo, lc1); + if (ind->unique && ind->immediate && (ind->indpred == NIL || ind->predOK)) + { + found = true; + break; + } + } + + /* if any relation has no pk/uk index, we return fast */ + if (!found) + { + ret = false; + goto done; + } + } + } + + non_semi_anti_relids = bms_del_members(non_semi_anti_relids, tmp); /* - * Otherwise, a set-returning function in the query's targetlist can + * we know all the base rte is RTE_RELATION and every one has uk/pk index + */ + + max_len = list_length(query->rtable) + 1; + non_null_var_per_table = palloc0(max_len * sizeof(Bitmapset *)); + non_null_expr_per_table = palloc0(max_len * sizeof(Bitmapset *)); + non_null_opids_per_table = palloc0(max_len * sizeof(Bitmapset *)); + + /* fill non_null_var_per_table with restrictinfo */ + foreach(lc1, find_nonnullable_vars(query->jointree->quals)) + { + Var *var; + + if (!IsA(lfirst(lc1), Var)) + continue; + var = lfirst_node(Var, lc1); + if (var->varno == INNER_VAR || + var->varno == OUTER_VAR || + var->varno == INDEX_VAR) + continue; + non_null_var_per_table[var->varno] = bms_add_member( + non_null_var_per_table[var->varno], var->varattno); + } + + rt_index = -1; + /* fill non_null_var_per_table with catalog */ + while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 ) + { + int attr_idx = 0; + Relation relation; + TupleDesc desc; + + rte = rt_fetch(rt_index, query->rtable); + relation = relation_open(rte->relid, AccessShareLock); + desc = relation->rd_att; + + for(; attr_idx < desc->natts; attr_idx++) + { + if (!desc->attrs[attr_idx].attnotnull) + continue; + non_null_var_per_table[rt_index] = bms_add_member( + non_null_var_per_table[rt_index], attr_idx+1); + } + relation_close(relation, AccessShareLock); + } + + /* fiter all the colnos and opids which is not null */ + forboth(lc1, colnos, lc2, opids) + { + int colno = lfirst_int(lc1); + TargetEntry *tle = get_tle_by_resno(query->targetList, colno); + Var *var = NULL; + /* We don't know the varno if the target tle->expr is not a Var */ + if (!IsA(tle->expr, Var)) + continue; + var = (Var *)tle->expr; + if (!bms_is_member(var->varattno, non_null_var_per_table[var->varno])) + continue; + non_null_expr_per_table[var->varno] = lappend( + non_null_expr_per_table[var->varno], tle->expr); + non_null_opids_per_table[var->varno] = lappend_oid( + non_null_opids_per_table[var->varno], lfirst_oid(lc2)); + } + + rt_index = -1; + while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 ) + { + + /* if any one can't yield a unique result, we return fast */ + if (!relation_has_unique_index_for(root, root->simple_rel_array[rt_index], + NIL, + non_null_expr_per_table[rt_index], + non_null_opids_per_table[rt_index])) + { + ret = false; + goto done; + } + } +done: + bms_array_free(non_null_var_per_table, max_len); + for(idx = 0; idx < max_len; idx++) + { + list_free(non_null_expr_per_table[idx]); + list_free(non_null_opids_per_table[idx]); + } + return ret; +} + +bool +query_is_distinct_agg(Query *query, List *colnos, List *opids) +{ + ListCell *l; + Oid opid; + + /* + * a set-returning function in the query's targetlist can * result in returning duplicate rows, despite any grouping that might * occur before tlist evaluation. (If all tlist SRFs are within GROUP BY * columns, it would be safe because they'd be expanded before grouping. @@ -901,7 +1144,6 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) return true; } } - /* * XXX Are there any other cases in which we can easily see the result * must be distinct? diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d6f2153593..690fab676c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -22,8 +22,10 @@ #include "access/htup_details.h" #include "access/parallel.h" #include "access/sysattr.h" +#include "access/relation.h" #include "access/table.h" #include "access/xact.h" +#include "catalog/index.h" #include "catalog/pg_constraint.h" #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" @@ -35,6 +37,7 @@ #include "lib/bipartite_match.h" #include "lib/knapsack.h" #include "miscadmin.h" +#include "nodes/bitmapset.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #ifdef OPTIMIZER_DEBUG @@ -64,6 +67,8 @@ #include "utils/rel.h" #include "utils/selfuncs.h" #include "utils/syscache.h" +#include "utils/typcache.h" + /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -4737,6 +4742,29 @@ create_distinct_paths(PlannerInfo *root, Path *path; ListCell *lc; + if (enable_distinct_elimination) + { + List *colnos = NIL; + List *opnos = NIL; + ListCell *lc; + + Assert(parse->distinctClause != NIL); + + foreach(lc, parse->distinctClause) + { + SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); + int idx = sgc->tleSortGroupRef; + TargetEntry *tle = get_tle_by_resno(parse->targetList, idx); + if (tle->resjunk) + continue; + colnos = lappend_int(colnos, idx); + opnos = lappend_oid(opnos, sgc->eqop); + } + + if (query_is_distinct_agg(parse, colnos, opnos) || + query_distinct_through_join(root, colnos, opnos)) + return input_rel; + } /* For now, do all work in the (DISTINCT, NULL) upperrel */ distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e44f71e991..fa798dd564 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1064,6 +1064,16 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_distinct_elimination", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables plan-time and run-time unique elimination."), + gettext_noop("Allows the query planner to remove the uncecessary distinct clause."), + GUC_EXPLAIN + }, + &enable_distinct_elimination, + true, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index b7b18a0b68..5d6f04ffa9 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -117,4 +117,5 @@ extern int bms_prev_member(const Bitmapset *a, int prevbit); /* support for hashtables using Bitmapsets as keys: */ extern uint32 bms_hash_value(const Bitmapset *a); +extern void bms_array_free(Bitmapset **bms_array, int len); #endif /* BITMAPSET_H */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index cb012ba198..4fa5d32df6 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -64,6 +64,7 @@ extern PGDLLIMPORT bool enable_partitionwise_aggregate; extern PGDLLIMPORT bool enable_parallel_append; extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; +extern PGDLLIMPORT bool enable_distinct_elimination; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index eab486a621..ebd4f24577 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -100,6 +100,8 @@ extern List *remove_useless_joins(PlannerInfo *root, List *joinlist); extern void reduce_unique_semijoins(PlannerInfo *root); extern bool query_supports_distinctness(Query *query); extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); +extern bool query_is_distinct_agg(Query *query, List *colnos, List *opids); +extern bool query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids); extern bool innerrel_is_unique(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, List *restrictlist, bool force_cache); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index f457b5b150..6712571578 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -870,14 +870,12 @@ explain (costs off) select distinct max(unique2) from tenk1; QUERY PLAN --------------------------------------------------------------------- - HashAggregate - Group Key: $0 + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result -(7 rows) +(5 rows) select distinct max(unique2) from tenk1; max @@ -1036,7 +1034,7 @@ explain (costs off) select distinct min(f1), max(f1) from minmaxtest; QUERY PLAN --------------------------------------------------------------------------------------------- - Unique + Result InitPlan 1 (returns $0) -> Limit -> Merge Append @@ -1059,10 +1057,7 @@ explain (costs off) -> Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest_8 Index Cond: (f1 IS NOT NULL) -> Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest_9 - -> Sort - Sort Key: ($0), ($1) - -> Result -(26 rows) +(23 rows) select distinct min(f1), max(f1) from minmaxtest; min | max diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 761376b007..3f6595d53b 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4433,17 +4433,17 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id; - QUERY PLAN --------------------------------------- - Merge Right Join - Merge Cond: (b.id = d.a) - -> Unique - -> Sort - Sort Key: b.id, b.c_id - -> Seq Scan on b + QUERY PLAN +--------------------------------- + Merge Left Join + Merge Cond: (d.a = s.id) -> Sort Sort Key: d.a -> Seq Scan on d + -> Sort + Sort Key: s.id + -> Subquery Scan on s + -> Seq Scan on b (9 rows) -- check join removal works when uniqueness of the join condition is enforced diff --git a/src/test/regress/expected/select_distinct_2.out b/src/test/regress/expected/select_distinct_2.out new file mode 100644 index 0000000000..2ece95a806 --- /dev/null +++ b/src/test/regress/expected/select_distinct_2.out @@ -0,0 +1,276 @@ +create table select_distinct_a(pk1 int, pk2 char(20), uk1 char(20) not null, uk2 int, e int, primary key(pk1, pk2)); +create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2); +create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2)); +-- distinct erased since (pk1, pk2) +explain (costs off) select distinct * from select_distinct_a; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a +(1 row) + +-- distinct can't be reased since since we required all the uk must be not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a; + QUERY PLAN +------------------------------------- + HashAggregate + Group Key: uk1, uk2 + -> Seq Scan on select_distinct_a +(3 rows) + +-- distinct ereased since uk + not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a + Filter: (uk2 IS NOT NULL) +(2 rows) + +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a + Filter: (uk2 > 1) +(2 rows) + +-- distinct erased due to group by +explain select distinct e from select_distinct_a group by e; + QUERY PLAN +-------------------------------------------------------------------------- + HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: e + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=390 width=4) +(3 rows) + +-- distinct erased due to the restirctinfo +explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c'; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Index Scan using select_distinct_a_pkey on select_distinct_a (cost=0.15..8.17 rows=1 width=84) + Index Cond: ((pk1 = 1) AND (pk2 = 'c'::bpchar)) +(2 rows) + +-- test join +set enable_mergejoin to off; +set enable_hashjoin to off; +insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3); +insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5); +-- Cartesian join +explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null; + QUERY PLAN +--------------------------------------------- + Nested Loop + -> Seq Scan on select_distinct_b b + -> Materialize + -> Seq Scan on select_distinct_a a + Filter: (uk2 IS NOT NULL) +(5 rows) + +select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4; + uk1 | uk2 | pk1 | pk2 +----------------------+-----+----------------------+----- + a | 0 | a | 0 + a | 0 | d | 0 + a | 0 | e | 0 + A | 0 | a | 0 + A | 0 | d | 0 + A | 0 | e | 0 + c | 0 | a | 0 + c | 0 | d | 0 + c | 0 | e | 0 +(9 rows) + +-- left join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop Left Join (cost=0.00..2310.28 rows=760 width=176) + Join Filter: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Materialize (cost=0.00..15.85 rows=390 width=92) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) +(5 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + 3 | c | | +(5 rows) + +-- right join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Nested Loop Left Join (cost=0.15..140.88 rows=760 width=176) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) + -> Index Only Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.31 rows=2 width=88) + Index Cond: (pk1 = b.a) +(4 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + | | d | 0 +(5 rows) + +-- full join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------- + Hash Full Join (cost=10000000018.77..10000000060.26 rows=760 width=176) + Hash Cond: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Hash (cost=13.90..13.90 rows=390 width=92) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) +(5 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + 3 | c | | + | | d | 0 +(6 rows) + +-- distinct can't be erased since b.pk2 is missed +explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------------------- + Unique (cost=10000000096.63..10000000104.23 rows=760 width=172) + -> Sort (cost=10000000096.63..10000000098.53 rows=760 width=172) + Sort Key: a.pk1, a.pk2, b.pk1 + -> Hash Full Join (cost=10000000018.77..10000000060.26 rows=760 width=172) + Hash Cond: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Hash (cost=13.90..13.90 rows=390 width=88) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=88) +(8 rows) + +-- Semi/anti join +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b); + QUERY PLAN +------------------------------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b + -> Index Only Scan using select_distinct_a_pkey on select_distinct_a + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b); + QUERY PLAN +--------------------------------------- + Seq Scan on select_distinct_a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on select_distinct_b +(4 rows) + +-- we also can handle some limited subquery +explain select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Nested Loop (cost=15.02..107.38 rows=390 width=184) + -> HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b (cost=0.00..13.90 rows=390 width=4) + -> Index Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.42 rows=2 width=180) + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3; + pk1 | pk2 | uk1 | uk2 | e | a +-----+----------------------+----------------------+-----+---+--- + 1 | a | a | 0 | 1 | 1 + 1 | b | A | 0 | 2 | 1 +(2 rows) + +explain select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Nested Loop (cost=15.02..107.38 rows=390 width=184) + -> HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b (cost=0.00..13.90 rows=390 width=4) + -> Index Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.42 rows=2 width=180) + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3; + pk1 | pk2 | uk1 | uk2 | e | a +-----+----------------------+----------------------+-----+---+--- + 1 | a | a | 0 | 1 | 1 + 1 | b | A | 0 | 2 | 1 +(2 rows) + +-- Distinct On +-- can't erase since pk2 is missed +explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a; + QUERY PLAN +------------------------------------------- + Unique + -> Sort + Sort Key: pk1 + -> Seq Scan on select_distinct_a +(4 rows) + +-- ok to erase +explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a +(1 row) + +-- test some view. +create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain select * from distinct_v1; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(2 rows) + +alter table select_distinct_a alter column uk1 drop not null; +explain select * from distinct_v1; + QUERY PLAN +--------------------------------------------------------------------------- + HashAggregate (cost=15.84..17.84 rows=200 width=88) + Group Key: select_distinct_a.uk1, select_distinct_a.uk2 + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(4 rows) + +alter table select_distinct_a alter column uk1 set not null; +-- test generic plan +prepare pt as select * from distinct_v1; +explain execute pt; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(2 rows) + +alter table select_distinct_a alter column uk1 drop not null; +explain execute pt; + QUERY PLAN +--------------------------------------------------------------------------- + HashAggregate (cost=15.84..17.84 rows=200 width=88) + Group Key: select_distinct_a.uk1, select_distinct_a.uk2 + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(4 rows) + +drop view distinct_v1; +drop table select_distinct_a; +drop table select_distinct_b; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index a1c90eb905..e053214f9d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -73,6 +73,7 @@ select name, setting from pg_settings where name like 'enable%'; name | setting --------------------------------+--------- enable_bitmapscan | on + enable_distinct_elimination | on enable_gathermerge | on enable_hashagg | on enable_hashjoin | on @@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(17 rows) +(18 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/select_distinct_2.sql b/src/test/regress/sql/select_distinct_2.sql new file mode 100644 index 0000000000..2fc54e7e36 --- /dev/null +++ b/src/test/regress/sql/select_distinct_2.sql @@ -0,0 +1,84 @@ +create table select_distinct_a(pk1 int, pk2 char(20), uk1 char(20) not null, uk2 int, e int, primary key(pk1, pk2)); +create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2); +create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2)); + +-- distinct erased since (pk1, pk2) +explain (costs off) select distinct * from select_distinct_a; + +-- distinct can't be reased since since we required all the uk must be not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a; + +-- distinct ereased since uk + not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1; + +-- distinct erased due to group by +explain select distinct e from select_distinct_a group by e; + +-- distinct erased due to the restirctinfo +explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c'; + +-- test join +set enable_mergejoin to off; +set enable_hashjoin to off; + +insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3); +insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5); + +-- Cartesian join +explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null; +select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4; + + +-- left join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;; + +-- right join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + +-- full join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + +-- distinct can't be erased since b.pk2 is missed +explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + + +-- Semi/anti join +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b); +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b); + +-- we also can handle some limited subquery +explain select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a; +select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3; + +explain select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a; +select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3; + +-- Distinct On +-- can't erase since pk2 is missed +explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a; +-- ok to erase +explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a; + + +-- test some view. +create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain select * from distinct_v1; + +alter table select_distinct_a alter column uk1 drop not null; +explain select * from distinct_v1; + +alter table select_distinct_a alter column uk1 set not null; + +-- test generic plan +prepare pt as select * from distinct_v1; +explain execute pt; +alter table select_distinct_a alter column uk1 drop not null; +explain execute pt; + +drop view distinct_v1; +drop table select_distinct_a; +drop table select_distinct_b; -- 2.21.0