src/backend/optimizer/path/allpaths.c | 9 +- src/backend/optimizer/path/joinpath.c | 9 ++ src/backend/optimizer/path/joinrels.c | 129 ++++++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 6 +- src/backend/optimizer/util/appendinfo.c | 12 ++- src/backend/optimizer/util/relnode.c | 14 +-- src/include/optimizer/paths.h | 10 +- src/test/regress/expected/partition_join.out | 144 +++++++++++++++++++++++++++ src/test/regress/sql/partition_join.sql | 64 ++++++++++++ 9 files changed, 376 insertions(+), 21 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index db3a68a..69c3cb2 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1275,7 +1275,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, } /* Add paths to the append relation. */ - add_paths_to_append_rel(root, rel, live_childrels); + add_paths_to_append_rel(root, rel, live_childrels, NIL); } @@ -1292,7 +1292,8 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, */ void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels) + List *live_childrels, + List *original_partitioned_rels) { List *subpaths = NIL; bool subpaths_valid = true; @@ -1304,7 +1305,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *all_child_pathkeys = NIL; List *all_child_outers = NIL; ListCell *l; - List *partitioned_rels = NIL; + List *partitioned_rels = original_partitioned_rels; double partial_rows = -1; /* If appropriate, consider parallel append */ @@ -3717,7 +3718,7 @@ generate_partitionwise_join_paths(PlannerInfo *root, RelOptInfo *rel) } /* Build additional paths for this rel from child-join paths. */ - add_paths_to_append_rel(root, rel, live_children); + add_paths_to_append_rel(root, rel, live_children, NIL); list_free(live_children); } diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index dc28b56..9444758 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -324,6 +324,15 @@ add_paths_to_joinrel(PlannerInfo *root, if (set_join_pathlist_hook) set_join_pathlist_hook(root, joinrel, outerrel, innerrel, jointype, &extra); + + /* + * 7. If outer relation is delivered from partition-tables, consider + * distributing inner relation to every partition-leaf prior to + * append these leafs. + */ + try_asymmetric_partitionwise_join(root, joinrel, + outerrel, innerrel, + jointype, &extra); } /* diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 6a480ab..6c35d35 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -16,6 +16,7 @@ #include "miscadmin.h" #include "optimizer/appendinfo.h" +#include "optimizer/cost.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" @@ -1532,6 +1533,134 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, } } +static List * +extract_asymmetric_partitionwise_subjoin(PlannerInfo *root, + RelOptInfo *joinrel, + AppendPath *append_path, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra) +{ + List *result = NIL; + ListCell *lc; + + foreach (lc, append_path->subpaths) + { + Path *child_path = lfirst(lc); + RelOptInfo *child_rel = child_path->parent; + Relids child_join_relids; + RelOptInfo *child_join_rel; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + AppendRelInfo **appinfos; + int nappinfos; + + child_join_relids = bms_union(child_rel->relids, + inner_rel->relids); + appinfos = find_appinfos_by_relids(root, child_join_relids, + &nappinfos); + child_sjinfo = build_child_join_sjinfo(root, extra->sjinfo, + child_rel->relids, + inner_rel->relids); + child_restrictlist = (List *) + adjust_appendrel_attrs(root, (Node *)extra->restrictlist, + nappinfos, appinfos); + pfree(appinfos); + + child_join_rel = find_join_rel(root, child_join_relids); + if (!child_join_rel) + { + child_join_rel = build_child_join_rel(root, + child_rel, + inner_rel, + joinrel, + child_restrictlist, + child_sjinfo, + jointype); + if (!child_join_rel) + return NIL; + } + populate_joinrel_with_paths(root, + child_rel, + inner_rel, + child_join_rel, + child_sjinfo, + child_restrictlist); + /* Give up if asymmetric partition-wise join is not available */ + if (child_join_rel->pathlist == NIL) + return NIL; + set_cheapest(child_join_rel); + + result = lappend(result, child_join_rel); + } + return result; +} + +void +try_asymmetric_partitionwise_join(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra) +{ + ListCell *lc; + + if (!enable_partitionwise_join) + return; + if (IS_OTHER_REL(outer_rel) || IS_OTHER_REL(inner_rel)) + return; + if (jointype != JOIN_INNER && + jointype != JOIN_LEFT) + return; + + foreach (lc, outer_rel->pathlist) + { + AppendPath *append_path = lfirst(lc); + + /* + * MEMO: We assume this pathlist keeps at least one AppendPath that + * represents partitioned table-scan, symmetric or asymmetric + * partition-wise join. It is not correct right now, however, a hook + * on add_path() to give additional decision for path removel allows + * to retain this kind of AppendPath, regardless of its cost. + */ + if (IsA(append_path, AppendPath) && + append_path->partitioned_rels != NIL) + { + List **join_rel_level_saved; + List *live_childrels = NIL; + + join_rel_level_saved = root->join_rel_level; + PG_TRY(); + { + /* temporary disables "dynamic programming" algorithm */ + root->join_rel_level = NULL; + + live_childrels = + extract_asymmetric_partitionwise_subjoin(root, + joinrel, + append_path, + inner_rel, + jointype, + extra); + } + PG_CATCH(); + { + root->join_rel_level = join_rel_level_saved; + PG_RE_THROW(); + } + PG_END_TRY(); + root->join_rel_level = join_rel_level_saved; + + if (live_childrels != NIL) + add_paths_to_append_rel(root, joinrel, live_childrels, + append_path->partitioned_rels); + break; + } + } +} + /* * Construct the SpecialJoinInfo for a child-join by translating * SpecialJoinInfo for the join between parents. left_relids and right_relids diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 17c5f08..c7a1fc9 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7181,7 +7181,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, } /* Build new paths for this relation by appending child paths. */ - add_paths_to_append_rel(root, rel, live_children); + add_paths_to_append_rel(root, rel, live_children, NIL); } /* @@ -7334,7 +7334,7 @@ create_partitionwise_grouping_paths(PlannerInfo *root, Assert(partially_grouped_live_children != NIL); add_paths_to_append_rel(root, partially_grouped_rel, - partially_grouped_live_children); + partially_grouped_live_children, NIL); /* * We need call set_cheapest, since the finalization step will use the @@ -7349,7 +7349,7 @@ create_partitionwise_grouping_paths(PlannerInfo *root, { Assert(grouped_live_children != NIL); - add_paths_to_append_rel(root, grouped_rel, grouped_live_children); + add_paths_to_append_rel(root, grouped_rel, grouped_live_children, NIL); } } diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 16d3151..7496cb9 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -187,8 +187,9 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, context.nappinfos = nappinfos; context.appinfos = appinfos; - /* If there's nothing to adjust, don't call this function. */ - Assert(nappinfos >= 1 && appinfos != NULL); + /* If there's nothing to adjust, just return a duplication */ + if (nappinfos == 0) + return copyObject(node); /* * Must be prepared to start with a Query or a bare expression tree. @@ -709,11 +710,11 @@ AppendRelInfo ** find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) { AppendRelInfo **appinfos; + int nrooms = bms_num_members(relids); int cnt = 0; int i; - *nappinfos = bms_num_members(relids); - appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * *nappinfos); + appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * nrooms); i = -1; while ((i = bms_next_member(relids, i)) >= 0) @@ -721,9 +722,10 @@ find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) AppendRelInfo *appinfo = root->append_rel_array[i]; if (!appinfo) - elog(ERROR, "child rel %d not found in append_rel_array", i); + continue; appinfos[cnt++] = appinfo; } + *nappinfos = cnt; return appinfos; } diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 8541538..c1a36b5 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -778,11 +778,8 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, AppendRelInfo **appinfos; int nappinfos; - /* Only joins between "other" relations land here. */ - Assert(IS_OTHER_REL(outer_rel) && IS_OTHER_REL(inner_rel)); - - /* The parent joinrel should have consider_partitionwise_join set. */ - Assert(parent_joinrel->consider_partitionwise_join); + /* Either of relations must be "other" relation at least. */ + Assert(IS_OTHER_REL(outer_rel) || IS_OTHER_REL(inner_rel)); joinrel->reloptkind = RELOPT_OTHER_JOINREL; joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids); @@ -837,8 +834,11 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->nullable_partexprs = NULL; joinrel->partitioned_child_rels = NIL; - joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, - inner_rel->top_parent_relids); + joinrel->top_parent_relids = + bms_union(IS_OTHER_REL(outer_rel) ? + outer_rel->top_parent_relids : outer_rel->relids, + IS_OTHER_REL(inner_rel) ? + inner_rel->top_parent_relids : inner_rel->relids); /* Compute information relevant to foreign relations. */ set_foreign_rel_properties(joinrel, outer_rel, inner_rel); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 7345137..e70ebe7 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -109,7 +109,12 @@ extern void mark_dummy_rel(RelOptInfo *rel); extern bool have_partkey_equi_join(RelOptInfo *joinrel, RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, List *restrictlist); - +extern void try_asymmetric_partitionwise_join(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra); /* * equivclass.c * routines for managing EquivalenceClasses @@ -233,6 +238,7 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root, EquivalenceClass *eclass, Oid opfamily, int strategy, bool nulls_first); extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels); + List *live_childrels, + List *original_partitioned_rels); #endif /* PATHS_H */ diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 1296edc..d838713 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -2003,3 +2003,147 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = Filter: (b = 0) (16 rows) +-- +-- For asymmetric partition-wise join +-- +CREATE TABLE prt5 (hkey int, a int, b int) PARTITION BY HASH(hkey); +CREATE TABLE prt5_p0 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 0); +CREATE TABLE prt5_p1 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 1); +CREATE TABLE prt5_p2 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 2); +CREATE TABLE t5_1 (aid int, alabel text); +CREATE TABLE t5_2 (bid int, blabel text); +INSERT INTO prt5 (SELECT x, (1000.0 * random())::int, + (1000.0 * random())::int + FROM generate_series(1,1000000) x); +INSERT INTO t5_1 (SELECT x, md5(x::text) FROM generate_series(-200, 800) x); +INSERT INTO t5_2 (SELECT x, md5(x::text) FROM generate_series(-200, 800) x); +VACUUM ANALYZE prt5; +VACUUM ANALYZE t5_1; +VACUUM ANALYZE t5_2; +SET max_parallel_workers_per_gather = 0; +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------------- + Append + -> Hash Join + Hash Cond: (prt5_p0.a = t5_1.aid) + -> Seq Scan on prt5_p0 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_p1.a = t5_1.aid) + -> Seq Scan on prt5_p1 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_p2.a = t5_1.aid) + -> Seq Scan on prt5_p2 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) +(19 rows) + +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + QUERY PLAN +------------------------------------------------------------ + Append + -> Hash Join + Hash Cond: (prt5_p0.a = t5_1.aid) + -> Hash Join + Hash Cond: (prt5_p0.b = t5_2.bid) + -> Seq Scan on prt5_p0 + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) + -> Hash Join + Hash Cond: (prt5_p1.a = t5_1.aid) + -> Hash Join + Hash Cond: (prt5_p1.b = t5_2.bid) + -> Seq Scan on prt5_p1 + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) + -> Hash Join + Hash Cond: (prt5_p2.a = t5_1.aid) + -> Hash Join + Hash Cond: (prt5_p2.b = t5_2.bid) + -> Seq Scan on prt5_p2 + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) +(34 rows) + +-- unable to extract non-partitioned right relation +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 RIGHT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +----------------------------------------------- + Hash Right Join + Hash Cond: (prt5_p0.a = t5_1.aid) + Join Filter: (t5_1.alabel ~~ '%abc%'::text) + -> Append + -> Seq Scan on prt5_p0 + -> Seq Scan on prt5_p1 + -> Seq Scan on prt5_p2 + -> Hash + -> Seq Scan on t5_1 +(9 rows) + +-- left side can be extracted, but no cost benefit +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 LEFT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------- + Hash Left Join + Hash Cond: (prt5_p0.a = t5_1.aid) + -> Append + -> Seq Scan on prt5_p0 + -> Seq Scan on prt5_p1 + -> Seq Scan on prt5_p2 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) +(9 rows) + +-- validation of the results with/without asymmetric partition-wise join +SELECT * INTO pg_temp.result01a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; +SELECT * INTO pg_temp.result02a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; +SET enable_partitionwise_join = off; +SELECT * INTO pg_temp.result01b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; +SELECT * INTO pg_temp.result02b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; +SELECT * FROM pg_temp.result01a EXCEPT SELECT * FROM pg_temp.result01b; + hkey | a | b | aid | alabel +------+---+---+-----+-------- +(0 rows) + +SELECT * FROM pg_temp.result02a EXCEPT SELECT * FROM pg_temp.result02b; + hkey | a | b | aid | alabel | bid | blabel +------+---+---+-----+--------+-----+-------- +(0 rows) + +RESET max_parallel_workers_per_gather; diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index db9a6b4..8ad54b4 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -435,3 +435,67 @@ ANALYZE prt2; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + +-- +-- For asymmetric partition-wise join +-- +CREATE TABLE prt5 (hkey int, a int, b int) PARTITION BY HASH(hkey); +CREATE TABLE prt5_p0 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 0); +CREATE TABLE prt5_p1 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 1); +CREATE TABLE prt5_p2 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 2); +CREATE TABLE t5_1 (aid int, alabel text); +CREATE TABLE t5_2 (bid int, blabel text); + +INSERT INTO prt5 (SELECT x, (1000.0 * random())::int, + (1000.0 * random())::int + FROM generate_series(1,1000000) x); +INSERT INTO t5_1 (SELECT x, md5(x::text) FROM generate_series(-200, 800) x); +INSERT INTO t5_2 (SELECT x, md5(x::text) FROM generate_series(-200, 800) x); + +VACUUM ANALYZE prt5; +VACUUM ANALYZE t5_1; +VACUUM ANALYZE t5_2; + +SET max_parallel_workers_per_gather = 0; +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +-- unable to extract non-partitioned right relation +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 RIGHT JOIN t5_1 ON a = aid AND alabel like '%abc%'; +-- left side can be extracted, but no cost benefit +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 LEFT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +-- validation of the results with/without asymmetric partition-wise join +SELECT * INTO pg_temp.result01a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +SELECT * INTO pg_temp.result02a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +SET enable_partitionwise_join = off; + +SELECT * INTO pg_temp.result01b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +SELECT * INTO pg_temp.result02b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +SELECT * FROM pg_temp.result01a EXCEPT SELECT * FROM pg_temp.result01b; +SELECT * FROM pg_temp.result02a EXCEPT SELECT * FROM pg_temp.result02b; + +RESET max_parallel_workers_per_gather; + +