From 53c7c239715824278c2abe19c15abdb1ed3d7d91 Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Tue, 7 Feb 2017 10:47:49 +0530 Subject: [PATCH 06/11] Canonical partition scheme. For a single-level partitioned table, annotate RelOptInfo of a partitioned table with canonical partition scheme. All partitioned tables with the same partitioning scheme share the same canonical partitioning scheme. We store the RelOptInfos corresponding to the partitions in RelOptInfo of the partitioned table. Those are arranged in the same order as the partition bound indices in the partition scheme. We do not handle multi-level partitioned tables since the inheritance hierarchy does not retain the partition hierarchy. All the partitions at any level appear as children of the top-level partitioned table, which makes it hard to associate a partition relation with its corresponding partition bounds. Multi-level partitioned tables will be handled in a separate patch. --- src/backend/optimizer/path/allpaths.c | 48 +++++++ src/backend/optimizer/util/plancat.c | 232 +++++++++++++++++++++++++++++++++ src/include/nodes/relation.h | 51 ++++++++ 3 files changed, 331 insertions(+) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index d8fac14..0eb56f3 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -829,6 +829,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, double *parent_attrsizes; int nattrs; ListCell *l; + int nparts; /* * Initialize to compute size estimates for whole append relation. @@ -850,6 +851,18 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, nattrs = rel->max_attr - rel->min_attr + 1; parent_attrsizes = (double *) palloc0(nattrs * sizeof(double)); + /* + * For a partitioned table, allocate an array to hold RelOptInfos of the + * partitions. It will be filled while handling the children below. 
+ */ + if (rel->part_scheme) + { + nparts = rel->part_scheme->nparts; + rel->part_rels = (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); + } + else + nparts = 0; + foreach(l, root->append_rel_list) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); @@ -879,6 +892,30 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); /* + * Two partitioned tables with the same partitioning scheme have their + * partition bounds arranged in the same order. The order of partition + * OIDs in RelOptInfo corresponds to the partition bound order. Thus + * the OIDs of matching partitions from both the tables are placed at + * the same position in the array of partition OIDs in the respective + * RelOptInfos. Arranging RelOptInfos of partitions in the same order + * as their OIDs makes it easy to find the RelOptInfos of matching + * partitions for partition-wise join. + */ + if (rel->part_scheme) + { + int cnt_parts; + + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + if (rel->part_oids[cnt_parts] == childRTE->relid) + { + Assert(!rel->part_rels[cnt_parts]); + rel->part_rels[cnt_parts] = childrel; + } + } + } + + /* * We have to copy the parent's targetlist and quals to the child, * with appropriate substitution of variables. However, only the * baserestrictinfo quals are needed before we can check for @@ -1130,6 +1167,17 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, } } + /* Should have found all the childrels of a partitioned relation. 
*/ + if (rel->part_scheme) + { + int cnt_parts; + + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + if (!rel->part_rels[cnt_parts]) + elog(ERROR, "could not find the RelOptInfo of a partition with oid %u", + rel->part_oids[cnt_parts]); + } + if (has_live_children) { /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 7836e6b..01ba885 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -27,6 +27,7 @@ #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/partition.h" #include "catalog/pg_am.h" #include "foreign/fdwapi.h" @@ -63,6 +64,13 @@ static List *get_relation_constraints(PlannerInfo *root, bool include_notnull); static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); +static List **build_baserel_partition_key_exprs(Relation relation, + Index varno); +static PartitionScheme find_partition_scheme(struct PlannerInfo *root, + Relation rel); +static void get_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation, bool inhparent); + /* @@ -412,6 +420,9 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, /* Collect info about relation's foreign keys, if relevant */ get_relation_foreign_keys(root, rel, relation, inhparent); + /* Collect info about relation's partitioning scheme, if any. */ + get_relation_partition_info(root, rel, relation, inhparent); + heap_close(relation, NoLock); /* @@ -1716,3 +1727,224 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) heap_close(relation, NoLock); return result; } + +/* + * get_relation_partition_info + * + * Retrieves partitioning information for a given relation. + * + * Partitioning scheme, partition key expressions and OIDs of partitions are + * added to the given RelOptInfo. 
A partitioned table can participate in the + * query as a simple relation or an inheritance parent. Only the latter can have + * child relations, and hence partitions. From the point of view of the query + * optimizer only such relations are considered to be partitioned. Hence + * partitioning information is set only for an inheritance parent. + */ +static void +get_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation, bool inhparent) +{ + /* No partitioning information for an unpartitioned relation. */ + if (relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE || + !inhparent || + !(rel->part_scheme = find_partition_scheme(root, relation))) + { + rel->partexprs = NULL; + rel->part_rels = NULL; + rel->part_oids = NULL; + return; + } + + rel->partexprs = build_baserel_partition_key_exprs(relation, rel->relid); + rel->part_oids = RelationGetPartitionDesc(relation)->oids; + + /* + * RelOptInfos of the partitions will be filled in when we build those for + * the child relations. + */ + rel->part_rels = NULL; + return; +} + +/* + * find_partition_scheme + * + * The function returns a canonical partition scheme which exactly matches the + * partitioning properties of the given relation if one exists in the list of + * canonical partitioning schemes maintained in PlannerInfo. If none of the + * existing partitioning schemes match, the function creates a canonical + * partition scheme and adds it to the list. + * + * For an unpartitioned table or for a multi-level partitioned table it returns + * NULL. See comments in the function for more details. + */ +static PartitionScheme +find_partition_scheme(PlannerInfo *root, Relation relation) +{ + PartitionKey part_key = RelationGetPartitionKey(relation); + PartitionDesc part_desc = RelationGetPartitionDesc(relation); + ListCell *lc; + int nparts; + int partnatts; + int cnt_pks; + int cnt_parts; + PartitionScheme part_scheme = NULL; + + /* No partition scheme for an unpartitioned relation. 
*/ + if (!part_desc || !part_key) + return NULL; + + nparts = part_desc->nparts; + partnatts = part_key->partnatts; + + /* + * For a multi-level partitioned table, we do not retain the partitioning + * hierarchy while expanding RTE for the topmost parent. Thus the number of + * children as per root->append_rel_list does not match the number of + * partitions specified in the partition descriptor and hence the + * partitioning scheme of a multi-level partitioned table does not reflect the + * true picture. So for now, treat a multi-level partitioned table as not + * partitioned. + */ + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + if (has_subclass(part_desc->oids[cnt_parts])) + return NULL; + } + + /* Search for a matching partition scheme and return it if one is found. */ + foreach (lc, root->part_schemes) + { + part_scheme = lfirst(lc); + + /* Match number of partitions and partitioning strategy. */ + if (nparts != part_scheme->nparts || + part_key->strategy != part_scheme->strategy || + partnatts != part_scheme->partnatts) + continue; + + /* Match the partition key types. */ + for (cnt_pks = 0; cnt_pks < partnatts; cnt_pks++) + { + /* + * For types, it suffices to match the type id, mod and collation; + * len, byval and align are dependent on the first two. + */ + if (part_key->partopfamily[cnt_pks] != part_scheme->partopfamily[cnt_pks] || + part_key->partopcintype[cnt_pks] != part_scheme->partopcintype[cnt_pks] || + part_key->parttypid[cnt_pks] != part_scheme->key_types[cnt_pks] || + part_key->parttypmod[cnt_pks] != part_scheme->key_typmods[cnt_pks] || + part_key->parttypcoll[cnt_pks] != part_scheme->key_collations[cnt_pks]) + break; + } + + /* Some partition key didn't match. Check next partitioning scheme. */ + if (cnt_pks < partnatts) + continue; + + if (!partition_bounds_equal(part_key, part_desc->boundinfo, + part_scheme->boundinfo)) + continue; + + /* Found matching partition scheme. */ + return part_scheme; + } + + /* Did not find matching partition scheme. 
Create one. */ + part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); + + /* Copy partition bounds/lists. */ + part_scheme->nparts = part_desc->nparts; + part_scheme->strategy = part_key->strategy; + part_scheme->boundinfo = part_desc->boundinfo; + + /* Store partition key information. */ + part_scheme->partnatts = part_key->partnatts; + + part_scheme->partopfamily = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopfamily, part_key->partopfamily, + sizeof(Oid) * partnatts); + + part_scheme->partopcintype = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopcintype, part_key->partopcintype, + sizeof(Oid) * partnatts); + + part_scheme->key_types = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->key_types, part_key->parttypid, + sizeof(Oid) * partnatts); + + part_scheme->key_typmods = (int32 *) palloc(sizeof(int32) * partnatts); + memcpy(part_scheme->key_typmods, part_key->parttypmod, + sizeof(int32) * partnatts); + + part_scheme->key_collations = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->key_collations, part_key->parttypcoll, + sizeof(Oid) * partnatts); + + /* Add the partitioning scheme to PlannerInfo. */ + root->part_schemes = lappend(root->part_schemes, part_scheme); + + return part_scheme; +} + +/* + * build_baserel_partition_key_exprs + * + * Collect partition key expressions for a given base relation. The function + * converts any single column partition keys into corresponding Var nodes. It + * restamps Var nodes in partition key expressions by given varno. The + * partition key expressions are returned as an array of single element Lists + * to be stored in RelOptInfo of the base relation. 
+ */ +static List ** +build_baserel_partition_key_exprs(Relation relation, Index varno) +{ + PartitionKey part_key = RelationGetPartitionKey(relation); + int num_pkexprs; + int cnt_pke; + List **partexprs; + ListCell *lc; + + if (!part_key || part_key->partnatts <= 0) + return NULL; + + num_pkexprs = part_key->partnatts; + partexprs = (List **) palloc(sizeof(List *) * num_pkexprs); + lc = list_head(part_key->partexprs); + + for (cnt_pke = 0; cnt_pke < num_pkexprs; cnt_pke++) + { + AttrNumber attno = part_key->partattrs[cnt_pke]; + Expr *pkexpr; + + if (attno != InvalidAttrNumber) + { + /* Single column partition key is stored as a Var node. */ + Form_pg_attribute att_tup; + + if (attno < 0) + att_tup = SystemAttributeDefinition(attno, + relation->rd_rel->relhasoids); + else + att_tup = relation->rd_att->attrs[attno - 1]; + + pkexpr = (Expr *) makeVar(varno, attno, att_tup->atttypid, + att_tup->atttypmod, + att_tup->attcollation, 0); + } + else + { + if (lc == NULL) + elog(ERROR, "wrong number of partition key expressions"); + + /* Re-stamp the expressions with given varno. */ + pkexpr = (Expr *) copyObject(lfirst(lc)); + ChangeVarNodes((Node *) pkexpr, 1, varno, 0); + lc = lnext(lc); + } + + partexprs[cnt_pke] = list_make1(pkexpr); + } + + return partexprs; +} diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 643be54..4f99184 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -15,6 +15,7 @@ #define RELATION_H #include "access/sdir.h" +#include "catalog/partition.h" #include "lib/stringinfo.h" #include "nodes/params.h" #include "nodes/parsenodes.h" @@ -261,6 +262,9 @@ typedef struct PlannerInfo List *distinct_pathkeys; /* distinctClause pathkeys, if any */ List *sort_pathkeys; /* sortClause pathkeys, if any */ + List *part_schemes; /* Canonicalised partition schemes + * used in the query. 
*/ + List *initial_rels; /* RelOptInfos we are now trying to join */ /* Use fetch_upper_rel() to get any particular upper rel */ @@ -321,6 +325,38 @@ typedef struct PlannerInfo ((root)->simple_rte_array ? (root)->simple_rte_array[rti] : \ rt_fetch(rti, (root)->parse->rtable)) +/* + * Partitioning scheme + * Structure to hold partitioning scheme for a given relation. + * + * Multiple relations may be partitioned in the same way. The relations + * resulting from joining such relations may be partitioned in the same way as + * the joining relations. Similarly, relations derived from such relations by + * grouping, sorting may be partitioned in the same way as the underlying + * scan relations. All such relations partitioned in the same way share the + * partitioning scheme. + * + * PlannerInfo stores a list of distinct "canonical" partitioning schemes. + * RelOptInfo of a partitioned relation holds the pointer to "canonical" + * partitioning scheme. + */ +typedef struct PartitionSchemeData +{ + /* Information about partitions */ + int nparts; /* number of partitions */ + PartitionBoundInfo boundinfo; /* Partition bounds/lists */ + + /* Information about partition keys */ + char strategy; /* partition strategy */ + int16 partnatts; /* number of partition attributes */ + Oid *partopfamily; /* OIDs of operator families */ + Oid *partopcintype; /* OIDs of opclass declared input data types */ + Oid *key_types; /* OIDs of partition key data types. */ + int32 *key_typmods; /* typmods of partition keys. */ + Oid *key_collations; /* OIDs of collations of partition keys. 
*/ +} PartitionSchemeData; + +typedef struct PartitionSchemeData *PartitionScheme; /*---------- * RelOptInfo @@ -531,6 +567,7 @@ typedef struct RelOptInfo PlannerInfo *subroot; /* if subquery */ List *subplan_params; /* if subquery */ int rel_parallel_workers; /* wanted number of parallel workers */ + Oid *part_oids; /* OIDs of partitions */ /* Information about foreign tables and foreign joins */ Oid serverid; /* identifies server for the table or join */ @@ -549,6 +586,20 @@ typedef struct RelOptInfo List *joininfo; /* RestrictInfo structures for join clauses * involving this rel */ bool has_eclass_joins; /* T means joininfo is incomplete */ + + /* For all the partitioned relations. */ + PartitionScheme part_scheme; /* Partitioning scheme. */ + struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, + * stored in the same order as bounds + * or lists in PartitionScheme. + */ + List **partexprs; /* Array of list of partition key + * expressions. For base relations + * these are one element lists. For + * join there may be as many elements + * as the number of joining + * relations. + */ } RelOptInfo; /* -- 1.7.9.5