>From dc82608078d14bffc6c2e174100fbe8b78dcb6fa Mon Sep 17 00:00:00 2001 From: amit Date: Wed, 27 Jul 2016 16:59:21 +0900 Subject: [PATCH 7/8] Tuple routing for partitioned tables. Both COPY FROM and INSERT are covered by this commit. Routing to foreign partitions is not supported at the moment. --- src/backend/catalog/partition.c | 289 +++++++++++++++++++++++++++++++- src/backend/commands/copy.c | 151 ++++++++++++++++- src/backend/commands/tablecmds.c | 1 + src/backend/executor/execMain.c | 58 ++++++- src/backend/executor/nodeModifyTable.c | 130 ++++++++++++++ src/backend/parser/analyze.c | 8 + src/include/catalog/partition.h | 6 + src/include/executor/executor.h | 6 + src/include/nodes/execnodes.h | 8 + src/test/regress/expected/insert.out | 52 ++++++ src/test/regress/sql/insert.sql | 25 +++ 11 files changed, 728 insertions(+), 6 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 03c7fa1..d758500 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -161,6 +161,18 @@ static Oid get_partition_operator(PartitionKey key, int col, StrategyNumber stra static List *generate_partition_qual(Relation rel, bool recurse); +/* Support get_partition_for_tuple() */ +static PartitionKeyExecInfo *BuildPartitionKeyExecInfo(Relation rel); +static void FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static int list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull); +static int range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum *tuple); + /* List partition related support functions */ static bool equal_list_info(PartitionKey key, PartitionListInfo *l1, PartitionListInfo *l2); @@ -174,6 +186,8 @@ static PartitionRangeBound *copy_range_bound(PartitionKey key, PartitionRangeBou static bool equal_range_info(PartitionKey key, PartitionRangeInfo *r1, PartitionRangeInfo *r2); 
static int32 partition_rbound_cmp(PartitionKey key, PartitionRangeBound *b1, void *arg); +static int32 partition_rbound_datum_cmp(PartitionKey key, PartitionRangeBound *bound, + void *arg); static bool partition_rbound_eq(PartitionKey key, PartitionRangeBound *b1, PartitionRangeBound *b2); typedef int32 (*partition_rbound_bsearch_cmp_fn) (PartitionKey, @@ -988,7 +1002,7 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode, pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); pd[i]->relid = RelationGetRelid(partrel); - pd[i]->pkinfo = NULL; + pd[i]->pkinfo = BuildPartitionKeyExecInfo(partrel); pd[i]->partdesc = partdesc; pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); heap_close(partrel, NoLock); @@ -1459,6 +1473,245 @@ generate_partition_qual(Relation rel, bool recurse) return result; } +/* + * BuildPartitionKeyExecInfo + * Construct a PartitionKeyExecInfo record for an open + * relation + * + * PartitionKeyExecInfo stores the information about the partition key + * that's needed when inserting tuples into a partitioned table; especially, + * partition key expression state if there are any expression columns in + * the partition key. Normally we build a PartitionKeyExecInfo for a + * partitioned table just once per command, and then use it for (potentially) + * many tuples. + * + */ +static PartitionKeyExecInfo * +BuildPartitionKeyExecInfo(Relation rel) +{ + PartitionKeyExecInfo *pkinfo; + + pkinfo = (PartitionKeyExecInfo *) palloc0(sizeof(PartitionKeyExecInfo)); + pkinfo->pi_Key = RelationGetPartitionKey(rel); + pkinfo->pi_ExpressionState = NIL; + + return pkinfo; +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. 
+ * + * pkinfo partition key execution info + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pkinfo->pi_Key->partexprs != NIL && pkinfo->pi_ExpressionState == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pkinfo->pi_ExpressionState = (List *) + ExecPrepareExpr((Expr *) pkinfo->pi_Key->partexprs, + estate); + } + + partexpr_item = list_head(pkinfo->pi_ExpressionState); + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + { + AttrNumber keycol = pkinfo->pi_Key->partattrs[i]; + Datum pkDatum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + pkDatum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + pkDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull, + NULL); + partexpr_item = lnext(partexpr_item); + } + values[i] = pkDatum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * get_partition_for_tuple + * Finds a leaf partition for tuple contained in *slot + * + * Returned value is the sequence number of the leaf partition thus found, + * or -1 if no leaf 
partition is found for the tuple. *failed_at is set + * to the OID of the partitioned table whose partition was not found in + * the latter case. + */ +int +get_partition_for_tuple(PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at) +{ + PartitionDispatch parent; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + int cur_idx; + int i; + + /* start with the root partitioned table */ + parent = pd[0]; + while(1) + { + PartitionDesc partdesc = parent->partdesc; + PartitionKeyExecInfo *pkinfo = parent->pkinfo; + + /* Quick exit */ + if (partdesc->nparts == 0) + { + *failed_at = parent->relid; + return -1; + } + + /* Extract partition key from tuple */ + FormPartitionKeyDatum(pkinfo, slot, estate, values, isnull); + + switch (pkinfo->pi_Key->strategy) + { + case PARTITION_STRATEGY_LIST: + cur_idx = list_partition_for_tuple(pkinfo->pi_Key, partdesc, + values[0], isnull[0]); + break; + + case PARTITION_STRATEGY_RANGE: + /* Disallow nulls in the partition key of the tuple */ + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + if (isnull[i]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("range partition key of row contains null"))); + + cur_idx = range_partition_for_tuple(pkinfo->pi_Key, partdesc, + values); + break; + } + + /* + * cur_idx < 0 means we failed to find a partition of this parent. + * cur_idx >= 0 means we either found the leaf partition we have been + * looking for, or the next parent to find a partition of. + */ + if (cur_idx < 0) + { + *failed_at = parent->relid; + return -1; + } + else if (parent->indexes[cur_idx] < 0) + parent = pd[-parent->indexes[cur_idx]]; + else + break; + } + + return parent->indexes[cur_idx]; +} + +/* + * list_partition_for_tuple + * Find the list partition for a tuple (arg 'value' contains the + * list partition key of the original tuple) + * + * Returns -1 if none found. 
+ */ +static int +list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull) +{ + PartitionListInfo listinfo; + int found; + + Assert(pdesc->nparts > 0); + Assert(pdesc->boundinfo->strategy == PARTITION_STRATEGY_LIST); + listinfo = pdesc->boundinfo->bounds.lists; + + if (isnull && listinfo.has_null) + return listinfo.null_index; + else if (!isnull) + { + found = partition_list_values_bsearch(key, + listinfo.values, + listinfo.nvalues, + value); + if (found >= 0) + return listinfo.indexes[found]; + } + + /* Control reaches here if isnull and !listinfo->has_null */ + return -1; +} + +/* + * range_partition_for_tuple + * Get the index of the range partition for a tuple (arg 'tuple' + * actually contains the range partition key of the original + * tuple) + * + * Returns -1 if none found. + */ +static int +range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, Datum *tuple) +{ + int offset; + PartitionRangeInfo rangeinfo; + + Assert(pdesc->nparts > 0); + Assert(pdesc->boundinfo->strategy == PARTITION_STRATEGY_RANGE); + rangeinfo = pdesc->boundinfo->bounds.ranges; + + offset = partition_rbound_bsearch(key, + rangeinfo.bounds, rangeinfo.nbounds, + tuple, partition_rbound_datum_cmp, + true, NULL); + + /* + * Offset returned is such that the bound at offset is found to be less + * or equal with the tuple. That is, the tuple belongs to the partition + * with the rangeinfo.bounds[offset] as the lower bound and + * rangeinfo.bounds[offset+1] as the upper bound, provided the latter is + * indeed an upper (!lower) bound. If it turns out otherwise, the + * corresponding index will be -1, which means no valid partition exists. 
+ */ + return rangeinfo.indexes[offset+1]; +} + /* List partition related support functions */ /* @@ -1704,6 +1957,40 @@ partition_rbound_cmp(PartitionKey key, PartitionRangeBound *b1, void *arg) } /* + * Return whether bound <, =, > partition key of tuple + * + * The 3rd argument is void * so that it can be used with + * partition_rbound_bsearch() + */ +static int32 +partition_rbound_datum_cmp(PartitionKey key, PartitionRangeBound *bound, + void *arg) +{ + Datum *datums1 = bound->datums, + *datums2 = (Datum *) arg; + int i; + int32 cmpval; + + for (i = 0; i < key->partnatts; i++) + { + if (bound->infinite[i]) + return bound->lower ? -1 : 1; + + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], + key->partcollation[i], + datums1[i], datums2[i])); + if (cmpval != 0) + break; + } + + /* If datums are equal and this is an upper bound, tuple > bound */ + if (cmpval == 0 && !bound->lower) + return -1; + + return cmpval; +} + +/* + * Return whether two range bounds are equal simply by comparing datums */ static bool diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 7a2bf94..7d76ead 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -161,6 +161,10 @@ typedef struct CopyStateData ExprState **defexprs; /* array of default att expressions */ bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; + PartitionDispatch *partition_dispatch_info; + int num_partitions; + ResultRelInfo *partitions; + TupleConversionMap **partition_tupconv_maps; /* * These variables are used to reduce overhead in textual COPY FROM. @@ -1397,6 +1401,67 @@ BeginCopy(ParseState *pstate, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("table \"%s\" does not have OIDs", RelationGetRelationName(cstate->rel)))); + + /* + * Initialize state for CopyFrom tuple routing. Watch out for + * any foreign partitions. 
+ */ + if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionDispatch *pd; + List *leaf_parts; + ListCell *cell; + int i, + num_leaf_parts; + ResultRelInfo *leaf_part_rri; + + /* Get the tuple-routing information and lock partitions */ + pd = RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, + &leaf_parts); + num_leaf_parts = list_length(leaf_parts); + cstate->partition_dispatch_info = pd; + cstate->num_partitions = num_leaf_parts; + cstate->partitions = (ResultRelInfo *) palloc(num_leaf_parts * + sizeof(ResultRelInfo)); + cstate->partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + leaf_part_rri = cstate->partitions; + i = 0; + foreach(cell, leaf_parts) + { + Relation partrel; + + /* + * All partitions locked above; will be closed after CopyFrom is + * finished. + */ + partrel = heap_open(lfirst_oid(cell), NoLock); + + /* + * Verify result relation is a valid target for the current + * operation. + */ + CheckValidResultRel(partrel, CMD_INSERT); + + InitResultRelInfo(leaf_part_rri, + partrel, + 1, /* dummy */ + false, /* no need for partition check */ + 0); + + /* Open partition indices */ + ExecOpenIndices(leaf_part_rri, false); + + if (!equalTupleDescs(tupDesc, RelationGetDescr(partrel))) + cstate->partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, + RelationGetDescr(partrel), + gettext_noop("could not convert row type")); + leaf_part_rri++; + i++; + } + } } else { @@ -2255,6 +2320,7 @@ CopyFrom(CopyState cstate) Datum *values; bool *nulls; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ ExprContext *econtext; TupleTableSlot *myslot; @@ -2281,6 +2347,7 @@ CopyFrom(CopyState cstate) * only hint about them in the view case.) 
*/ if (cstate->rel->rd_rel->relkind != RELKIND_RELATION && + cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && !(cstate->rel->trigdesc && cstate->rel->trigdesc->trig_insert_instead_row)) { @@ -2391,6 +2458,7 @@ CopyFrom(CopyState cstate) InitResultRelInfo(resultRelInfo, cstate->rel, 1, /* dummy rangetable index */ + true, /* do load partition check expression */ 0); ExecOpenIndices(resultRelInfo, false); @@ -2418,6 +2486,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || + cstate->partition_dispatch_info != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2442,7 +2511,11 @@ CopyFrom(CopyState cstate) values = (Datum *) palloc(tupDesc->natts * sizeof(Datum)); nulls = (bool *) palloc(tupDesc->natts * sizeof(bool)); - bistate = GetBulkInsertState(); + if (useHeapMultiInsert) + bistate = GetBulkInsertState(); + else + bistate = NULL; + econtext = GetPerTupleExprContext(estate); /* Set up callback to identify error line number */ @@ -2494,6 +2567,56 @@ CopyFrom(CopyState cstate) slot = myslot; ExecStoreTuple(tuple, slot, InvalidBuffer, false); + /* Determine the partition to heap_insert the tuple into */ + if (cstate->partition_dispatch_info) + { + int leaf_part_index; + TupleConversionMap *map; + + /* + * Away we go ... If we end up not finding a partition after all, + * ExecFindPartition() does not return and errors out instead. + * Otherwise, the returned value is to be used as an index into + * arrays cstate->partitions[] and cstate->partition_tupconv_maps[] that + * will get us the ResultRelInfo and TupleConversionMap for the + * partition, respectively. 
+ */ + leaf_part_index = ExecFindPartition(resultRelInfo, + cstate->partition_dispatch_info, + slot, + estate); + Assert(leaf_part_index >= 0 && + leaf_part_index < cstate->num_partitions); + + /* + * Save the old ResultRelInfo and switch to the one corresponding + * to the selected partition. + */ + saved_resultRelInfo = resultRelInfo; + resultRelInfo = cstate->partitions + leaf_part_index; + + /* We do not yet have a way to insert into a foreign partition */ + if (resultRelInfo->ri_FdwRoutine) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot route inserted tuples to a foreign table"))); + + /* + * For ExecInsertIndexTuples() to work on the partition's indexes + */ + estate->es_result_relation_info = resultRelInfo; + + /* + * We might need to convert from the parent rowtype to the + * partition rowtype. + */ + map = cstate->partition_tupconv_maps[leaf_part_index]; + if (map) + tuple = do_convert_tuple(tuple, map); + + tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + skip_tuple = false; /* BEFORE ROW INSERT Triggers */ @@ -2553,7 +2676,8 @@ CopyFrom(CopyState cstate) List *recheckIndexes = NIL; /* OK, store the tuple and create index entries for it */ - heap_insert(cstate->rel, tuple, mycid, hi_options, bistate); + heap_insert(resultRelInfo->ri_RelationDesc, tuple, mycid, + hi_options, bistate); if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, @@ -2577,6 +2701,12 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. 
*/ processed++; + + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } } } @@ -2590,7 +2720,8 @@ CopyFrom(CopyState cstate) /* Done, clean up */ error_context_stack = errcallback.previous; - FreeBulkInsertState(bistate); + if (bistate) + FreeBulkInsertState(bistate); MemoryContextSwitchTo(oldcontext); @@ -2614,6 +2745,20 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); + /* Close all partitions and indices thereof */ + if (cstate->partition_dispatch_info) + { + int i; + + for (i = 0; i < cstate->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = cstate->partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + } + FreeExecutorState(estate); /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 3b72ae3..6478211 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1293,6 +1293,7 @@ ExecuteTruncate(TruncateStmt *stmt) InitResultRelInfo(resultRelInfo, rel, 0, /* dummy rangetable index */ + false, 0); resultRelInfo++; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index c7a6347..54fb771 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -826,6 +826,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) InitResultRelInfo(resultRelInfo, resultRelation, resultRelationIndex, + true, estate->es_instrument); resultRelInfo++; } @@ -1215,6 +1216,7 @@ void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); @@ -1252,8 +1254,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ConstraintExprs = NULL; resultRelInfo->ri_junkFilter = NULL; resultRelInfo->ri_projectReturning = NULL; - resultRelInfo->ri_PartitionCheck = - 
RelationGetPartitionQual(resultRelationDesc, true); + if (load_partition_check) + resultRelInfo->ri_PartitionCheck = + RelationGetPartitionQual(resultRelationDesc, + true); } /* @@ -1316,6 +1320,7 @@ ExecGetTriggerResultRel(EState *estate, Oid relid) InitResultRelInfo(rInfo, rel, 0, /* dummy rangetable index */ + true, estate->es_instrument); estate->es_trig_target_relations = lappend(estate->es_trig_target_relations, rInfo); @@ -2990,3 +2995,52 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->planstate = NULL; epqstate->origslot = NULL; } + +/* + * ExecFindPartition -- Find a leaf partition in the partition tree rooted + * at parent, for the heap tuple contained in *slot + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition key(s) + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message, else it returns the leaf partition sequence number returned + * by get_partition_for_tuple() unchanged. + */ +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, + TupleTableSlot *slot, EState *estate) +{ + int result; + Oid failed_at; + ExprContext *econtext = GetPerTupleExprContext(estate); + + econtext->ecxt_scantuple = slot; + result = get_partition_for_tuple(pd, slot, estate, &failed_at); + if (result < 0) + { + Relation rel = resultRelInfo->ri_RelationDesc; + char *val_desc; + Bitmapset *insertedCols, + *updatedCols, + *modifiedCols; + TupleDesc tupDesc = RelationGetDescr(rel); + + insertedCols = GetInsertedColumns(resultRelInfo, estate); + updatedCols = GetUpdatedColumns(resultRelInfo, estate); + modifiedCols = bms_union(insertedCols, updatedCols); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupDesc, + modifiedCols, + 64); + Assert(OidIsValid(failed_at)); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + get_rel_name(failed_at)), + val_desc ? 
errdetail("Failing row contains %s.", val_desc) : 0)); + } + + return result; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a612b08..aa1d8b9 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -258,6 +258,7 @@ ExecInsert(ModifyTableState *mtstate, { HeapTuple tuple; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -272,6 +273,56 @@ ExecInsert(ModifyTableState *mtstate, * get information on the (current) result relation */ resultRelInfo = estate->es_result_relation_info; + + /* Determine the partition to heap_insert the tuple into */ + if (mtstate->mt_partition_dispatch_info) + { + int leaf_part_index; + TupleConversionMap *map; + + /* + * Away we go ... If we end up not finding a partition after all, + * ExecFindPartition() does not return and errors out instead. + * Otherwise, the returned value is to be used as an index into + * arrays mt_partitions[] and mt_partition_tupconv_maps[] that + * will get us the ResultRelInfo and TupleConversionMap for the + * partition, respectively. + */ + leaf_part_index = ExecFindPartition(resultRelInfo, + mtstate->mt_partition_dispatch_info, + slot, + estate); + Assert(leaf_part_index >= 0 && + leaf_part_index < mtstate->mt_num_partitions); + + /* + * Save the old ResultRelInfo and switch to the one corresponding to + * the selected partition. 
+ */ + saved_resultRelInfo = resultRelInfo; + resultRelInfo = mtstate->mt_partitions + leaf_part_index; + + /* We do not yet have a way to insert into a foreign partition */ + if (resultRelInfo->ri_FdwRoutine) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot route inserted tuples to a foreign table"))); + + /* For ExecInsertIndexTuples() to work on the partition's indexes */ + estate->es_result_relation_info = resultRelInfo; + + /* + * We might need to convert from the parent rowtype to the partition + * rowtype. + */ + map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; + if (map) + { + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, slot, InvalidBuffer, false); + } + } + resultRelationDesc = resultRelInfo->ri_RelationDesc; /* @@ -511,6 +562,12 @@ ExecInsert(ModifyTableState *mtstate, list_free(recheckIndexes); + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } + /* * Check any WITH CHECK OPTION constraints from parent views. 
We are * required to do this after testing all constraints and uniqueness @@ -1565,6 +1622,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) Plan *subplan; ListCell *l; int i; + Relation rel; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1655,6 +1713,69 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; + /* Build state for INSERT tuple routing */ + rel = mtstate->resultRelInfo->ri_RelationDesc; + if (operation == CMD_INSERT && + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionDispatch *pd; + int i, + j, + num_leaf_parts; + List *leaf_parts; + ListCell *cell; + ResultRelInfo *leaf_part_rri; + + /* Form the partition node tree and lock partitions */ + pd = RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, + &leaf_parts); + mtstate->mt_partition_dispatch_info = pd; + num_leaf_parts = list_length(leaf_parts); + mtstate->mt_num_partitions = num_leaf_parts; + mtstate->mt_partitions = (ResultRelInfo *) + palloc0(num_leaf_parts * sizeof(ResultRelInfo)); + mtstate->mt_partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + leaf_part_rri = mtstate->mt_partitions; + i = j = 0; + foreach(cell, leaf_parts) + { + Oid ftoid = lfirst_oid(cell); + Relation part_rel; + + part_rel = heap_open(ftoid, RowExclusiveLock); + + /* + * Verify result relation is a valid target for the current + * operation + */ + CheckValidResultRel(part_rel, CMD_INSERT); + + InitResultRelInfo(leaf_part_rri, + part_rel, + 1, /* dummy */ + false, /* no need for partition checks */ + eflags); + + /* Open partition indices (note: ON CONFLICT unsupported)*/ + if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && + operation != CMD_DELETE && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, false); + + if (!equalTupleDescs(RelationGetDescr(rel), + 
RelationGetDescr(part_rel))) + mtstate->mt_partition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(rel), + RelationGetDescr(part_rel), + gettext_noop("could not convert row type")); + + leaf_part_rri++; + i++; + } + } + /* * Initialize any WITH CHECK OPTION constraints if needed. */ @@ -1972,6 +2093,15 @@ ExecEndModifyTable(ModifyTableState *node) resultRelInfo); } + /* Close all partitions and indices thereof */ + for (i = 0; i < node->mt_num_partitions; i++) + { + ResultRelInfo *resultRelInfo = node->mt_partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + /* * Free the exprcontext */ diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 6901e08..c10b6c3 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -798,8 +798,16 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) + { + /* Bail out if target relation is partitioned table */ + if (pstate->p_target_rangetblentry->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT clause is not supported with partitioned tables"))); + qry->onConflict = transformOnConflictClause(pstate, stmt->onConflictClause); + } /* * If we have a RETURNING clause, we need to add the target relation to diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index fb43cc1..1d231a3 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -14,6 +14,8 @@ #define PARTITION_H #include "fmgr.h" +#include "executor/tuptable.h" +#include "nodes/execnodes.h" #include "parser/parse_node.h" #include "utils/rel.h" @@ -50,4 +52,8 @@ extern List *RelationGetPartitionQual(Relation rel, bool recurse); /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, int lockmode, List **leaf_part_oids); +extern int 
get_partition_for_tuple(PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at); #endif /* PARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 136276b..b4d09f9 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -14,6 +14,7 @@ #ifndef EXECUTOR_H #define EXECUTOR_H +#include "catalog/partition.h" #include "executor/execdesc.h" #include "nodes/parsenodes.h" @@ -188,6 +189,7 @@ extern void CheckValidResultRel(Relation resultRel, CmdType operation); extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); @@ -211,6 +213,10 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index ff8b66b..606cb21 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/tupconvert.h" #include "executor/instrument.h" #include "lib/pairingheap.h" #include "nodes/params.h" @@ -1147,6 +1148,13 @@ typedef struct ModifyTableState * tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... 
projection * target */ + struct PartitionDispatchData **mt_partition_dispatch_info; + /* Tuple-routing support info */ + int mt_num_partitions; /* Number of members in the + * following arrays */ + ResultRelInfo *mt_partitions; /* Per partition result relation */ + TupleConversionMap **mt_partition_tupconv_maps; + /* Per partition tuple conversion map */ } ModifyTableState; /* ---------------- diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 9ae6b09..d5dcb59 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -227,6 +227,58 @@ DETAIL: Failing row contains (cc, 1). -- ok insert into part_EE_FF_1_10 values ('ff', 1); insert into part_EE_FF_10_20 values ('ff', 11); +-- Check tuple routing for partitioned tables +-- fail +insert into range_parted values ('a', 0); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Failing row contains (a, 0). +-- ok +insert into range_parted values ('a', 1); +insert into range_parted values ('a', 10); +-- fail +insert into range_parted values ('a', 20); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Failing row contains (a, 20). +-- ok +insert into range_parted values ('b', 1); +insert into range_parted values ('b', 10); +select tableoid::regclass, * from range_parted; + tableoid | a | b +----------------+---+---- + part_a_1_a_10 | a | 1 + part_a_1_a_10 | a | 1 + part_a_10_a_20 | a | 10 + part_b_1_b_10 | b | 1 + part_b_10_b_20 | b | 10 + part_b_10_b_20 | b | 10 +(6 rows) + +-- ok +insert into list_parted values (null, 1); +insert into list_parted (a) values ('aA'); +-- fail (partition of part_EE_FF not found) +insert into list_parted values ('EE', 0); +ERROR: no partition of relation "part_ee_ff" found for row +DETAIL: Failing row contains (EE, 0). +insert into part_EE_FF values ('EE', 0); +ERROR: no partition of relation "part_ee_ff" found for row +DETAIL: Failing row contains (EE, 0). 
+-- ok +insert into list_parted values ('EE', 1); +insert into part_EE_FF values ('EE', 10); +select tableoid::regclass, * from list_parted; + tableoid | a | b +------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_null | | 0 + part_null | | 1 + part_ee_ff_1_10 | ff | 1 + part_ee_ff_1_10 | EE | 1 + part_ee_ff_10_20 | ff | 11 + part_ee_ff_10_20 | EE | 10 +(8 rows) + -- cleanup drop table range_parted cascade; NOTICE: drop cascades to 4 other objects diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index b6e821e..fbd30d9 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -140,6 +140,31 @@ insert into part_EE_FF_1_10 values ('cc', 1); insert into part_EE_FF_1_10 values ('ff', 1); insert into part_EE_FF_10_20 values ('ff', 11); +-- Check tuple routing for partitioned tables + +-- fail +insert into range_parted values ('a', 0); +-- ok +insert into range_parted values ('a', 1); +insert into range_parted values ('a', 10); +-- fail +insert into range_parted values ('a', 20); +-- ok +insert into range_parted values ('b', 1); +insert into range_parted values ('b', 10); +select tableoid::regclass, * from range_parted; + +-- ok +insert into list_parted values (null, 1); +insert into list_parted (a) values ('aA'); +-- fail (partition of part_EE_FF not found) +insert into list_parted values ('EE', 0); +insert into part_EE_FF values ('EE', 0); +-- ok +insert into list_parted values ('EE', 1); +insert into part_EE_FF values ('EE', 10); +select tableoid::regclass, * from list_parted; + -- cleanup drop table range_parted cascade; drop table list_parted cascade; -- 1.7.1