Home > mailing lists
Re: BUG #16840: Rows not found in table partitioned by hash when not all partitions exists - Mailing list pgsql-bugs

From	Tom Lane
Subject	Re: BUG #16840: Rows not found in table partitioned by hash when not all partitions exists
Date	January 27, 2021 22:54:23
Msg-id	2503822.1611788063@sss.pgh.pa.us Whole thread Raw
In response to	Re: BUG #16840: Rows not found in table partitioned by hash when not all partitions exists (Tom Lane <tgl@sss.pgh.pa.us>)
Responses	Re: BUG #16840: Rows not found in table partitioned by hash when not all partitions exists
List	pgsql-bugs
Tree view
I wrote:
> Hmm, seems to be a case of faulty partition exclusion, because the
> plan isn't scanning anything:

Here's a proposed patch for this.  The core of the problem is confusion
around the number of entries in the PartitionBoundInfoData.indexes array.
Each of the three types of partitioning has a different rule for that,
despite which we were expecting assorted code to know what to do, and
some places got it wrong for hash --- even hash-specific code :-(

I propose here to solve that by explicitly storing the number of entries
in PartitionBoundInfoData, and thereby removing the need for partition-
strategy-independent code to know anything about the rules.  I think
we can get away with that in the back branches by adding "nindexes"
at the end of the struct.  This could break extensions that are
manufacturing their own PartitionBoundInfoData structs, but it seems
unlikely that there are any.

Most of the patch just straightforwardly sets or uses the new field.
Notably, partition_bounds_equal() and partition_bounds_copy() get
significantly simpler and safer.  The actual bug fix is in
get_matching_hash_bounds() and perform_pruning_combine_step(), where
"all partitions" needs to be 0 .. nindexes-1 not 0 .. ndatums-1.
(The reason your example fails is that the OR clause should produce
"all partitions potentially match", but because of this bug, it's
producing a bitmask that doesn't include the partition we need.)

            regards, tom lane

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 1746cb8793..746cd1e9d7 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1323,16 +1323,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
     {
         case PARTITION_STRATEGY_HASH:
             {
-                int            greatest_modulus;
                 uint64        rowHash;

-                greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
                 rowHash = compute_partition_hash_value(key->partnatts,
                                                        key->partsupfunc,
                                                        key->partcollation,
                                                        values, isnull);

-                part_index = boundinfo->indexes[rowHash % greatest_modulus];
+                part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
             }
             break;

diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c
index b9aeb77bc2..104e8b06cd 100644
--- a/src/backend/partitioning/partbounds.c
+++ b/src/backend/partitioning/partbounds.c
@@ -224,7 +224,6 @@ static int    partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
                                     Oid *partcollation,
                                     PartitionBoundInfo boundinfo,
                                     PartitionRangeBound *probe, int32 *cmpval);
-static int    get_partition_bound_num_indexes(PartitionBoundInfo b);
 static Expr *make_partition_op_expr(PartitionKey key, int keynum,
                                     uint16 strategy, Expr *arg1, Expr *arg2);
 static Oid    get_partition_operator(PartitionKey key, int col,
@@ -398,6 +397,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,

     boundinfo->ndatums = ndatums;
     boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+    boundinfo->nindexes = greatest_modulus;
     boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
     for (i = 0; i < greatest_modulus; i++)
         boundinfo->indexes[i] = -1;
@@ -530,6 +530,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,

     boundinfo->ndatums = ndatums;
     boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+    boundinfo->nindexes = ndatums;
     boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));

     /*
@@ -725,8 +726,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,

     /*
      * For range partitioning, an additional value of -1 is stored as the last
-     * element.
+     * element of the indexes[] array.
      */
+    boundinfo->nindexes = ndatums + 1;
     boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));

     for (i = 0; i < ndatums; i++)
@@ -807,45 +809,39 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
     if (b1->ndatums != b2->ndatums)
         return false;

+    if (b1->nindexes != b2->nindexes)
+        return false;
+
     if (b1->null_index != b2->null_index)
         return false;

     if (b1->default_index != b2->default_index)
         return false;

-    if (b1->strategy == PARTITION_STRATEGY_HASH)
-    {
-        int            greatest_modulus = get_hash_partition_greatest_modulus(b1);
-
-        /*
-         * If two hash partitioned tables have different greatest moduli,
-         * their partition schemes don't match.
-         */
-        if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
+    /* For all partition strategies, the indexes[] arrays have to match */
+    for (i = 0; i < b1->nindexes; i++)
+        if (b1->indexes[i] != b2->indexes[i])
             return false;

+    /* Finally, compare the datums[] arrays */
+    if (b1->strategy == PARTITION_STRATEGY_HASH)
+    {
         /*
          * We arrange the partitions in the ascending order of their moduli
          * and remainders.  Also every modulus is factor of next larger
          * modulus.  Therefore we can safely store index of a given partition
          * in indexes array at remainder of that partition.  Also entries at
          * (remainder + N * modulus) positions in indexes array are all same
-         * for (modulus, remainder) specification for any partition.  Thus
-         * datums array from both the given bounds are same, if and only if
-         * their indexes array will be same.  So, it suffices to compare
-         * indexes array.
-         */
-        for (i = 0; i < greatest_modulus; i++)
-            if (b1->indexes[i] != b2->indexes[i])
-                return false;
-
-#ifdef USE_ASSERT_CHECKING
-
-        /*
-         * Nonetheless make sure that the bounds are indeed same when the
+         * for (modulus, remainder) specification for any partition.  Thus the
+         * datums arrays from the given bounds are the same, if and only if
+         * their indexes arrays are the same.  So, it suffices to compare the
+         * indexes arrays.
+         *
+         * Nonetheless make sure that the bounds are indeed the same when the
          * indexes match.  Hash partition bound stores modulus and remainder
          * at b1->datums[i][0] and b1->datums[i][1] position respectively.
          */
+#ifdef USE_ASSERT_CHECKING
         for (i = 0; i < b1->ndatums; i++)
             Assert((b1->datums[i][0] == b2->datums[i][0] &&
                     b1->datums[i][1] == b2->datums[i][1]));
@@ -891,15 +887,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
                                   parttypbyval[j], parttyplen[j]))
                     return false;
             }
-
-            if (b1->indexes[i] != b2->indexes[i])
-                return false;
         }
-
-        /* There are ndatums+1 indexes in case of range partitions */
-        if (b1->strategy == PARTITION_STRATEGY_RANGE &&
-            b1->indexes[i] != b2->indexes[i])
-            return false;
     }
     return true;
 }
@@ -920,8 +908,8 @@ partition_bounds_copy(PartitionBoundInfo src,
     PartitionBoundInfo dest;
     int            i;
     int            ndatums;
+    int            nindexes;
     int            partnatts;
-    int            num_indexes;
     bool        hash_part;
     int            natts;

@@ -929,10 +917,9 @@ partition_bounds_copy(PartitionBoundInfo src,

     dest->strategy = src->strategy;
     ndatums = dest->ndatums = src->ndatums;
+    nindexes = dest->nindexes = src->nindexes;
     partnatts = key->partnatts;

-    num_indexes = get_partition_bound_num_indexes(src);
-
     /* List partitioned tables have only a single partition key. */
     Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);

@@ -990,8 +977,8 @@ partition_bounds_copy(PartitionBoundInfo src,
         }
     }

-    dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
-    memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
+    dest->indexes = (int *) palloc(sizeof(int) * nindexes);
+    memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);

     dest->null_index = src->null_index;
     dest->default_index = src->default_index;
@@ -2456,6 +2443,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums,
     }

     Assert(list_length(merged_indexes) == ndatums);
+    merged_bounds->nindexes = ndatums;
     merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
     pos = 0;
     foreach(lc, merged_indexes)
@@ -2889,7 +2877,7 @@ check_new_partition_bound(char *relname, Relation parent,
                                 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                                  errmsg("every hash partition modulus must be a factor of the next larger modulus")));

-                    greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
+                    greatest_modulus = boundinfo->nindexes;
                     remainder = spec->remainder;

                     /*
@@ -3282,18 +3270,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
 /*
  * get_hash_partition_greatest_modulus
  *
- * Returns the greatest modulus of the hash partition bound. The greatest
- * modulus will be at the end of the datums array because hash partitions are
- * arranged in the ascending order of their moduli and remainders.
+ * Returns the greatest modulus of the hash partition bound.
+ * This is no longer used in the core code, but we keep it around
+ * in case external modules are using it.
  */
 int
 get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
 {
     Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
-    Assert(bound->datums && bound->ndatums > 0);
-    Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
-
-    return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
+    return bound->nindexes;
 }

 /*
@@ -3697,46 +3682,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
                                 b1, b2);
 }

-/*
- * get_partition_bound_num_indexes
- *
- * Returns the number of the entries in the partition bound indexes array.
- */
-static int
-get_partition_bound_num_indexes(PartitionBoundInfo bound)
-{
-    int            num_indexes;
-
-    Assert(bound);
-
-    switch (bound->strategy)
-    {
-        case PARTITION_STRATEGY_HASH:
-
-            /*
-             * The number of the entries in the indexes array is same as the
-             * greatest modulus.
-             */
-            num_indexes = get_hash_partition_greatest_modulus(bound);
-            break;
-
-        case PARTITION_STRATEGY_LIST:
-            num_indexes = bound->ndatums;
-            break;
-
-        case PARTITION_STRATEGY_RANGE:
-            /* Range partitioned table has an extra index. */
-            num_indexes = bound->ndatums + 1;
-            break;
-
-        default:
-            elog(ERROR, "unexpected partition strategy: %d",
-                 (int) bound->strategy);
-    }
-
-    return num_indexes;
-}
-
 /*
  * get_partition_operator
  *
diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c
index fac921eea5..d0c51d800f 100644
--- a/src/backend/partitioning/partprune.c
+++ b/src/backend/partitioning/partprune.c
@@ -781,7 +781,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
     scan_default = final_result->scan_default;
     while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
     {
-        int            partindex = context->boundinfo->indexes[i];
+        int            partindex;
+
+        Assert(i < context->boundinfo->nindexes);
+        partindex = context->boundinfo->indexes[i];

         if (partindex < 0)
         {
@@ -2514,20 +2517,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
         for (i = 0; i < partnatts; i++)
             isnull[i] = bms_is_member(i, nullkeys);

-        greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
         rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
                                                values, isnull);

+        greatest_modulus = boundinfo->nindexes;
         if (partindices[rowHash % greatest_modulus] >= 0)
             result->bound_offsets =
                 bms_make_singleton(rowHash % greatest_modulus);
     }
     else
     {
-        /* Getting here means at least one hash partition exists. */
-        Assert(boundinfo->ndatums > 0);
+        /* Report all valid offsets into the boundinfo->indexes array. */
         result->bound_offsets = bms_add_range(NULL, 0,
-                                              boundinfo->ndatums - 1);
+                                              boundinfo->nindexes - 1);
     }

     /*
@@ -3389,29 +3391,24 @@ perform_pruning_combine_step(PartitionPruneContext *context,
                              PruneStepResult **step_results)
 {
     ListCell   *lc1;
-    PruneStepResult *result = NULL;
+    PruneStepResult *result;
     bool        firststep;

+    result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
+
     /*
      * A combine step without any source steps is an indication to not perform
      * any partition pruning.  Return all datum indexes in that case.
      */
-    result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
-    if (list_length(cstep->source_stepids) == 0)
+    if (cstep->source_stepids == NIL)
     {
         PartitionBoundInfo boundinfo = context->boundinfo;
-        int            rangemax;

         /*
-         * Add all valid offsets into the boundinfo->indexes array.  For range
-         * partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
-         * valid entries; otherwise there are boundinfo->ndatums.
+         * Add all valid offsets into the boundinfo->indexes array.
          */
-        rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
-            boundinfo->ndatums : boundinfo->ndatums - 1;
-
-        result->bound_offsets =
-            bms_add_range(result->bound_offsets, 0, rangemax);
+        result->bound_offsets = bms_add_range(NULL, 0,
+                                              boundinfo->nindexes - 1);
         result->scan_default = partition_bound_has_default(boundinfo);
         result->scan_null = partition_bound_accepts_nulls(boundinfo);
         return result;
diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h
index 6bd330dd93..ebf3ff1f49 100644
--- a/src/include/partitioning/partbounds.h
+++ b/src/include/partitioning/partbounds.h
@@ -30,7 +30,7 @@ struct RelOptInfo;                /* avoid including pathnodes.h here */
  * In the case of range partitioning, ndatums will typically be far less than
  * 2 * nparts, because a partition's upper bound and the next partition's lower
  * bound are the same in most common cases, and we only store one of them (the
- * upper bound).  In case of hash partitioning, ndatums will be same as the
+ * upper bound).  In case of hash partitioning, ndatums will be the same as the
  * number of partitions.
  *
  * For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,24 +46,31 @@ struct RelOptInfo;                /* avoid including pathnodes.h here */
  * the partition key's operator classes and collations.
  *
  * In the case of list partitioning, the indexes array stores one entry for
- * every datum, which is the index of the partition that accepts a given datum.
- * In case of range partitioning, it stores one entry per distinct range
- * datum, which is the index of the partition for which a given datum
- * is an upper bound.  In the case of hash partitioning, the number of the
- * entries in the indexes array is same as the greatest modulus amongst all
- * partitions.  For a given partition key datum-tuple, the index of the
- * partition which would accept that datum-tuple would be given by the entry
- * pointed by remainder produced when hash value of the datum-tuple is divided
- * by the greatest modulus.
+ * each datum-array entry, which is the index of the partition that accepts
+ * rows matching that datum.  So nindexes == ndatums.
+ *
+ * In the case of range partitioning, the indexes array stores one entry per
+ * distinct range datum, which is the index of the partition for which that
+ * datum is an upper bound (or -1 for a "gap" that has no partition).  It is
+ * convenient to have an extra -1 entry representing values above the last
+ * range datum, so nindexes == ndatums + 1.
+ *
+ * In the case of hash partitioning, the number of entries in the indexes
+ * array is the same as the greatest modulus amongst all partitions (which
+ * is a multiple of all partition moduli), so nindexes == greatest modulus.
+ * The indexes array is indexed according to the hash key's remainder modulo
+ * the greatest modulus, and it contains either the partition index accepting
+ * that remainder, or -1 if there is no partition for that remainder.
  */
 typedef struct PartitionBoundInfoData
 {
     char        strategy;        /* hash, list or range? */
-    int            ndatums;        /* Length of the datums following array */
+    int            ndatums;        /* Length of the datums[] array */
     Datum      **datums;
     PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
                                      * NULL for hash and list partitioned
                                      * tables */
+    int            nindexes;        /* Length of the indexes[] array */
     int           *indexes;        /* Partition indexes */
     int            null_index;        /* Index of the null-accepting partition; -1
                                  * if there isn't one */
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out
index c72a6d051f..bde29e38a9 100644
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -1538,26 +1538,27 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
 -- result on different machines.  See the definitions of
 -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
 --
-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
 create table hp0 partition of hp for values with (modulus 4, remainder 0);
 create table hp3 partition of hp for values with (modulus 4, remainder 3);
 create table hp1 partition of hp for values with (modulus 4, remainder 1);
 create table hp2 partition of hp for values with (modulus 4, remainder 2);
-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
- tableoid | a |   b
-----------+---+-------
- hp0      |   |
- hp0      | 1 | xxx
- hp3      | 2 | xxx
- hp1      | 1 |
- hp2      |   | xxx
- hp2      | 1 | abcde
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;
+ tableoid | a |   b   | c
+----------+---+-------+---
+ hp0      |   |       | 0
+ hp1      | 1 |       | 1
+ hp0      | 1 | xxx   | 2
+ hp2      |   | xxx   | 3
+ hp3      | 2 | xxx   | 4
+ hp2      | 1 | abcde | 5
 (6 rows)

 -- partial keys won't prune, nor would non-equality conditions
@@ -1715,6 +1716,33 @@ explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and
          Filter: (((a = 1) AND (b = 'abcde'::text)) OR ((a = 2) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS
NULL)))
 (7 rows)

+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+                 QUERY PLAN
+---------------------------------------------
+ Seq Scan on hp2 hp
+   Filter: ((a = 1) AND (b = 'abcde'::text))
+(2 rows)
+
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+                              QUERY PLAN
+----------------------------------------------------------------------
+ Seq Scan on hp2 hp
+   Filter: ((a = 1) AND (b = 'abcde'::text) AND ((c = 2) OR (c = 3)))
+(2 rows)
+
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+        QUERY PLAN
+--------------------------
+ Result
+   One-Time Filter: false
+(2 rows)
+
 drop table hp;
 --
 -- Test runtime partition pruning
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql
index ffd5fe8b0d..6ccb52ad1d 100644
--- a/src/test/regress/sql/partition_prune.sql
+++ b/src/test/regress/sql/partition_prune.sql
@@ -304,19 +304,20 @@ drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pru
 -- part_part_test_int4_ops and part_test_text_ops in insert.sql.
 --

-create table hp (a int, b text) partition by hash (a part_test_int4_ops, b part_test_text_ops);
+create table hp (a int, b text, c int)
+  partition by hash (a part_test_int4_ops, b part_test_text_ops);
 create table hp0 partition of hp for values with (modulus 4, remainder 0);
 create table hp3 partition of hp for values with (modulus 4, remainder 3);
 create table hp1 partition of hp for values with (modulus 4, remainder 1);
 create table hp2 partition of hp for values with (modulus 4, remainder 2);

-insert into hp values (null, null);
-insert into hp values (1, null);
-insert into hp values (1, 'xxx');
-insert into hp values (null, 'xxx');
-insert into hp values (2, 'xxx');
-insert into hp values (1, 'abcde');
-select tableoid::regclass, * from hp order by 1;
+insert into hp values (null, null, 0);
+insert into hp values (1, null, 1);
+insert into hp values (1, 'xxx', 2);
+insert into hp values (null, 'xxx', 3);
+insert into hp values (2, 'xxx', 4);
+insert into hp values (1, 'abcde', 5);
+select tableoid::regclass, * from hp order by c;

 -- partial keys won't prune, nor would non-equality conditions
 explain (costs off) select * from hp where a = 1;
@@ -337,6 +338,16 @@ explain (costs off) select * from hp where a = 2 and b = 'xxx';
 explain (costs off) select * from hp where a = 1 and b = 'abcde';
 explain (costs off) select * from hp where (a = 1 and b = 'abcde') or (a = 2 and b = 'xxx') or (a is null and b is
null);

+-- test pruning when not all the partitions exist
+drop table hp1;
+drop table hp3;
+explain (costs off) select * from hp where a = 1 and b = 'abcde';
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+drop table hp2;
+explain (costs off) select * from hp where a = 1 and b = 'abcde' and
+  (c = 2 or c = 3);
+
 drop table hp;

 --
pgsql-bugs by date:
From: "David G. Johnston"
Date: 27 January 2021, 22:33:37
Subject: Re: Inconsistent application of [, ...] in documentation
From: PG Bug reporting form
Date: 28 January 2021, 05:48:48
Subject: BUG #16841: psql -- \d tablename , displays "Error : column c.relhasoids does not exit"
Re: BUG #16840: Rows not found in table partitioned by hash when not all partitions exists - Mailing list pgsql-bugs

Previous

Next