From 1cb76c8c19b1c0549fbba70febc32017bc04c0a2 Mon Sep 17 00:00:00 2001 From: Ilya Gladyshev Date: Thu, 23 May 2024 18:13:41 +0100 Subject: [PATCH v2] Allow CREATE INDEX CONCURRENTLY on partitioned table --- doc/src/sgml/ddl.sgml | 7 +- doc/src/sgml/ref/create_index.sgml | 14 +- src/backend/commands/indexcmds.c | 293 ++++++++++++++++++------- src/test/regress/expected/indexing.out | 127 ++++++++++- src/test/regress/sql/indexing.sql | 26 ++- 5 files changed, 367 insertions(+), 100 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 6aab79e901..f1d4a59a99 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -4314,10 +4314,9 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 As mentioned earlier, it is possible to create indexes on partitioned tables so that they are applied automatically to the entire hierarchy. This can be very convenient as not only will all existing partitions be - indexed, but any future partitions will be as well. However, one - limitation when creating new indexes on partitioned tables is that it - is not possible to use the CONCURRENTLY - qualifier, which could lead to long lock times. To avoid this, you can + indexed, but any future partitions will be as well. + CREATE INDEX ... CONCURRENTLY can incur long lock times + on huge partitioned tables, to avoid that you can use CREATE INDEX ON ONLY the partitioned table, which creates the new index marked as invalid, preventing automatic application to existing partitions. Instead, indexes can then be created individually diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 621bc0e253..2366cfd9b5 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -645,7 +645,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] If a problem arises while scanning the table, such as a deadlock or a uniqueness violation in a unique index, the CREATE INDEX - command will fail but leave behind an invalid index. This index + command will fail but leave behind an invalid index. + If this happens while build an index concurrently on a partitioned + table, the command can also leave behind valid or + invalid indexes on table partitions. The invalid index will be ignored for querying purposes because it might be incomplete; however it will still consume update overhead. The psql \d command will report such an index as INVALID: @@ -692,15 +695,6 @@ Indexes: cannot. - - Concurrent builds for indexes on partitioned tables are currently not - supported. However, you may concurrently build the index on each - partition individually and then finally create the partitioned index - non-concurrently in order to reduce the time where writes to the - partitioned table will be locked out. In this case, building the - partitioned index is a metadata only operation. - - diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 309389e20d..5806eeb8ef 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -95,6 +95,11 @@ static char *ChooseIndexName(const char *tabname, Oid namespaceId, bool primary, bool isconstraint); static char *ChooseIndexNameAddition(const List *colnames); static List *ChooseIndexColumnNames(const List *indexElems); +static void DefineIndexConcurrentInternal(Oid relationId, + Oid indexRelationId, + IndexInfo *indexInfo, + LOCKTAG heaplocktag, + LockRelId *heaprelid); static void ReindexIndex(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -561,7 +566,6 @@ DefineIndex(Oid tableId, bool amissummarizing; amoptions_function amoptions; bool partitioned; - bool safe_index; Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; @@ -569,12 +573,10 @@ DefineIndex(Oid tableId, bits16 constr_flags; int numberOfAttributes; int numberOfKeyAttributes; - TransactionId limitXmin; ObjectAddress address; LockRelId heaprelid; LOCKTAG heaplocktag; LOCKMODE lockmode; - Snapshot snapshot; Oid root_save_userid; int root_save_sec_context; int root_save_nestlevel; @@ -706,20 +708,6 @@ DefineIndex(Oid tableId, * partition. */ partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE; - if (partitioned) - { - /* - * Note: we check 'stmt->concurrent' rather than 'concurrent', so that - * the error is thrown also for temporary tables. Seems better to be - * consistent, even though we could do it on temporary table because - * we're not actually doing it concurrently. - */ - if (stmt->concurrent) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create index on partitioned table \"%s\" concurrently", - RelationGetRelationName(rel)))); - } /* * Don't try to CREATE INDEX on temp tables of other backends. @@ -1116,10 +1104,6 @@ DefineIndex(Oid tableId, } } - /* Is index safe for others to ignore? See set_indexsafe_procflags() */ - safe_index = indexInfo->ii_Expressions == NIL && - indexInfo->ii_Predicate == NIL; - /* * Report index creation if appropriate (delay this till after most of the * error checks) @@ -1184,6 +1168,11 @@ DefineIndex(Oid tableId, if (pd->nparts != 0) flags |= INDEX_CREATE_INVALID; } + else if (concurrent && OidIsValid(parentIndexId)) + { + /* If concurrent, initially build index partitions as "invalid" */ + flags |= INDEX_CREATE_INVALID; + } if (stmt->deferrable) constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; @@ -1533,21 +1522,7 @@ DefineIndex(Oid tableId, */ if (invalidate_parent) { - Relation pg_index = table_open(IndexRelationId, RowExclusiveLock); - HeapTuple tup, - newtup; - - tup = SearchSysCache1(INDEXRELID, - ObjectIdGetDatum(indexRelationId)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for index %u", - indexRelationId); - newtup = heap_copytuple(tup); - ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false; - CatalogTupleUpdate(pg_index, &tup->t_self, newtup); - ReleaseSysCache(tup); - table_close(pg_index, RowExclusiveLock); - heap_freetuple(newtup); + index_set_state_flags(indexRelationId, INDEX_DROP_CLEAR_VALID); /* * CCI here to make this update visible, in case this recurses @@ -1559,37 +1534,49 @@ DefineIndex(Oid tableId, /* * Indexes on partitioned tables are not themselves built, so we're - * done here. + * done here in the non-concurrent case. */ - AtEOXact_GUC(false, root_save_nestlevel); - SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - table_close(rel, NoLock); - if (!OidIsValid(parentIndexId)) - pgstat_progress_end_command(); - else + if (!concurrent) { - /* Update progress for an intermediate partitioned index itself */ - pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); - } + AtEOXact_GUC(false, root_save_nestlevel); + SetUserIdAndSecContext(root_save_userid, root_save_sec_context); + table_close(rel, NoLock); - return address; + if (!OidIsValid(parentIndexId)) + pgstat_progress_end_command(); + else + { + /* + * Update progress for an intermediate partitioned index + * itself + */ + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + return address; + } } AtEOXact_GUC(false, root_save_nestlevel); SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - if (!concurrent) + /* + * All done in the non-concurrent case, and when building catalog entries + * of partitions for CIC. + */ + if (!concurrent || OidIsValid(parentIndexId)) { - /* Close the heap and we're done, in the non-concurrent case */ table_close(rel, NoLock); /* * If this is the top-level index, the command is done overall; - * otherwise, increment progress to report one child index is done. + * otherwise (when being called recursively), increment progress to + * report that one child index is done. Except in the concurrent + * (catalog-only) case, which is handled later. */ if (!OidIsValid(parentIndexId)) pgstat_progress_end_command(); - else + else if (!concurrent) pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); return address; @@ -1600,28 +1587,180 @@ DefineIndex(Oid tableId, SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); table_close(rel, NoLock); - /* - * For a concurrent build, it's important to make the catalog entries - * visible to other transactions before we start to build the index. That - * will prevent them from making incompatible HOT updates. The new index - * will be marked not indisready and not indisvalid, so that no one else - * tries to either insert into it or use it for queries. - * - * We must commit our current transaction so that the index becomes - * visible; then start another. Note that all the data structures we just - * built are lost in the commit. The only data we keep past here are the - * relation IDs. - * - * Before committing, get a session-level lock on the table, to ensure - * that neither it nor the index can be dropped before we finish. This - * cannot block, even if someone else is waiting for access, because we - * already have the same lock within our transaction. - * - * Note: we don't currently bother with a session lock on the index, - * because there are no operations that could change its state while we - * hold lock on the parent table. This might need to change later. - */ - LockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); + if (!partitioned) + { + /* + * For a concurrent build, it's important to make the catalog entries + * visible to other transactions before we start to build the index. + * That will prevent them from making incompatible HOT updates. The + * new index will be marked not indisready and not indisvalid, so that + * no one else tries to either insert into it or use it for queries. + * + * DefineIndexConcurrentInternal will commit our current transaction + * so that the index becomes visible; then start another. Note that + * all the data structures we just built are lost in the commit. The + * only data we keep past here are the relation IDs. + * + * Before committing, get a session-level lock on the table, to ensure + * that neither it nor the index can be dropped before we finish. This + * cannot block, even if someone else is waiting for access, because + * we already have the same lock within our transaction. + * + * Note: we don't currently bother with a session lock on the index, + * because there are no operations that could change its state while + * we hold lock on the parent table. This might need to change later. + */ + + LockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); + + /* CREATE INDEX CONCURRENTLY on a nonpartitioned table */ + DefineIndexConcurrentInternal(tableId, indexRelationId, + indexInfo, heaplocktag, &heaprelid); + pgstat_progress_end_command(); + return address; + } + else + { + /* + * For CIC on a partitioned table, finish by building indexes on + * partitions + */ + + ListCell *lc; + ListCell *lc2; + List *childs; + List *part_idxs = NIL; + List *leaf_idxs = NIL; + List *leaf_idx_lockids = NIL; + List *part_idx_lockids = NIL; + MemoryContext cic_context, + old_context; + + /* Create special memory context for cross-transaction storage */ + cic_context = AllocSetContextCreate(PortalContext, + "Create index concurrently", + ALLOCSET_DEFAULT_SIZES); + + old_context = MemoryContextSwitchTo(cic_context); + childs = find_all_inheritors(indexRelationId, ShareUpdateExclusiveLock, NULL); + MemoryContextSwitchTo(old_context); + + foreach(lc, childs) + { + Oid indrelid = lfirst_oid(lc); + char relkind; + LockRelId *lockrelid; + Oid tabrelid; + + /* + * Pre-existing partitions which were ATTACHED were already + * counted in the progress report. + */ + if (get_index_isvalid(indrelid)) + continue; + + tabrelid = IndexGetRelation(indrelid, false); + rel = table_open(tabrelid, ShareUpdateExclusiveLock); + lockrelid = palloc_object(LockRelId); + *lockrelid = rel->rd_lockInfo.lockRelId; + table_close(rel, ShareUpdateExclusiveLock); + + /* + * Split partitions in 2 lists: partitioned tables that just need + * to be marked as valid and leaf tables that actually need to + * have their indexes built. + */ + relkind = get_rel_relkind(indrelid); + if (!RELKIND_HAS_STORAGE(relkind)) + { + /* The toplevel index doesn't count towards "partitions done" */ + if (indrelid != indexRelationId) + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + + /* + * Build up a list of all the intermediate partitioned tables + * which will later need to be set valid. + */ + old_context = MemoryContextSwitchTo(cic_context); + part_idxs = lappend_oid(part_idxs, indrelid); + part_idx_lockids = lappend(part_idx_lockids, lockrelid); + MemoryContextSwitchTo(old_context); + } + else + { + old_context = MemoryContextSwitchTo(cic_context); + + leaf_idxs = lappend_oid(leaf_idxs, indrelid); + leaf_idx_lockids = lappend(leaf_idx_lockids, lockrelid); + + MemoryContextSwitchTo(old_context); + + } + + /* + * All partitions, including top-level parent, need to be locked + * for the session before proceeding with any index rebuilds as + * each of those are done in a separate transaction. After each + * leaf index is built, its corresponding table will be unlocked, + * all the partitioned tables will be unlocked at the very end. + */ + + LockRelationIdForSession(lockrelid, ShareUpdateExclusiveLock); + + } + + forboth(lc, leaf_idxs, lc2, leaf_idx_lockids) + { + Oid indrelid = lfirst_oid(lc); + LockRelId *heaprelid = lfirst(lc2); + Oid tabrelid = IndexGetRelation(indrelid, false); + + SET_LOCKTAG_RELATION(heaplocktag, heaprelid->dbId, heaprelid->relId); + + /* Process each partition in a separate transaction */ + DefineIndexConcurrentInternal(tabrelid, indrelid, indexInfo, + heaplocktag, heaprelid); + + PushActiveSnapshot(GetTransactionSnapshot()); + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + /* Set as valid all partitioned indexes, including the parent */ + foreach(lc, part_idxs) + { + Oid indrelid = lfirst_oid(lc); + + index_set_state_flags(indrelid, INDEX_CREATE_SET_READY); + CommandCounterIncrement(); + index_set_state_flags(indrelid, INDEX_CREATE_SET_VALID); + } + + foreach(lc, part_idx_lockids) + { + LockRelId *lockrelid = lfirst(lc); + + UnlockRelationIdForSession(lockrelid, ShareUpdateExclusiveLock); + } + + + MemoryContextDelete(cic_context); + pgstat_progress_end_command(); + PopActiveSnapshot(); + return address; + } +} + + +static void +DefineIndexConcurrentInternal(Oid tableId, Oid indexRelationId, IndexInfo *indexInfo, + LOCKTAG heaplocktag, LockRelId *heaprelid) +{ + TransactionId limitXmin; + Snapshot snapshot; + + /* Is index safe for others to ignore? See set_indexsafe_procflags() */ + bool safe_index = indexInfo->ii_Expressions == NIL && + indexInfo->ii_Predicate == NIL; PopActiveSnapshot(); CommitTransactionCommand(); @@ -1789,16 +1928,12 @@ DefineIndex(Oid tableId, * would be useful. (Note that our earlier commits did not create reasons * to replan; so relcache flush on the index itself was sufficient.) */ - CacheInvalidateRelcacheByRelid(heaprelid.relId); + CacheInvalidateRelcacheByRelid(heaprelid->relId); /* * Last thing to do is release the session-level lock on the parent table. */ - UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); - - pgstat_progress_end_command(); - - return address; + UnlockRelationIdForSession(heaprelid, ShareUpdateExclusiveLock); } diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index f25723da92..44a6cf39df 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -50,11 +50,130 @@ select relname, relkind, relhassubclass, inhparent::regclass (8 rows) drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); -ERROR: cannot create index on partitioned table "idxpart" concurrently +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +ERROR: could not create unique index "idxpart2_a_idx1" +DETAIL: Key (a)=(10) is duplicated. +\d idxpart + Partitioned table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition key: RANGE (a) +Indexes: + "idxpart_a_idx" btree (a) + "idxpart_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 3 (Use \d+ to list them.) + +\d idxpart1 + Partitioned table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart11 + Partitioned table "public.idxpart11" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart1 FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart11_a_idx" btree (a) + "idxpart11_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart111 + Partitioned table "public.idxpart111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart11 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart111_a_idx" btree (a) + "idxpart111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1111 + Partitioned table "public.idxpart1111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart111 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart1111_a_idx" btree (a) + "idxpart1111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 0 + +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (10) TO (20) +Indexes: + "idxpart2_a_idx" btree (a) + "idxpart2_a_idx1" UNIQUE, btree (a) INVALID + +\d idxpart3 + Partitioned table "public.idxpart3" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (30) TO (40) +Partition key: RANGE (a) +Indexes: + "idxpart3_a_idx" btree (a) + "idxpart3_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart31 + Table "public.idxpart31" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart3 DEFAULT +Indexes: + "idxpart31_a_idx" btree (a) + "idxpart31_a_idx1" UNIQUE, btree (a) INVALID + drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- https://postgr.es/m/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 5f1f4b80c9..38a730d877 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -29,10 +29,30 @@ select relname, relkind, relhassubclass, inhparent::regclass where relname like 'idxpart%' order by relname; drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; + +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +\d idxpart +\d idxpart1 +\d idxpart11 +\d idxpart111 +\d idxpart1111 +\d idxpart2 +\d idxpart3 +\d idxpart31 drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- 2.34.1