From e4d89d0e976283e72b33a48d6a6b7fb5d8fceeca Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 1 May 2023 15:38:29 -0700 Subject: [PATCH v18 5/6] Introduce collation provider "builtin". MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three locales are offered by the builtin provider: C, C.UTF-8, and UCS_BASIC. The builtin "C" locale is equal in semantics and implementation to the libc "C" locale (neither of which actually use libc). The builtin "C.UTF-8" locale offers similar semantics to the libc "C.UTF-8" locale, which is collation according to code point combined with simple Unicode character semantics. Unlike the libc "C.UTF-8" locale, the builtin "C.UTF-8" is available on all platforms with consistent behavior, and benefits from additional optimizations. The builtin "UCS_BASIC" locale offers collation according to code point order and more complete Unicode character semantics. As the SQL standard requires, it offers full case mappings that may increase the length of a string, such as "ß" changing to "SS" when uppercased. 
Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org --- doc/src/sgml/charset.sgml | 88 ++++-- doc/src/sgml/ref/create_collation.sgml | 11 +- doc/src/sgml/ref/create_database.sgml | 8 +- doc/src/sgml/ref/createdb.sgml | 2 +- doc/src/sgml/ref/initdb.sgml | 17 +- src/backend/catalog/pg_collation.c | 5 +- src/backend/commands/collationcmds.c | 93 ++++-- src/backend/commands/dbcommands.c | 121 ++++++-- src/backend/regex/regc_pg_locale.c | 41 ++- src/backend/utils/adt/formatting.c | 133 +++++++++ src/backend/utils/adt/pg_locale.c | 162 +++++++++-- src/backend/utils/init/postinit.c | 46 ++- src/bin/initdb/initdb.c | 58 ++-- src/bin/initdb/t/001_initdb.pl | 57 +++- src/bin/pg_dump/pg_dump.c | 49 ++-- src/bin/pg_upgrade/t/002_pg_upgrade.pl | 70 +++-- src/bin/psql/describe.c | 4 +- src/bin/scripts/createdb.c | 18 +- src/bin/scripts/t/020_createdb.pl | 78 +++++ src/include/catalog/pg_collation.dat | 6 +- src/include/catalog/pg_collation.h | 3 + src/include/utils/pg_locale.h | 9 +- src/test/icu/t/010_database.pl | 22 +- src/test/regress/expected/collate.out | 24 +- src/test/regress/expected/collate.utf8.out | 287 +++++++++++++++++++ src/test/regress/expected/collate.utf8_1.out | 8 + src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/collate.sql | 10 + src/test/regress/sql/collate.utf8.sql | 122 ++++++++ 29 files changed, 1393 insertions(+), 163 deletions(-) create mode 100644 src/test/regress/expected/collate.utf8.out create mode 100644 src/test/regress/expected/collate.utf8_1.out create mode 100644 src/test/regress/sql/collate.utf8.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 4422b0cc92..2abd898115 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -342,22 +342,14 @@ initdb --locale=sv_SE Locale Providers - PostgreSQL supports multiple locale - providers. 
This specifies which library supplies the locale - data. One standard provider name is libc, which uses - the locales provided by the operating system C library. These are the - locales used by most tools provided by the operating system. Another - provider is icu, which uses the external - ICUICU library. ICU locales can - only be used if support for ICU was configured when PostgreSQL was built. + A locale provider specifies which library defines the locale behavior for + collations and character classifications. The commands and tools that select the locale settings, as described - above, each have an option to select the locale provider. The examples - shown earlier all use the libc provider, which is the - default. Here is an example to initialize a database cluster using the - ICU provider: + above, each have an option to select the locale provider. Here is an + example to initialize a database cluster using the ICU provider: initdb --locale-provider=icu --icu-locale=en @@ -370,12 +362,74 @@ initdb --locale-provider=icu --icu-locale=en - Which locale provider to use depends on individual requirements. For most - basic uses, either provider will give adequate results. For the libc - provider, it depends on what the operating system offers; some operating - systems are better than others. For advanced uses, ICU offers more locale - variants and customization options. + Regardless of the locale provider, the operating system is still used to + provide some locale-aware behavior, such as messages (see ). + + + The available locale providers are listed below. + + + + Builtin + + The builtin provider uses built-in operations. Only + the C and C.UTF-8 locales are + supported for this provider. + + + The C locale behavior is identical to the + C locale in the libc provider. When using this locale, + the behavior may depend on the database encoding. 
+ + + The C.UTF-8 locale is available only when the + database encoding is UTF-8, and the behavior is based + on Unicode. The collation uses the code point values only. The regular + expression character classes are based on the "POSIX Compatible" + semantics, and the case mapping is the "simple" variant. + + + + ICU + + The icu provider uses the external + ICUICU + library. PostgreSQL must have been configured + with support. + + + ICU provides collation and character classification behavior that is + independent of the operating system and database encoding, which is + preferable if you expect to transition to other platforms without any + change in results. LC_COLLATE and + LC_CTYPE can be set independently of the ICU locale. + + + + For the ICU provider, results may depend on the version of the ICU + library used, as it is updated to reflect changes in natural language + over time. + + + + + libc + + The libc provider uses the operating system's C + library. The collation and character classification behavior is + controlled by the settings LC_COLLATE and + LC_CTYPE, so they cannot be set independently. + + + + The same locale name may have different behavior on different platforms + when using the libc provider. + + + + diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 5cf9777764..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -96,6 +96,11 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM locale, you cannot specify either of those parameters. + + If provider is builtin, + then locale must be specified and set to + either C or C.UTF-8. + @@ -129,9 +134,9 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM Specifies the provider to use for locale services associated with this - collation.
Possible values are builtin, + icuICU (if + the server was built with ICU support) or libc. libc is the default. See for details. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 72927960eb..1f5cdf1271 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -162,6 +162,12 @@ CREATE DATABASE name linkend="create-database-lc-ctype"/>, or individually. + + If is + builtin, then locale + must be specified and set to either C or + C.UTF-8. + The other locale settings , name Specifies the provider to use for the default collation in this - database. Possible values are + database. Possible values are builtin, icuICU (if the server was built with ICU support) or libc. By default, the provider is the same as that of the - + Specifies the locale provider for the database's default collation. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index cd75cae10e..08a1c2538f 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -286,6 +286,11 @@ PostgreSQL documentation environment that initdb runs in. Locale support is described in . + + If is builtin, + must be specified and set to + C or C.UTF-8. + @@ -314,8 +319,18 @@ PostgreSQL documentation + + + + + Specifies the locale name when the builtin provider is used. Locale support + is described in . 
+ + + + - + This option sets the locale provider for databases created in the new diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index 7bad94f908..01e91000af 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -68,7 +68,10 @@ CollationCreate(const char *collname, Oid collnamespace, Assert(collname); Assert(collnamespace); Assert(collowner); - Assert((collcollate && collctype) || colllocale); + Assert((collprovider == COLLPROVIDER_LIBC && + collcollate && collctype && !colllocale) || + (collprovider != COLLPROVIDER_LIBC && + !collcollate && !collctype && colllocale)); /* * Make sure there is no existing collation of same name & encoding. diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 27564e569a..0fa073496e 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -68,7 +68,7 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e DefElem *versionEl = NULL; char *collcollate; char *collctype; - char *colllocale; + const char *colllocale; char *collicurules; bool collisdeterministic; int collencoding; @@ -215,7 +215,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (collproviderstr) { - if (pg_strcasecmp(collproviderstr, "icu") == 0) + if (pg_strcasecmp(collproviderstr, "builtin") == 0) + collprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(collproviderstr, "icu") == 0) collprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(collproviderstr, "libc") == 0) collprovider = COLLPROVIDER_LIBC; @@ -245,7 +247,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); - if (collprovider == COLLPROVIDER_LIBC) + if (collprovider == COLLPROVIDER_BUILTIN) + { + if (!colllocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("parameter 
\"locale\" must be specified"))); + + colllocale = builtin_validate_locale(GetDatabaseEncoding(), + colllocale); + } + else if (collprovider == COLLPROVIDER_LIBC) { if (!collcollate) ereport(ERROR, @@ -305,7 +317,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU rules cannot be specified unless locale provider is ICU"))); - if (collprovider == COLLPROVIDER_ICU) + if (collprovider == COLLPROVIDER_BUILTIN) + { + /* + * Behavior may be different in different encodings, so set + * collencoding to the current database encoding. No validation is + * required, because the "builtin" provider is compatible with any + * encoding. + */ + collencoding = GetDatabaseEncoding(); + } + else if (collprovider == COLLPROVIDER_ICU) { #ifdef USE_ICU /* @@ -334,7 +356,18 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e } if (!collversion) - collversion = get_collation_actual_version(collprovider, collprovider == COLLPROVIDER_ICU ? colllocale : collcollate); + { + const char *locale; + + if (collprovider == COLLPROVIDER_ICU) + locale = colllocale; + else if (collprovider == COLLPROVIDER_LIBC) + locale = collcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + collversion = get_collation_actual_version(collprovider, locale); + } newoid = CollationCreate(collName, collNamespace, @@ -409,6 +442,7 @@ AlterCollation(AlterCollationStmt *stmt) Form_pg_collation collForm; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; ObjectAddress address; @@ -435,8 +469,20 @@ AlterCollation(AlterCollationStmt *stmt) datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collversion, &isnull); oldversion = isnull ? NULL : TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tup, collForm->collprovider == COLLPROVIDER_ICU ? 
Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + if (collForm->collprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (collForm->collprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(collForm->collprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -500,11 +546,18 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_database) GETSTRUCT(dbtup))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, - provider == COLLPROVIDER_ICU ? - Anum_pg_database_datlocale : Anum_pg_database_datcollate); - - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(dbtup); } @@ -521,11 +574,19 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_collation) GETSTRUCT(colltp))->collprovider; Assert(provider != COLLPROVIDER_DEFAULT); - datum = SysCacheGetAttrNotNull(COLLOID, colltp, - provider == COLLPROVIDER_ICU ? 
- Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(colltp); } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index d1de46e759..d7a21adc5c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -698,6 +698,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DefElem *dtemplate = NULL; DefElem *dencoding = NULL; DefElem *dlocale = NULL; + DefElem *dbuiltinlocale = NULL; DefElem *dcollate = NULL; DefElem *dctype = NULL; DefElem *diculocale = NULL; @@ -713,7 +714,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) const char *dbtemplate = NULL; char *dbcollate = NULL; char *dbctype = NULL; - char *dblocale = NULL; + const char *dblocale = NULL; char *dbicurules = NULL; char dblocprovider = '\0'; char *canonname; @@ -762,6 +763,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errorConflictingDefElem(defel, pstate); dlocale = defel; } + else if (strcmp(defel->defname, "builtin_locale") == 0) + { + if (dbuiltinlocale) + errorConflictingDefElem(defel, pstate); + dbuiltinlocale = defel; + } else if (strcmp(defel->defname, "lc_collate") == 0) { if (dcollate) @@ -897,7 +904,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { dbcollate = defGetString(dlocale); dbctype = defGetString(dlocale); + dblocale = defGetString(dlocale); } + if (dbuiltinlocale && dbuiltinlocale->arg) + dblocale = defGetString(dbuiltinlocale); if (dcollate && dcollate->arg) dbcollate = defGetString(dcollate); if (dctype && dctype->arg) @@ -910,7 
+920,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { char *locproviderstr = defGetString(dlocprovider); - if (pg_strcasecmp(locproviderstr, "icu") == 0) + if (pg_strcasecmp(locproviderstr, "builtin") == 0) + dblocprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(locproviderstr, "icu") == 0) dblocprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(locproviderstr, "libc") == 0) dblocprovider = COLLPROVIDER_LIBC; @@ -1027,14 +1039,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL && dblocprovider == COLLPROVIDER_ICU) - { - if (dlocale && dlocale->arg) - dblocale = defGetString(dlocale); - else - dblocale = src_locale; - } - if (dbicurules == NULL && dblocprovider == COLLPROVIDER_ICU) + if (dblocale == NULL) + dblocale = src_locale; + if (dbicurules == NULL) dbicurules = src_icurules; /* Some encodings are client only */ @@ -1059,6 +1066,27 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + if (dblocprovider == COLLPROVIDER_BUILTIN) + { + /* + * This would happen if template0 uses the libc provider but the new + * database uses builtin. 
+ */ + if (!dblocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("LOCALE must be specified for the builtin provider"))); + + dblocale = builtin_validate_locale(encoding, dblocale); + } + else + { + if (dbuiltinlocale && dbuiltinlocale->arg) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("BUILTIN_LOCALE cannot be specified unless locale provider is builtin"))); + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) @@ -1100,7 +1128,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) } else { - if (dblocale) + if (diculocale && diculocale->arg) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU locale cannot be specified unless locale provider is ICU"))); @@ -1111,6 +1139,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU rules cannot be specified unless locale provider is ICU"))); } + /* for libc, locale comes from datcollate and datctype */ + if (dblocprovider == COLLPROVIDER_LIBC) + dblocale = NULL; + /* * Check that the new encoding and locale settings match the source * database. We insist on this because we simply copy the source data --- @@ -1196,8 +1228,16 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (src_collversion && !dcollversion) { char *actual_versionstr; + const char *locale; - actual_versionstr = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? 
dblocale : dbcollate); + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + actual_versionstr = get_collation_actual_version(dblocprovider, locale); if (!actual_versionstr) ereport(ERROR, (errmsg("template database \"%s\" has a collation version, but no actual collation version could be determined", @@ -1225,7 +1265,18 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * collation version, which is normally only the case for template0. */ if (dbcollversion == NULL) - dbcollversion = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? dblocale : dbcollate); + { + const char *locale; + + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + dbcollversion = get_collation_actual_version(dblocprovider, locale); + } /* Resolve default tablespace for new database */ if (dtablespacename && dtablespacename->arg) @@ -1364,8 +1415,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * block on the unique index, and fail after we commit). */ - Assert((dblocprovider == COLLPROVIDER_ICU && dblocale) || - (dblocprovider != COLLPROVIDER_ICU && !dblocale)); + Assert((dblocprovider != COLLPROVIDER_LIBC && dblocale) || + (dblocprovider == COLLPROVIDER_LIBC && !dblocale)); /* Form tuple */ new_record[Anum_pg_database_oid - 1] = ObjectIdGetDatum(dboid); @@ -2446,6 +2497,7 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) ObjectAddress address; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; @@ -2472,10 +2524,24 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) datum = heap_getattr(tuple, Anum_pg_database_datcollversion, RelationGetDescr(rel), &isnull); oldversion = isnull ? 
NULL : TextDatumGetCString(datum); - datum = heap_getattr(tuple, datForm->datlocprovider == COLLPROVIDER_ICU ? Anum_pg_database_datlocale : Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); - if (isnull) - elog(ERROR, "unexpected null in pg_database"); - newversion = get_collation_actual_version(datForm->datlocprovider, TextDatumGetCString(datum)); + if (datForm->datlocprovider == COLLPROVIDER_ICU) + { + datum = heap_getattr(tuple, Anum_pg_database_datlocale, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else if (datForm->datlocprovider == COLLPROVIDER_LIBC) + { + datum = heap_getattr(tuple, Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(datForm->datlocprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -2660,6 +2726,7 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) HeapTuple tp; char datlocprovider; Datum datum; + char *locale; char *version; tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); @@ -2670,8 +2737,20 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) datlocprovider = ((Form_pg_database) GETSTRUCT(tp))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, tp, datlocprovider == COLLPROVIDER_ICU ? 
Anum_pg_database_datlocale : Anum_pg_database_datcollate); - version = get_collation_actual_version(datlocprovider, TextDatumGetCString(datum)); + if (datlocprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (datlocprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + version = get_collation_actual_version(datlocprovider, locale); ReleaseSysCache(tp); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa..7fe7b9f6d5 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -75,6 +78,8 @@ static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static bool regex_builtin_cclass_posix = false; + /* * Hard-wired character properties for C locale */ @@ -266,7 +271,15 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + { + pg_regex_strategy = PG_REGEX_BUILTIN; + regex_builtin_cclass_posix = pg_regex_locale->info.builtin.properties_posix; + } + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +303,8 @@ 
pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +337,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +371,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +414,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +448,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +482,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +516,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + 
return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +550,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +584,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +619,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_case_simple(c, CaseUpper); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +661,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_case_simple(c, CaseLower); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +827,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +847,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 
829aaa8d0e..e6cbf9e588 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1680,6 +1682,35 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_convert_case(dst, dstsize, src, srclen, CaseLower, + mylocale->info.builtin.casemap_full); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_convert_case(dst, dstsize, src, srclen, CaseLower, + mylocale->info.builtin.casemap_full); + Assert(needed + 1 == dstsize); + } + + result = dst; + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1798,6 +1829,35 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_convert_case(dst, dstsize, src, srclen, CaseUpper, +
mylocale->info.builtin.casemap_full); + Assert(needed + 1 == dstsize); + } + + result = dst; + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1917,6 +1977,79 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + unsigned char *dst; + size_t dstsize = nbytes + 1; + int srcoff = 0; + int dstoff = 0; + CaseKind casekind; + + if (mylocale->info.builtin.titlecase) + casekind = CaseTitle; + else + casekind = CaseUpper; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2; + int u1len = unicode_utf8len(u1); + int u2len; + + if (wasalnum) + u2 = unicode_case_simple(u1, CaseLower); + else + u2 = unicode_case_simple(u1, casekind); + + u2len = unicode_utf8len(u2); + + wasalnum = pg_u_isalnum(u2, mylocale->info.builtin.properties_posix); + + /* + * If we can't fit the necessary bytes and a terminating NUL, + * reallocate buffer to the maximum size we might need, and + * shrink it later. 
+ */ + if (dstoff + u2len + 1 > dstsize) + { + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + dstsize = (nbytes + 1) * sizeof(pg_wchar); + dst = repalloc(dst, dstsize); + } + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + *(dst + dstoff) = '\0'; + dstoff++; + + if (dstsize == dstoff) + { + result = (char *) dst; + } + else + { + /* shrink buffer and store result */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + } + else { if (pg_database_encoding_max_length() > 1) { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 45fe847320..74aaf82566 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1269,7 +1269,19 @@ lookup_collation_cache(Oid collation, bool set_flags) elog(ERROR, "cache lookup failed for collation %u", collation); collform = (Form_pg_collation) GETSTRUCT(tp); - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = true; + cache_entry->ctype_is_c = ((strcmp(colllocale, "C") == 0) || + (strcmp(colllocale, "POSIX") == 0)); + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { Datum datum; const char *collcollate; @@ -1320,16 +1332,30 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + result = 
true; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_COLLATE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_COLLATE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1373,16 +1399,29 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + localeptr = default_locale.info.builtin.locale; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1390,6 +1429,7 @@ lc_ctype_is_c(Oid collation) result = true; else result = false; + return (bool) result; } @@ -1520,10 +1560,10 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) { - if (default_locale.provider == COLLPROVIDER_ICU) - return &default_locale; - else + if (default_locale.provider == COLLPROVIDER_LIBC) return (pg_locale_t) 0; + else + return &default_locale; } cache_entry = lookup_collation_cache(collid, false); @@ -1548,7 +1588,45 @@ pg_newlocale_from_collation(Oid collid) result.provider = collform->collprovider; result.deterministic = collform->collisdeterministic; - if (collform->collprovider == 
COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + bool casemap_full; + bool titlecase; + bool properties_posix; + + const char *locstr; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + locstr = TextDatumGetCString(datum); + + if (strcmp(locstr, "UCS_BASIC") == 0) + { + casemap_full = true; + titlecase = true; + properties_posix = false; + } + else if (strcmp(locstr, "C.UTF-8") == 0) + { + casemap_full = false; + titlecase = false; + properties_posix = true; + } + else if (strcmp(locstr, "C") == 0) + { + casemap_full = false; + titlecase = false; + properties_posix = true; + } + else + elog(ERROR, "unexpected builtin locale: %s", locstr); + + result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, + locstr); + result.info.builtin.casemap_full = casemap_full; + result.info.builtin.titlecase = titlecase; + result.info.builtin.properties_posix = properties_posix; + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { const char *collcollate; const char *collctype pg_attribute_unused(); @@ -1627,6 +1705,7 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); + Assert(collform->collprovider != COLLPROVIDER_BUILTIN); datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); actual_versionstr = get_collation_actual_version(collform->collprovider, @@ -1678,6 +1757,14 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + /* + * All builtin locales (C, C.UTF-8, and UCS_BASIC) collate with memcmp, + * which does not change. (The ctype behavior can change, but the versioning + * does not track that.) 
+ */ + if (collprovider == COLLPROVIDER_BUILTIN) + return NULL; + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -2444,6 +2531,51 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return result; } +/* + * Validate a locale name for the builtin provider and return its canonical + * spelling: "C" (also accepted as "POSIX"), "UCS_BASIC", or "C.UTF-8" (also + * accepted as "C.UTF8"). + * + * Raises an error if the name is not recognized, or if the locale requires + * UTF-8 and "encoding" is some other encoding. + */ +const char * +builtin_validate_locale(int encoding, const char *locale) +{ + const char *canonical_name = NULL; + int required_encoding = -1; + + if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) + { + canonical_name = "C"; + } + else if (strcmp(locale, "UCS_BASIC") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "UCS_BASIC"; + } + else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "C.UTF-8"; + } + + if (!canonical_name) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", + locale))); + + if (required_encoding >= 0 && encoding != required_encoding) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("encoding \"%s\" does not match locale \"%s\"", + pg_encoding_to_char(encoding), locale))); + + return canonical_name; +} + + #ifdef USE_ICU /* diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 154912ecb4..0cfa1ba54f 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -425,7 +425,43 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect strcmp(ctype, "POSIX") == 0) database_ctype_is_c = true; - if (dbform->datlocprovider == COLLPROVIDER_ICU) + if (dbform->datlocprovider == COLLPROVIDER_BUILTIN) + { + bool casemap_full; + bool titlecase; + bool properties_posix; + + datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); + datlocale = TextDatumGetCString(datum); + + if (strcmp(datlocale, "UCS_BASIC") == 0) + { + casemap_full = true; + titlecase = true; + properties_posix = false; + } + else if (strcmp(datlocale, 
"C.UTF-8") == 0) + { + casemap_full = false; + titlecase = false; + properties_posix = true; + } + else if (strcmp(datlocale, "C") == 0) + { + casemap_full = false; + titlecase = false; + properties_posix = true; + } + else + elog(ERROR, "unexpected builtin locale: %s", datlocale); + + default_locale.info.builtin.locale = MemoryContextStrdup( + TopMemoryContext, datlocale); + default_locale.info.builtin.casemap_full = casemap_full; + default_locale.info.builtin.titlecase = titlecase; + default_locale.info.builtin.properties_posix = properties_posix; + } + else if (dbform->datlocprovider == COLLPROVIDER_ICU) { char *icurules; @@ -463,10 +499,16 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect { char *actual_versionstr; char *collversionstr; + char *locale; collversionstr = TextDatumGetCString(datum); - actual_versionstr = get_collation_actual_version(dbform->datlocprovider, dbform->datlocprovider == COLLPROVIDER_ICU ? datlocale : collate); + if (dbform->datlocprovider == COLLPROVIDER_LIBC) + locale = collate; + else + locale = datlocale; + + actual_versionstr = get_collation_actual_version(dbform->datlocprovider, locale); if (!actual_versionstr) /* should not happen */ elog(WARNING, diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 90f793632a..7419c38722 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -146,6 +146,7 @@ static char *lc_time = NULL; static char *lc_messages = NULL; static char locale_provider = COLLPROVIDER_LIBC; static char *datlocale = NULL; +static bool icu_locale_specified = false; static char *icu_rules = NULL; static const char *default_text_search_config = NULL; static char *username = NULL; @@ -2390,14 +2391,13 @@ setlocales(void) lc_messages = canonname; #endif + if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL) + pg_fatal("locale must be specified unless provider is libc"); + if (locale_provider == COLLPROVIDER_ICU) { char *langtag; - /* acquire default 
locale from the environment, if not specified */ - if (datlocale == NULL) - pg_fatal("ICU locale must be specified"); - /* canonicalize to a language tag */ langtag = icu_language_tag(datlocale); printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"), @@ -2442,7 +2442,8 @@ usage(const char *progname) " set default locale in the respective category for\n" " new databases (default taken from environment)\n")); printf(_(" --no-locale equivalent to --locale=C\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --builtin-locale=LOCALE set builtin locale name for new databases\n")); + printf(_(" --locale-provider={builtin|libc|icu}\n" " set default locale provider for new databases\n")); printf(_(" --pwfile=FILE read password for the new superuser from file\n")); printf(_(" -T, --text-search-config=CFG\n" @@ -2593,20 +2594,28 @@ setup_locale_encoding(void) { setlocales(); - if (locale_provider == COLLPROVIDER_LIBC && - strcmp(lc_ctype, lc_collate) == 0 && - strcmp(lc_ctype, lc_time) == 0 && - strcmp(lc_ctype, lc_numeric) == 0 && - strcmp(lc_ctype, lc_monetary) == 0 && - strcmp(lc_ctype, lc_messages) == 0 && - (!datlocale || strcmp(lc_ctype, datlocale) == 0)) + if (locale_provider == COLLPROVIDER_BUILTIN && + strcmp(lc_ctype, "C") == 0 && + strcmp(lc_collate, "C") == 0 && + strcmp(lc_time, "C") == 0 && + strcmp(lc_numeric, "C") == 0 && + strcmp(lc_monetary, "C") == 0 && + strcmp(lc_messages, "C") == 0) + printf(_("The database cluster will be initialized with no locale.\n")); + else if (locale_provider == COLLPROVIDER_LIBC && + strcmp(lc_ctype, lc_collate) == 0 && + strcmp(lc_ctype, lc_time) == 0 && + strcmp(lc_ctype, lc_numeric) == 0 && + strcmp(lc_ctype, lc_monetary) == 0 && + strcmp(lc_ctype, lc_messages) == 0 && + (!datlocale || strcmp(lc_ctype, datlocale) == 0)) printf(_("The database cluster will be initialized with locale \"%s\".\n"), lc_ctype); else { printf(_("The database cluster will be initialized with this locale configuration:\n")); - 
printf(_(" provider: %s\n"), collprovider_name(locale_provider)); - if (datlocale) - printf(_(" ICU locale: %s\n"), datlocale); + printf(_(" default collation provider: %s\n"), collprovider_name(locale_provider)); + if (locale_provider != COLLPROVIDER_LIBC) + printf(_(" default collation locale: %s\n"), datlocale); printf(_(" LC_COLLATE: %s\n" " LC_CTYPE: %s\n" " LC_MESSAGES: %s\n" @@ -3099,9 +3108,10 @@ main(int argc, char *argv[]) {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, - {"icu-locale", required_argument, NULL, 16}, - {"icu-rules", required_argument, NULL, 17}, - {"sync-method", required_argument, NULL, 18}, + {"builtin-locale", required_argument, NULL, 16}, + {"icu-locale", required_argument, NULL, 17}, + {"icu-rules", required_argument, NULL, 18}, + {"sync-method", required_argument, NULL, 19}, {NULL, 0, NULL, 0} }; @@ -3269,7 +3279,9 @@ main(int argc, char *argv[]) "-c debug_discard_caches=1"); break; case 15: - if (strcmp(optarg, "icu") == 0) + if (strcmp(optarg, "builtin") == 0) + locale_provider = COLLPROVIDER_BUILTIN; + else if (strcmp(optarg, "icu") == 0) locale_provider = COLLPROVIDER_ICU; else if (strcmp(optarg, "libc") == 0) locale_provider = COLLPROVIDER_LIBC; @@ -3280,9 +3292,13 @@ main(int argc, char *argv[]) datlocale = pg_strdup(optarg); break; case 17: - icu_rules = pg_strdup(optarg); + datlocale = pg_strdup(optarg); + icu_locale_specified = true; break; case 18: + icu_rules = pg_strdup(optarg); + break; + case 19: if (!parse_sync_method(optarg, &sync_method)) exit(1); break; @@ -3312,7 +3328,7 @@ main(int argc, char *argv[]) exit(1); } - if (datlocale && locale_provider != COLLPROVIDER_ICU) + if (icu_locale_specified && locale_provider != COLLPROVIDER_ICU) pg_fatal("%s cannot be specified unless locale provider \"%s\" is chosen", "--icu-locale", "icu"); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 
03376cc0f7..242f4581a5 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -117,7 +117,7 @@ if ($ENV{with_icu} eq 'yes') { command_fails_like( [ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ], - qr/initdb: error: ICU locale must be specified/, + qr/initdb: error: locale must be specified unless provider is libc/, 'locale provider ICU requires --icu-locale'); command_ok( @@ -138,7 +138,7 @@ if ($ENV{with_icu} eq 'yes') '--lc-monetary=C', '--lc-time=C', "$tempdir/data4" ], - qr/^\s+ICU locale:\s+und\n/ms, + qr/^\s+default collation locale:\s+und\n/ms, 'options --locale-provider=icu --locale=und --lc-*=C'); command_fails_like( @@ -184,6 +184,59 @@ else 'locale provider ICU fails since no ICU support'); } +command_fails( + [ 'initdb', '--no-sync', '--locale-provider=builtin', "$tempdir/data6" ], + 'locale provider builtin fails without --locale'); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--locale=C', + "$tempdir/data7" + ], + 'locale provider builtin with --locale'); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E UTF-8', + '--builtin-locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.UTF-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII' +); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--lc-ctype=C', + '--locale=C', "$tempdir/data10" + ], + 'locale provider builtin with --lc-ctype'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--icu-locale=en', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU locale'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--icu-rules=""', + "$tempdir/dataX" + ], + 'fails for locale 
provider builtin with ICU rules'); + command_fails( [ 'initdb', '--no-sync', '--locale-provider=xyz', "$tempdir/dataX" ], 'fails for invalid locale provider'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index a67b4b8225..450c322205 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -3114,7 +3114,9 @@ dumpDatabase(Archive *fout) } appendPQExpBufferStr(creaQry, " LOCALE_PROVIDER = "); - if (datlocprovider[0] == 'c') + if (datlocprovider[0] == 'b') + appendPQExpBufferStr(creaQry, "builtin"); + else if (datlocprovider[0] == 'c') appendPQExpBufferStr(creaQry, "libc"); else if (datlocprovider[0] == 'i') appendPQExpBufferStr(creaQry, "icu"); @@ -3122,27 +3124,33 @@ dumpDatabase(Archive *fout) pg_fatal("unrecognized locale provider: %s", datlocprovider); - if (strlen(collate) > 0 && strcmp(collate, ctype) == 0) + if (!locale && datlocprovider[0] != 'c') + pg_log_warning("database '%s' with provider '%s' missing datlocale", + datname, datlocprovider); + + if (locale && datlocprovider[0] == 'c') + pg_log_warning("database '%s' with provider 'c' has non-NULL locale '%s'", + datname, locale); + + /* if collate and ctype are equal, and locale is NULL, use LOCALE */ + if (!locale && strlen(collate) > 0 && strcmp(collate, ctype) == 0) + locale = collate; + + /* output LC_COLLATE and LC_CTYPE if different from LOCALE */ + if (strlen(collate) > 0 && (!locale || strcmp(collate, locale) != 0)) { - appendPQExpBufferStr(creaQry, " LOCALE = "); + appendPQExpBufferStr(creaQry, " LC_COLLATE = "); appendStringLiteralAH(creaQry, collate, fout); } - else + if (strlen(ctype) > 0 && (!locale || strcmp(ctype, locale) != 0)) { - if (strlen(collate) > 0) - { - appendPQExpBufferStr(creaQry, " LC_COLLATE = "); - appendStringLiteralAH(creaQry, collate, fout); - } - if (strlen(ctype) > 0) - { - appendPQExpBufferStr(creaQry, " LC_CTYPE = "); - appendStringLiteralAH(creaQry, ctype, fout); - } + appendPQExpBufferStr(creaQry, " LC_CTYPE = "); + 
appendStringLiteralAH(creaQry, ctype, fout); } + if (locale) { - appendPQExpBufferStr(creaQry, " ICU_LOCALE = "); + appendPQExpBufferStr(creaQry, " LOCALE = "); appendStringLiteralAH(creaQry, locale, fout); } @@ -13870,7 +13878,9 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) fmtQualifiedDumpable(collinfo)); appendPQExpBufferStr(q, "provider = "); - if (collprovider[0] == 'c') + if (collprovider[0] == 'b') + appendPQExpBufferStr(q, "builtin"); + else if (collprovider[0] == 'c') appendPQExpBufferStr(q, "libc"); else if (collprovider[0] == 'i') appendPQExpBufferStr(q, "icu"); @@ -13891,6 +13901,14 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) /* no locale -- the default collation cannot be reloaded anyway */ } + else if (collprovider[0] == 'b') + { + if (collcollate || collctype || !colllocale || collicurules) + pg_log_warning("invalid collation \"%s\"", qcollname); + + appendPQExpBufferStr(q, ", locale = "); + appendStringLiteralAH(q, colllocale ? colllocale : "", fout); + } else if (collprovider[0] == 'i') { if (fout->remoteVersion >= 150000) diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index 41d06d272b..94bf086ba8 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -110,13 +110,16 @@ my $oldversion = int($oldnode->pg_version =~ s/([0-9]*).*/$1/rg); # can test that pg_upgrade copies the locale settings of template0 # from the old to the new cluster. 
-my $original_encoding = "6"; # UTF-8 -my $original_provider = "c"; -my $original_locale = "C"; -my $original_datlocale = ""; -my $provider_field = "'c' AS datlocprovider"; -my $datlocale_field = "NULL AS datlocale"; -if ($oldversion >= 15 && $ENV{with_icu} eq 'yes') +my %encoding_number = ('UTF-8' => 6, 'SQL_ASCII' => 0); +my $provider_field; +my $datlocale_field; +my $original_encoding; +my $original_provider; +my $original_datcollate = "C"; +my $original_datctype = "C"; +my $original_datlocale; + +if ($oldversion >= 15) { $provider_field = "datlocprovider"; if ($oldversion >= 17) @@ -127,18 +130,52 @@ if ($oldversion >= 15 && $ENV{with_icu} eq 'yes') { $datlocale_field = "daticulocale AS datlocale"; } +} +else +{ + $provider_field = "'c' AS datlocprovider"; + $datlocale_field = "NULL AS datlocale"; +} + +if ($oldversion >= 17) +{ + $original_encoding = "UTF-8"; + $original_provider = "b"; + $original_datlocale = "C.UTF-8"; +} +elsif ($oldversion >= 15 && $ENV{with_icu} eq 'yes') +{ + $original_encoding = "UTF-8"; $original_provider = "i"; $original_datlocale = "fr-CA"; } +else +{ + $original_encoding = "SQL_ASCII"; + $original_provider = "c"; + $original_datlocale = ""; +} my @initdb_params = @custom_opts; -push @initdb_params, ('--encoding', 'UTF-8'); -push @initdb_params, ('--locale', $original_locale); -if ($original_provider eq "i") +push @initdb_params, ('--encoding', $original_encoding); +push @initdb_params, ('--lc-collate', $original_datcollate); +push @initdb_params, ('--lc-ctype', $original_datctype); + +# add --locale-provider, if supported +my %provider_name = ('b' => 'builtin', 'i' => 'icu', 'c' => 'libc'); +if ($oldnode->pg_version >= 15) { - push @initdb_params, ('--locale-provider', 'icu'); - push @initdb_params, ('--icu-locale', 'fr-CA'); + push @initdb_params, + ('--locale-provider', $provider_name{$original_provider}); + if ($original_provider eq 'b') + { + push @initdb_params, ('--builtin-locale', $original_datlocale); + } + elsif 
($original_provider eq 'i') + { + push @initdb_params, ('--icu-locale', $original_datlocale); + } } $node_params{extra} = \@initdb_params; @@ -151,7 +188,7 @@ $result = $oldnode->safe_psql( "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_datlocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check locales in original cluster"); # The default location of the source code is the root of this directory. @@ -327,7 +364,8 @@ if (defined($ENV{oldinstall})) } # Create an invalid database, will be deleted below -$oldnode->safe_psql('postgres', qq( +$oldnode->safe_psql( + 'postgres', qq( CREATE DATABASE regression_invalid; UPDATE pg_database SET datconnlimit = -2 WHERE datname = 'regression_invalid'; )); @@ -370,7 +408,7 @@ command_checks_all( $mode, '--check', ], 1, - [qr/invalid/], # pg_upgrade prints errors on stdout :( + [qr/invalid/], # pg_upgrade prints errors on stdout :( [qr//], 'invalid database causes failure'); rmtree($newnode->data_dir . "/pg_upgrade_output.d"); @@ -434,7 +472,7 @@ $result = $newnode->safe_psql( "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_datlocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check that locales in new cluster match original cluster"); # Second dump from the upgraded instance. 
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index b943569050..c649477505 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -926,7 +926,7 @@ listAllDbs(const char *pattern, bool verbose) gettext_noop("Encoding")); if (pset.sversion >= 150000) appendPQExpBuffer(&buf, - " CASE d.datlocprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE d.datlocprovider WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Locale Provider")); else appendPQExpBuffer(&buf, @@ -4974,7 +4974,7 @@ listCollations(const char *pattern, bool verbose, bool showSystem) if (pset.sversion >= 100000) appendPQExpBuffer(&buf, - " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Provider")); else appendPQExpBuffer(&buf, diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 14970a6a5f..4af4b98181 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -40,8 +40,9 @@ main(int argc, char *argv[]) {"locale", required_argument, NULL, 'l'}, {"maintenance-db", required_argument, NULL, 3}, {"locale-provider", required_argument, NULL, 4}, - {"icu-locale", required_argument, NULL, 5}, - {"icu-rules", required_argument, NULL, 6}, + {"builtin-locale", required_argument, NULL, 5}, + {"icu-locale", required_argument, NULL, 6}, + {"icu-rules", required_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -67,6 +68,7 @@ main(int argc, char *argv[]) char *lc_ctype = NULL; char *locale = NULL; char *locale_provider = NULL; + char *builtin_locale = NULL; char *icu_locale = NULL; char *icu_rules = NULL; @@ -134,9 +136,12 @@ main(int argc, char *argv[]) locale_provider = pg_strdup(optarg); break; case 5: - icu_locale = pg_strdup(optarg); + builtin_locale = pg_strdup(optarg); break; case 
6: + icu_locale = pg_strdup(optarg); + break; + case 7: icu_rules = pg_strdup(optarg); break; default: @@ -216,6 +221,11 @@ main(int argc, char *argv[]) appendPQExpBufferStr(&sql, " LOCALE "); appendStringLiteralConn(&sql, locale, conn); } + if (builtin_locale) + { + appendPQExpBufferStr(&sql, " BUILTIN_LOCALE "); + appendStringLiteralConn(&sql, builtin_locale, conn); + } if (lc_collate) { appendPQExpBufferStr(&sql, " LC_COLLATE "); @@ -296,7 +306,7 @@ help(const char *progname) printf(_(" --lc-ctype=LOCALE LC_CTYPE setting for the database\n")); printf(_(" --icu-locale=LOCALE ICU locale setting for the database\n")); printf(_(" --icu-rules=RULES ICU rules setting for the database\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " locale provider for the database's default collation\n")); printf(_(" -O, --owner=OWNER database user to own the new database\n")); printf(_(" -S, --strategy=STRATEGY database creation strategy wal_log or file_copy\n")); diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 37e47b0078..3ba623f9d1 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -105,6 +105,84 @@ else 'create database with ICU fails since no ICU support'); } +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + 'tbuiltin1' + ], + 'create database with provider "builtin" fails without --locale'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', 'tbuiltin2' + ], + 'create database with provider "builtin" and locale "C"'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-collate=C', + 'tbuiltin3' + ], + 'create database with provider "builtin" and LC_COLLATE=C'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-ctype=C', + 
'tbuiltin4' + ], + 'create database with provider "builtin" and LC_CTYPE=C'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E UTF-8', '--builtin-locale=C.UTF8', + 'tbuiltin5' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.UTF-8', + 'tbuiltin6' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-locale=en', + 'tbuiltin7' + ], + 'create database with provider "builtin" and ICU_LOCALE="en"'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-rules=""', + 'tbuiltin8' + ], + 'create database with provider "builtin" and ICU_RULES=""'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template1', '--locale-provider=builtin', + '--locale=C', 'tbuiltin9' + ], + 'create database with provider "builtin" not matching template'); + $node->command_fails([ 'createdb', 'foobar1' ], 'fails if database already exists'); diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 7396ff10c4..938432e8a4 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -23,9 +23,9 @@ descr => 'standard POSIX collation', collname => 'POSIX', collprovider => 'c', collencoding => '-1', collcollate => 'POSIX', collctype => 'POSIX' }, -{ oid => '962', descr => 'sorts by Unicode code point', - collname => 'ucs_basic', collprovider => 'c', collencoding => '6', - collcollate => 'C', collctype => 'C' }, +{ oid => '962', descr => 'sorts by Unicode code point, C character semantics', + collname => 'ucs_basic', collprovider => 'b', collencoding => '6', + colllocale => 'C' }, { oid => '963', descr => 'sorts using the 
Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h index a3e196fb53..5ce289d74b 100644 --- a/src/include/catalog/pg_collation.h +++ b/src/include/catalog/pg_collation.h @@ -68,6 +68,7 @@ MAKE_SYSCACHE(COLLOID, pg_collation_oid_index, 8); #ifdef EXPOSE_TO_CLIENT_CODE #define COLLPROVIDER_DEFAULT 'd' +#define COLLPROVIDER_BUILTIN 'b' #define COLLPROVIDER_ICU 'i' #define COLLPROVIDER_LIBC 'c' @@ -76,6 +77,8 @@ collprovider_name(char c) { switch (c) { + case COLLPROVIDER_BUILTIN: + return "builtin"; case COLLPROVIDER_ICU: return "icu"; case COLLPROVIDER_LIBC: diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 28c925b5af..4dfcb99872 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -76,6 +76,13 @@ struct pg_locale_struct bool deterministic; union { + struct + { + const char *locale; + bool casemap_full; + bool titlecase; + bool properties_posix; + } builtin; locale_t lt; #ifdef USE_ICU struct @@ -112,7 +119,7 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); - +extern const char *builtin_validate_locale(int encoding, const char *loc_str); extern void icu_validate_locale(const char *loc_str); extern char *icu_language_tag(const char *loc_str, int elevel); diff --git a/src/test/icu/t/010_database.pl b/src/test/icu/t/010_database.pl index 8a1fc12ec6..5f8ef16803 100644 --- a/src/test/icu/t/010_database.pl +++ b/src/test/icu/t/010_database.pl @@ -27,9 +27,8 @@ CREATE TABLE icu (def text, en text COLLATE "en-x-icu", upfirst text COLLATE upp INSERT INTO icu VALUES ('a', 'a', 'a'), ('b', 'b', 'b'), ('A', 'A', 'A'), ('B', 'B', 'B'); }); -is( $node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT 
NULL}), - qq(t), - 'ICU unicode version defined'); +is($node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT NULL}), + qq(t), 'ICU unicode version defined'); is( $node1->safe_psql('dbicu', q{SELECT def FROM icu ORDER BY def}), qq(A @@ -63,14 +62,13 @@ is( $node1->psql( 0, "C locale works for ICU"); -# Test that LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE -# are specified -is( $node1->psql( - 'postgres', - q{CREATE DATABASE dbicu2 LOCALE_PROVIDER icu LOCALE '@colStrength=primary' - LC_COLLATE='C' LC_CTYPE='C' TEMPLATE template0 ENCODING UTF8} - ), - 0, - "LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE are specified"); +my ($ret, $stdout, $stderr) = $node1->psql('postgres', + q{CREATE DATABASE dbicu LOCALE_PROVIDER builtin LOCALE 'C' TEMPLATE dbicu} +); +isnt($ret, 0, "locale provider must match template: exit code not 0"); +like( + $stderr, + qr/ERROR: new locale provider \(builtin\) does not match locale provider of the template database \(icu\)/, + "locale provider must match template: error message"); done_testing(); diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index 0649564485..ece4a8e99d 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -650,6 +650,26 @@ EXPLAIN (COSTS OFF) (3 rows) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + b +----- + ABD + Abc + abc + bbc +(4 rows) + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +ERROR: invalid locale name "en_US" for builtin provider +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION 
builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails +ERROR: conflicting or redundant options +DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported @@ -754,7 +774,7 @@ DETAIL: FROM cannot be specified together with any other options. -- must get rid of them. -- DROP SCHEMA collate_tests CASCADE; -NOTICE: drop cascades to 19 other objects +NOTICE: drop cascades to 21 other objects DETAIL: drop cascades to table collate_test1 drop cascades to table collate_test_like drop cascades to table collate_test2 @@ -771,6 +791,8 @@ drop cascades to function dup(anyelement) drop cascades to table collate_test20 drop cascades to table collate_test21 drop cascades to table collate_test22 +drop cascades to collation builtin_c +drop cascades to collation builtin_posix drop cascades to collation mycoll2 drop cascades to table collate_test23 drop cascades to view collate_on_int diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..58085f300c --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,287 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test builtin UCS_BASIC locale. 
+-- +CREATE COLLATION BUILTIN_UCS_BASIC ( provider = builtin, locale = 'UCS_BASIC' ); +CREATE TABLE builtin_test1 ( + t TEXT COLLATE BUILTIN_UCS_BASIC +); +INSERT INTO builtin_test1 VALUES + ('abc DEF'), + ('ábc sßs DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test1; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +----------------+----------------+----------------+----------------+---------+---------------+-----------------+--------------- + abc DEF | abc def | Abc Def | ABC DEF | 7 | 7 | 7 | 7 + ábc sßs DÉF | ábc sßs déf | Ábc Sßs Déf | ÁBC SSSS DÉF | 14 | 14 | 14 | 14 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE builtin_test1; +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 + lower +------- + ας +(1 row) + +SELECT lower('ΑΣ0' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 0030 + lower +------- + ας0 +(1 row) + +SELECT lower('ἈΣ̓' COLLATE BUILTIN_UCS_BASIC); -- 0391 0343 03A3 0343 + lower +------- + ἀς̓ +(1 row) + +SELECT lower('ᾼΣͅ' COLLATE BUILTIN_UCS_BASIC); -- 0391 0345 03A3 0345 + lower +------- + ᾳςͅ +(1 row) + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE BUILTIN_UCS_BASIC); -- 03A3 + lower +------- + σ +(1 row) + +SELECT lower('0Σ' COLLATE BUILTIN_UCS_BASIC); -- 0030 03A3 + lower +------- + 0σ +(1 row) + +SELECT lower('ΑΣΑ' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 0391 + lower +------- + ασα +(1 row) + +SELECT lower('ἈΣ̓Α' COLLATE BUILTIN_UCS_BASIC); -- 0391 0343 03A3 0343 0391 + lower 
+------- + ἀσ̓α +(1 row) + +SELECT lower('ᾼΣͅΑ' COLLATE BUILTIN_UCS_BASIC); -- 0391 0345 03A3 0345 0391 + lower +------- + ᾳσͅα +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT '=' !~ '[[:punct:]]' COLLATE BUILTIN_UCS_BASIC; -- symbols are not punctuation + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT '൧' ~ '\d' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_UCS_BASIC; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_UCS_BASIC; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + +DROP COLLATION BUILTIN_UCS_BASIC; +-- +-- Test builtin C.UTF-8 locale. 
+-- +CREATE COLLATION BUILTIN_C_UTF8 ( provider = builtin, locale = 'C.UTF-8' ); +CREATE TABLE builtin_test2 ( + t TEXT COLLATE BUILTIN_C_UTF8 +); +INSERT INTO builtin_test2 VALUES + ('abc DEF'), + ('ábc sßs DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test2; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +----------------+----------------+----------------+----------------+---------+---------------+-----------------+--------------- + abc DEF | abc def | Abc Def | ABC DEF | 7 | 7 | 7 | 7 + ábc sßs DÉF | ábc sßs déf | Ábc Sßs Déf | ÁBC SßS DÉF | 14 | 14 | 14 | 14 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE builtin_test2; +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE BUILTIN_C_UTF8); + lower +------- + ασ +(1 row) + +SELECT lower('ΑͺΣͺ' COLLATE BUILTIN_C_UTF8); + lower +------- + αͺσͺ +(1 row) + +SELECT lower('Α΄Σ΄' COLLATE BUILTIN_C_UTF8); + lower +------- + α΄σ΄ +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '=' ~ '[[:punct:]]' COLLATE BUILTIN_C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_C_UTF8; + ?column? 
+---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE BUILTIN_C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + +DROP COLLATION BUILTIN_C_UTF8; diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1d8a414eea..e48cb4b7a3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index c3d40fc195..01d5c69fe4 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -244,6 +244,16 @@ EXPLAIN (COSTS OFF) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); + +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails + CREATE COLLATION mycoll1 FROM "C"; CREATE 
COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..747fc63bf4 --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,122 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test builtin UCS_BASIC locale. +-- + +CREATE COLLATION BUILTIN_UCS_BASIC ( provider = builtin, locale = 'UCS_BASIC' ); + +CREATE TABLE builtin_test1 ( + t TEXT COLLATE BUILTIN_UCS_BASIC +); +INSERT INTO builtin_test1 VALUES + ('abc DEF'), + ('ábc sßs DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test1; + +DROP TABLE builtin_test1; + +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 +SELECT lower('ΑΣ0' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 0030 +SELECT lower('ἈΣ̓' COLLATE BUILTIN_UCS_BASIC); -- 0391 0343 03A3 0343 +SELECT lower('ᾼΣͅ' COLLATE BUILTIN_UCS_BASIC); -- 0391 0345 03A3 0345 + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE BUILTIN_UCS_BASIC); -- 03A3 +SELECT lower('0Σ' COLLATE BUILTIN_UCS_BASIC); -- 0030 03A3 +SELECT lower('ΑΣΑ' COLLATE BUILTIN_UCS_BASIC); -- 0391 03A3 0391 +SELECT lower('ἈΣ̓Α' COLLATE BUILTIN_UCS_BASIC); -- 0391 0343 03A3 0343 0391 +SELECT lower('ᾼΣͅΑ' COLLATE BUILTIN_UCS_BASIC); -- 0391 0345 03A3 0345 0391 + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE 
BUILTIN_UCS_BASIC; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_UCS_BASIC; +SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_UCS_BASIC; +SELECT '=' !~ '[[:punct:]]' COLLATE BUILTIN_UCS_BASIC; -- symbols are not punctuation +SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_UCS_BASIC; +SELECT '൧' ~ '\d' COLLATE BUILTIN_UCS_BASIC; + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_UCS_BASIC; +SELECT 'xAb' ~* '[W-Y]' COLLATE BUILTIN_UCS_BASIC; +SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_UCS_BASIC; +SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_UCS_BASIC; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_UCS_BASIC; -- same as above with cases reversed + +DROP COLLATION BUILTIN_UCS_BASIC; + +-- +-- Test builtin C.UTF-8 locale. +-- + +CREATE COLLATION BUILTIN_C_UTF8 ( provider = builtin, locale = 'C.UTF-8' ); + +CREATE TABLE builtin_test2 ( + t TEXT COLLATE BUILTIN_C_UTF8 +); +INSERT INTO builtin_test2 VALUES + ('abc DEF'), + ('ábc sßs DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test2; + +DROP TABLE builtin_test2; + +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE BUILTIN_C_UTF8); +SELECT lower('ΑͺΣͺ' COLLATE BUILTIN_C_UTF8); +SELECT lower('Α΄Σ΄' COLLATE BUILTIN_C_UTF8); + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE BUILTIN_C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_C_UTF8; +SELECT '=' ~ '[[:punct:]]' COLLATE BUILTIN_C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_C_UTF8; +SELECT '൧' !~ '\d' COLLATE BUILTIN_C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_C_UTF8; +SELECT 'xAb' 
~* '[W-Y]' COLLATE BUILTIN_C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_C_UTF8; +SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_C_UTF8; -- same as above with cases reversed + +DROP COLLATION BUILTIN_C_UTF8; -- 2.34.1