From befdcd94fe45bf9ef6fbecdc6859b8f837502360 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 1 May 2023 15:38:29 -0700 Subject: [PATCH v17 4/4] Introduce collation provider "builtin" for "C" and "C.UTF-8". The builtin "C" locale is equal (in semantics and implementation) to the libc "C" locale. The builtin "C.UTF-8" locale is especially useful. It provides a fast memcmp-based collation (like "C") that supports abbreviated keys, while also providing richer ctype semantics (upper/lower and regexes). The semantics are derived from Unicode by building in lookup tables in the same way as for text normalization. By using built-in semantics, the behavior is stabilized within a Postgres major version, and also matches the behavior of other built-in Unicode functionality, such as normalization. Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org --- doc/src/sgml/charset.sgml | 88 +++++++-- doc/src/sgml/ref/create_collation.sgml | 11 +- doc/src/sgml/ref/create_database.sgml | 8 +- doc/src/sgml/ref/createdb.sgml | 2 +- doc/src/sgml/ref/initdb.sgml | 17 +- src/backend/catalog/pg_collation.c | 5 +- src/backend/commands/collationcmds.c | 93 ++++++++-- src/backend/commands/dbcommands.c | 121 +++++++++--- src/backend/regex/regc_pg_locale.c | 41 +++- src/backend/utils/adt/formatting.c | 185 +++++++++++++++++++ src/backend/utils/adt/pg_locale.c | 130 +++++++++++-- src/backend/utils/init/postinit.c | 19 +- src/bin/initdb/initdb.c | 58 +++--- src/bin/initdb/t/001_initdb.pl | 57 +++++- src/bin/pg_dump/pg_dump.c | 49 +++-- src/bin/pg_upgrade/t/002_pg_upgrade.pl | 70 +++++-- src/bin/psql/describe.c | 4 +- src/bin/scripts/createdb.c | 18 +- src/bin/scripts/t/020_createdb.pl | 78 ++++++++ src/common/wchar.c | 4 +- src/include/catalog/pg_collation.dat | 9 +- src/include/catalog/pg_collation.h | 5 +- src/include/mb/pg_wchar.h | 15 ++ src/include/utils/pg_locale.h | 7 +- 
src/test/icu/t/010_database.pl | 22 +-- src/test/regress/expected/collate.out | 24 ++- src/test/regress/expected/collate.utf8.out | 109 +++++++++++ src/test/regress/expected/collate.utf8_1.out | 8 + src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/collate.sql | 10 + src/test/regress/sql/collate.utf8.sql | 54 ++++++ 31 files changed, 1159 insertions(+), 166 deletions(-) create mode 100644 src/test/regress/expected/collate.utf8.out create mode 100644 src/test/regress/expected/collate.utf8_1.out create mode 100644 src/test/regress/sql/collate.utf8.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 74783d148f..1553deea20 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -342,22 +342,14 @@ initdb --locale=sv_SE Locale Providers - PostgreSQL supports multiple locale - providers. This specifies which library supplies the locale - data. One standard provider name is libc, which uses - the locales provided by the operating system C library. These are the - locales used by most tools provided by the operating system. Another - provider is icu, which uses the external - ICUICU library. ICU locales can - only be used if support for ICU was configured when PostgreSQL was built. + A locale provider specifies which library defines the locale behavior for + collations and character classifications. The commands and tools that select the locale settings, as described - above, each have an option to select the locale provider. The examples - shown earlier all use the libc provider, which is the - default. Here is an example to initialize a database cluster using the - ICU provider: + above, each have an option to select the locale provider. Here is an + example to initialize a database cluster using the ICU provider: initdb --locale-provider=icu --icu-locale=en @@ -370,12 +362,74 @@ initdb --locale-provider=icu --icu-locale=en - Which locale provider to use depends on individual requirements. 
For most - basic uses, either provider will give adequate results. For the libc - provider, it depends on what the operating system offers; some operating - systems are better than others. For advanced uses, ICU offers more locale - variants and customization options. + Regardless of the locale provider, the operating system is still used to + provide some locale-aware behavior, such as messages (see ). + + + The available locale providers are listed below. + + + + Builtin + + The builtin provider uses built-in operations. Only + the C and C.UTF-8 locales are + supported for this provider. + + + The C locale behavior is identical to the + C locale in the libc provider. When using this locale, + the behavior may depend on the database encoding. + + + The C.UTF-8 locale is available only when the + database encoding is UTF-8, and the behavior is based + on Unicode. The collation uses the code point values only. The regular + expression character classes are based on the "POSIX Compatible" + semantics, and the case mapping is the "simple" variant. + + + + ICU + + The icu provider uses the external + ICUICU + library. PostgreSQL must have been configured + with support. + + + ICU provides collation and character classification behavior that is + independent of the operating system and database encoding, which is + preferable if you expect to transition to other platforms without any + change in results. LC_COLLATE and + LC_CTYPE can be set independently of the ICU locale. + + + + For the ICU provider, results may depend on the version of the ICU + library used, as it is updated to reflect changes in natural language + over time. + + + + + libc + + The libc provider uses the operating system's C + library. The collation and character classification behavior is + controlled by the settings LC_COLLATE and + LC_CTYPE, so they cannot be set independently. + + + + The same locale name may have different behavior on different platforms + when using the libc provider. 
+ + + + diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 5cf9777764..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -96,6 +96,11 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM locale, you cannot specify either of those parameters. + + If provider is builtin, + then locale must be specified and set to + either C or C.UTF-8. + @@ -129,9 +134,9 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM Specifies the provider to use for locale services associated with this - collation. Possible values are - icuICU - (if the server was built with ICU support) or libc. + collation. Possible values are builtin, + icuICU (if + the server was built with ICU support) or libc. libc is the default. See for details. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 72927960eb..1f5cdf1271 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -162,6 +162,12 @@ CREATE DATABASE name linkend="create-database-lc-ctype"/>, or individually. + + If is + builtin, then locale + must be specified and set to either C or + C.UTF-8. + The other locale settings , name Specifies the provider to use for the default collation in this - database. Possible values are + database. Possible values are builtin, icuICU (if the server was built with ICU support) or libc. By default, the provider is the same as that of the - + Specifies the locale provider for the database's default collation. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index cd75cae10e..08a1c2538f 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -286,6 +286,11 @@ PostgreSQL documentation environment that initdb runs in. Locale support is described in . + + If is builtin, + must be specified and set to + C or C.UTF-8. 
+ @@ -314,8 +319,18 @@ PostgreSQL documentation + + + + + Specifies the locale name when the builtin provider is used. Locale support + is described in . + + + + - + This option sets the locale provider for databases created in the new diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index 7bad94f908..01e91000af 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -68,7 +68,10 @@ CollationCreate(const char *collname, Oid collnamespace, Assert(collname); Assert(collnamespace); Assert(collowner); - Assert((collcollate && collctype) || colllocale); + Assert((collprovider == COLLPROVIDER_LIBC && + collcollate && collctype && !colllocale) || + (collprovider != COLLPROVIDER_LIBC && + !collcollate && !collctype && colllocale)); /* * Make sure there is no existing collation of same name & encoding. diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 27564e569a..0fa073496e 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -68,7 +68,7 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e DefElem *versionEl = NULL; char *collcollate; char *collctype; - char *colllocale; + const char *colllocale; char *collicurules; bool collisdeterministic; int collencoding; @@ -215,7 +215,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (collproviderstr) { - if (pg_strcasecmp(collproviderstr, "icu") == 0) + if (pg_strcasecmp(collproviderstr, "builtin") == 0) + collprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(collproviderstr, "icu") == 0) collprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(collproviderstr, "libc") == 0) collprovider = COLLPROVIDER_LIBC; @@ -245,7 +247,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); - if (collprovider == COLLPROVIDER_LIBC) + 
if (collprovider == COLLPROVIDER_BUILTIN) + { + if (!colllocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("parameter \"locale\" must be specified"))); + + colllocale = builtin_validate_locale(GetDatabaseEncoding(), + colllocale); + } + else if (collprovider == COLLPROVIDER_LIBC) { if (!collcollate) ereport(ERROR, @@ -305,7 +317,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU rules cannot be specified unless locale provider is ICU"))); - if (collprovider == COLLPROVIDER_ICU) + if (collprovider == COLLPROVIDER_BUILTIN) + { + /* + * Behavior may be different in different encodings, so set + * collencoding to the current database encoding. No validation is + * required, because the "builtin" provider is compatible with any + * encoding. + */ + collencoding = GetDatabaseEncoding(); + } + else if (collprovider == COLLPROVIDER_ICU) { #ifdef USE_ICU /* @@ -334,7 +356,18 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e } if (!collversion) - collversion = get_collation_actual_version(collprovider, collprovider == COLLPROVIDER_ICU ? colllocale : collcollate); + { + const char *locale; + + if (collprovider == COLLPROVIDER_ICU) + locale = colllocale; + else if (collprovider == COLLPROVIDER_LIBC) + locale = collcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + collversion = get_collation_actual_version(collprovider, locale); + } newoid = CollationCreate(collName, collNamespace, @@ -409,6 +442,7 @@ AlterCollation(AlterCollationStmt *stmt) Form_pg_collation collForm; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; ObjectAddress address; @@ -435,8 +469,20 @@ AlterCollation(AlterCollationStmt *stmt) datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collversion, &isnull); oldversion = isnull ? 
NULL : TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tup, collForm->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + if (collForm->collprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (collForm->collprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(collForm->collprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -500,11 +546,18 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_database) GETSTRUCT(dbtup))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, - provider == COLLPROVIDER_ICU ? - Anum_pg_database_datlocale : Anum_pg_database_datcollate); - - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(dbtup); } @@ -521,11 +574,19 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_collation) GETSTRUCT(colltp))->collprovider; Assert(provider != COLLPROVIDER_DEFAULT); - datum = SysCacheGetAttrNotNull(COLLOID, colltp, - provider == COLLPROVIDER_ICU ? 
- Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(colltp); } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index d1de46e759..d7a21adc5c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -698,6 +698,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DefElem *dtemplate = NULL; DefElem *dencoding = NULL; DefElem *dlocale = NULL; + DefElem *dbuiltinlocale = NULL; DefElem *dcollate = NULL; DefElem *dctype = NULL; DefElem *diculocale = NULL; @@ -713,7 +714,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) const char *dbtemplate = NULL; char *dbcollate = NULL; char *dbctype = NULL; - char *dblocale = NULL; + const char *dblocale = NULL; char *dbicurules = NULL; char dblocprovider = '\0'; char *canonname; @@ -762,6 +763,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errorConflictingDefElem(defel, pstate); dlocale = defel; } + else if (strcmp(defel->defname, "builtin_locale") == 0) + { + if (dbuiltinlocale) + errorConflictingDefElem(defel, pstate); + dbuiltinlocale = defel; + } else if (strcmp(defel->defname, "lc_collate") == 0) { if (dcollate) @@ -897,7 +904,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { dbcollate = defGetString(dlocale); dbctype = defGetString(dlocale); + dblocale = defGetString(dlocale); } + if (dbuiltinlocale && dbuiltinlocale->arg) + dblocale = defGetString(dbuiltinlocale); if (dcollate && dcollate->arg) dbcollate = defGetString(dcollate); if (dctype && dctype->arg) @@ -910,7 
+920,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { char *locproviderstr = defGetString(dlocprovider); - if (pg_strcasecmp(locproviderstr, "icu") == 0) + if (pg_strcasecmp(locproviderstr, "builtin") == 0) + dblocprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(locproviderstr, "icu") == 0) dblocprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(locproviderstr, "libc") == 0) dblocprovider = COLLPROVIDER_LIBC; @@ -1027,14 +1039,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL && dblocprovider == COLLPROVIDER_ICU) - { - if (dlocale && dlocale->arg) - dblocale = defGetString(dlocale); - else - dblocale = src_locale; - } - if (dbicurules == NULL && dblocprovider == COLLPROVIDER_ICU) + if (dblocale == NULL) + dblocale = src_locale; + if (dbicurules == NULL) dbicurules = src_icurules; /* Some encodings are client only */ @@ -1059,6 +1066,27 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + if (dblocprovider == COLLPROVIDER_BUILTIN) + { + /* + * This would happen if template0 uses the libc provider but the new + * database uses builtin. 
+ */ + if (!dblocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("LOCALE must be specified for the builtin provider"))); + + dblocale = builtin_validate_locale(encoding, dblocale); + } + else + { + if (dbuiltinlocale && dbuiltinlocale->arg) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("BUILTIN_LOCALE cannot be specified unless locale provider is builtin"))); + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) @@ -1100,7 +1128,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) } else { - if (dblocale) + if (diculocale && diculocale->arg) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU locale cannot be specified unless locale provider is ICU"))); @@ -1111,6 +1139,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU rules cannot be specified unless locale provider is ICU"))); } + /* for libc, locale comes from datcollate and datctype */ + if (dblocprovider == COLLPROVIDER_LIBC) + dblocale = NULL; + /* * Check that the new encoding and locale settings match the source * database. We insist on this because we simply copy the source data --- @@ -1196,8 +1228,16 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (src_collversion && !dcollversion) { char *actual_versionstr; + const char *locale; - actual_versionstr = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? 
dblocale : dbcollate); + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + actual_versionstr = get_collation_actual_version(dblocprovider, locale); if (!actual_versionstr) ereport(ERROR, (errmsg("template database \"%s\" has a collation version, but no actual collation version could be determined", @@ -1225,7 +1265,18 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * collation version, which is normally only the case for template0. */ if (dbcollversion == NULL) - dbcollversion = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? dblocale : dbcollate); + { + const char *locale; + + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + dbcollversion = get_collation_actual_version(dblocprovider, locale); + } /* Resolve default tablespace for new database */ if (dtablespacename && dtablespacename->arg) @@ -1364,8 +1415,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * block on the unique index, and fail after we commit). */ - Assert((dblocprovider == COLLPROVIDER_ICU && dblocale) || - (dblocprovider != COLLPROVIDER_ICU && !dblocale)); + Assert((dblocprovider != COLLPROVIDER_LIBC && dblocale) || + (dblocprovider == COLLPROVIDER_LIBC && !dblocale)); /* Form tuple */ new_record[Anum_pg_database_oid - 1] = ObjectIdGetDatum(dboid); @@ -2446,6 +2497,7 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) ObjectAddress address; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; @@ -2472,10 +2524,24 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) datum = heap_getattr(tuple, Anum_pg_database_datcollversion, RelationGetDescr(rel), &isnull); oldversion = isnull ? 
NULL : TextDatumGetCString(datum); - datum = heap_getattr(tuple, datForm->datlocprovider == COLLPROVIDER_ICU ? Anum_pg_database_datlocale : Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); - if (isnull) - elog(ERROR, "unexpected null in pg_database"); - newversion = get_collation_actual_version(datForm->datlocprovider, TextDatumGetCString(datum)); + if (datForm->datlocprovider == COLLPROVIDER_ICU) + { + datum = heap_getattr(tuple, Anum_pg_database_datlocale, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else if (datForm->datlocprovider == COLLPROVIDER_LIBC) + { + datum = heap_getattr(tuple, Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(datForm->datlocprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -2660,6 +2726,7 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) HeapTuple tp; char datlocprovider; Datum datum; + char *locale; char *version; tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); @@ -2670,8 +2737,20 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) datlocprovider = ((Form_pg_database) GETSTRUCT(tp))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, tp, datlocprovider == COLLPROVIDER_ICU ? 
Anum_pg_database_datlocale : Anum_pg_database_datcollate); - version = get_collation_actual_version(datlocprovider, TextDatumGetCString(datum)); + if (datlocprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (datlocprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + version = get_collation_actual_version(datlocprovider, locale); ReleaseSysCache(tp); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa..3cb8678298 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -75,6 +78,8 @@ static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static bool regex_builtin_cclass_posix = false; + /* * Hard-wired character properties for C locale */ @@ -266,7 +271,15 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + { + pg_regex_strategy = PG_REGEX_BUILTIN; + regex_builtin_cclass_posix = pg_regex_locale->info.builtin.cclass_posix; + } + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +303,8 @@ 
pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +337,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +371,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +414,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +448,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +482,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +516,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + 
return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +550,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +584,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +619,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +661,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +827,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +847,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 829aaa8d0e..9895368ff8 
100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1680,6 +1682,64 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + unsigned char *dst; + size_t dstsize = nbytes + 1; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2 = unicode_lowercase_simple(u1); + int u1len = unicode_utf8len(u1); + int u2len = unicode_utf8len(u2); + + /* + * If we can't fit the necessary bytes and a terminating NUL, + * reallocate buffer to the maximum size we might need, and + * shrink it later. 
+ */ + if (dstoff + u2len + 1 > dstsize) + { + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + dstsize = (nbytes + 1) * sizeof(pg_wchar); + dst = repalloc(dst, dstsize); + } + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + *(dst + dstoff) = '\0'; + dstoff++; + + if (dstsize == dstoff) + { + result = (char *) dst; + } + else + { + /* shrink buffer and store result */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1798,6 +1858,64 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + unsigned char *dst; + size_t dstsize = nbytes + 1; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2 = unicode_uppercase_simple(u1); + int u1len = unicode_utf8len(u1); + int u2len = unicode_utf8len(u2); + + /* + * If we can't fit the necessary bytes and a terminating NUL, + * reallocate buffer to the maximum size we might need, and + * shrink it later. 
+ */ + if (dstoff + u2len + 1 > dstsize) + { + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + dstsize = (nbytes + 1) * sizeof(pg_wchar); + dst = repalloc(dst, dstsize); + } + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + *(dst + dstoff) = '\0'; + dstoff++; + + if (dstsize == dstoff) + { + result = (char *) dst; + } + else + { + /* shrink buffer and store result */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1917,6 +2035,73 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + unsigned char *dst; + size_t dstsize = nbytes + 1; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2; + int u1len = unicode_utf8len(u1); + int u2len; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_uppercase_simple(u1); + + u2len = unicode_utf8len(u2); + + wasalnum = pg_u_isalnum(u2, mylocale->info.builtin.cclass_posix); + + /* + * If we can't fit the necessary bytes and a terminating NUL, + * reallocate buffer to the maximum size we might need, and + * shrink it later. 
+ */ + if (dstoff + u2len + 1 > dstsize) + { + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + dstsize = (nbytes + 1) * sizeof(pg_wchar); + dst = repalloc(dst, dstsize); + } + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + *(dst + dstoff) = '\0'; + dstoff++; + + if (dstsize == dstoff) + { + result = (char *) dst; + } + else + { + /* shrink buffer and store result */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + } + else { if (pg_database_encoding_max_length() > 1) { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 45fe847320..850af2daf4 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1269,7 +1269,19 @@ lookup_collation_cache(Oid collation, bool set_flags) elog(ERROR, "cache lookup failed for collation %u", collation); collform = (Form_pg_collation) GETSTRUCT(tp); - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = true; + cache_entry->ctype_is_c = ((strcmp(colllocale, "C") == 0) || + (strcmp(colllocale, "POSIX") == 0)); + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { Datum datum; const char *collcollate; @@ -1320,16 +1332,30 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + result = 
true; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_COLLATE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_COLLATE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1373,16 +1399,29 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + localeptr = default_locale.info.builtin.locale; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1390,6 +1429,7 @@ lc_ctype_is_c(Oid collation) result = true; else result = false; + return (bool) result; } @@ -1520,10 +1560,10 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) { - if (default_locale.provider == COLLPROVIDER_ICU) - return &default_locale; - else + if (default_locale.provider == COLLPROVIDER_LIBC) return (pg_locale_t) 0; + else + return &default_locale; } cache_entry = lookup_collation_cache(collid, false); @@ -1548,7 +1588,18 @@ pg_newlocale_from_collation(Oid collid) result.provider = collform->collprovider; result.deterministic = collform->collisdeterministic; - if (collform->collprovider == 
COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + const char *locstr; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + locstr = TextDatumGetCString(datum); + + result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, + locstr); + result.info.builtin.cclass_posix = true; + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { const char *collcollate; const char *collctype pg_attribute_unused(); @@ -1627,6 +1678,7 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); + Assert(collform->collprovider != COLLPROVIDER_BUILTIN); datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); actual_versionstr = get_collation_actual_version(collform->collprovider, @@ -1678,6 +1730,14 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + /* + * The only two supported locales (C and C.UTF-8) are both based on memcmp + * and do not change. (The ctype behavior can change, but the versioning + * does not track that.) 
+ */ + if (collprovider == COLLPROVIDER_BUILTIN) + return NULL; + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -2444,6 +2504,38 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return result; } +const char * +builtin_validate_locale(int encoding, const char *locale) +{ + const char *canonical_name = NULL; + int required_encoding = -1; + + if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) + { + canonical_name = "C"; + } + else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "C.UTF-8"; + } + + if (!canonical_name) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", + locale))); + + if (required_encoding >= 0 && encoding != required_encoding) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("encoding \"%s\" does not match locale \"%s\"", + pg_encoding_to_char(encoding), locale))); + + return canonical_name; +} + + #ifdef USE_ICU /* diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 9818077d51..575ba0281d 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -423,7 +423,16 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect strcmp(ctype, "POSIX") == 0) database_ctype_is_c = true; - if (dbform->datlocprovider == COLLPROVIDER_ICU) + if (dbform->datlocprovider == COLLPROVIDER_BUILTIN) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); + datlocale = TextDatumGetCString(datum); + + default_locale.info.builtin.locale = MemoryContextStrdup( + TopMemoryContext, datlocale); + default_locale.info.builtin.cclass_posix = true; + } + else if (dbform->datlocprovider == COLLPROVIDER_ICU) { char *icurules; @@ -461,10 +470,16 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect { char *actual_versionstr; char 
*collversionstr; + char *locale; collversionstr = TextDatumGetCString(datum); - actual_versionstr = get_collation_actual_version(dbform->datlocprovider, dbform->datlocprovider == COLLPROVIDER_ICU ? datlocale : collate); + if (dbform->datlocprovider == COLLPROVIDER_LIBC) + locale = collate; + else + locale = datlocale; + + actual_versionstr = get_collation_actual_version(dbform->datlocprovider, locale); if (!actual_versionstr) /* should not happen */ elog(WARNING, diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 90f793632a..7419c38722 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -146,6 +146,7 @@ static char *lc_time = NULL; static char *lc_messages = NULL; static char locale_provider = COLLPROVIDER_LIBC; static char *datlocale = NULL; +static bool icu_locale_specified = false; static char *icu_rules = NULL; static const char *default_text_search_config = NULL; static char *username = NULL; @@ -2390,14 +2391,13 @@ setlocales(void) lc_messages = canonname; #endif + if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL) + pg_fatal("locale must be specified unless provider is libc"); + if (locale_provider == COLLPROVIDER_ICU) { char *langtag; - /* acquire default locale from the environment, if not specified */ - if (datlocale == NULL) - pg_fatal("ICU locale must be specified"); - /* canonicalize to a language tag */ langtag = icu_language_tag(datlocale); printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"), @@ -2442,7 +2442,8 @@ usage(const char *progname) " set default locale in the respective category for\n" " new databases (default taken from environment)\n")); printf(_(" --no-locale equivalent to --locale=C\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --builtin-locale=LOCALE set builtin locale name for new databases\n")); + printf(_(" --locale-provider={builtin|libc|icu}\n" " set default locale provider for new databases\n")); printf(_(" --pwfile=FILE read password for the new 
superuser from file\n")); printf(_(" -T, --text-search-config=CFG\n" @@ -2593,20 +2594,28 @@ setup_locale_encoding(void) { setlocales(); - if (locale_provider == COLLPROVIDER_LIBC && - strcmp(lc_ctype, lc_collate) == 0 && - strcmp(lc_ctype, lc_time) == 0 && - strcmp(lc_ctype, lc_numeric) == 0 && - strcmp(lc_ctype, lc_monetary) == 0 && - strcmp(lc_ctype, lc_messages) == 0 && - (!datlocale || strcmp(lc_ctype, datlocale) == 0)) + if (locale_provider == COLLPROVIDER_BUILTIN && + strcmp(lc_ctype, "C") == 0 && + strcmp(lc_collate, "C") == 0 && + strcmp(lc_time, "C") == 0 && + strcmp(lc_numeric, "C") == 0 && + strcmp(lc_monetary, "C") == 0 && + strcmp(lc_messages, "C") == 0) + printf(_("The database cluster will be initialized with no locale.\n")); + else if (locale_provider == COLLPROVIDER_LIBC && + strcmp(lc_ctype, lc_collate) == 0 && + strcmp(lc_ctype, lc_time) == 0 && + strcmp(lc_ctype, lc_numeric) == 0 && + strcmp(lc_ctype, lc_monetary) == 0 && + strcmp(lc_ctype, lc_messages) == 0 && + (!datlocale || strcmp(lc_ctype, datlocale) == 0)) printf(_("The database cluster will be initialized with locale \"%s\".\n"), lc_ctype); else { printf(_("The database cluster will be initialized with this locale configuration:\n")); - printf(_(" provider: %s\n"), collprovider_name(locale_provider)); - if (datlocale) - printf(_(" ICU locale: %s\n"), datlocale); + printf(_(" default collation provider: %s\n"), collprovider_name(locale_provider)); + if (locale_provider != COLLPROVIDER_LIBC) + printf(_(" default collation locale: %s\n"), datlocale); printf(_(" LC_COLLATE: %s\n" " LC_CTYPE: %s\n" " LC_MESSAGES: %s\n" @@ -3099,9 +3108,10 @@ main(int argc, char *argv[]) {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, - {"icu-locale", required_argument, NULL, 16}, - {"icu-rules", required_argument, NULL, 17}, - {"sync-method", required_argument, NULL, 18}, + {"builtin-locale", required_argument, NULL, 
16}, + {"icu-locale", required_argument, NULL, 17}, + {"icu-rules", required_argument, NULL, 18}, + {"sync-method", required_argument, NULL, 19}, {NULL, 0, NULL, 0} }; @@ -3269,7 +3279,9 @@ main(int argc, char *argv[]) "-c debug_discard_caches=1"); break; case 15: - if (strcmp(optarg, "icu") == 0) + if (strcmp(optarg, "builtin") == 0) + locale_provider = COLLPROVIDER_BUILTIN; + else if (strcmp(optarg, "icu") == 0) locale_provider = COLLPROVIDER_ICU; else if (strcmp(optarg, "libc") == 0) locale_provider = COLLPROVIDER_LIBC; @@ -3280,9 +3292,13 @@ main(int argc, char *argv[]) datlocale = pg_strdup(optarg); break; case 17: - icu_rules = pg_strdup(optarg); + datlocale = pg_strdup(optarg); + icu_locale_specified = true; break; case 18: + icu_rules = pg_strdup(optarg); + break; + case 19: if (!parse_sync_method(optarg, &sync_method)) exit(1); break; @@ -3312,7 +3328,7 @@ main(int argc, char *argv[]) exit(1); } - if (datlocale && locale_provider != COLLPROVIDER_ICU) + if (icu_locale_specified && locale_provider != COLLPROVIDER_ICU) pg_fatal("%s cannot be specified unless locale provider \"%s\" is chosen", "--icu-locale", "icu"); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 03376cc0f7..242f4581a5 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -117,7 +117,7 @@ if ($ENV{with_icu} eq 'yes') { command_fails_like( [ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ], - qr/initdb: error: ICU locale must be specified/, + qr/initdb: error: locale must be specified unless provider is libc/, 'locale provider ICU requires --icu-locale'); command_ok( @@ -138,7 +138,7 @@ if ($ENV{with_icu} eq 'yes') '--lc-monetary=C', '--lc-time=C', "$tempdir/data4" ], - qr/^\s+ICU locale:\s+und\n/ms, + qr/^\s+default collation locale:\s+und\n/ms, 'options --locale-provider=icu --locale=und --lc-*=C'); command_fails_like( @@ -184,6 +184,59 @@ else 'locale provider ICU fails since no ICU support'); } 
+command_fails( + [ 'initdb', '--no-sync', '--locale-provider=builtin', "$tempdir/data6" ], + 'locale provider builtin fails without --locale'); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--locale=C', + "$tempdir/data7" + ], + 'locale provider builtin with --locale'); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E UTF-8', + '--builtin-locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.UTF-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII' +); + +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--lc-ctype=C', + '--locale=C', "$tempdir/data10" + ], + 'locale provider builtin with --lc-ctype'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--icu-locale=en', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU locale'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '--icu-rules=""', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU rules'); + command_fails( [ 'initdb', '--no-sync', '--locale-provider=xyz', "$tempdir/dataX" ], 'fails for invalid locale provider'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 303d1ff4a8..f16436d179 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -3112,7 +3112,9 @@ dumpDatabase(Archive *fout) } appendPQExpBufferStr(creaQry, " LOCALE_PROVIDER = "); - if (datlocprovider[0] == 'c') + if (datlocprovider[0] == 'b') + appendPQExpBufferStr(creaQry, "builtin"); + else if (datlocprovider[0] == 'c') appendPQExpBufferStr(creaQry, "libc"); else if (datlocprovider[0] == 'i') appendPQExpBufferStr(creaQry, "icu"); @@ -3120,27 +3122,33 @@ dumpDatabase(Archive *fout) pg_fatal("unrecognized 
locale provider: %s", datlocprovider); - if (strlen(collate) > 0 && strcmp(collate, ctype) == 0) + if (!locale && datlocprovider[0] != 'c') + pg_log_warning("database '%s' with provider '%s' missing datlocale", + datname, datlocprovider); + + if (locale && datlocprovider[0] == 'c') + pg_log_warning("database '%s' with provider 'c' has non-NULL locale '%s'", + datname, locale); + + /* if collate and ctype are equal, and locale is NULL, use LOCALE */ + if (!locale && strlen(collate) > 0 && strcmp(collate, ctype) == 0) + locale = collate; + + /* output LC_COLLATE and LC_CTYPE if different from LOCALE */ + if (strlen(collate) > 0 && (!locale || strcmp(collate, locale) != 0)) { - appendPQExpBufferStr(creaQry, " LOCALE = "); + appendPQExpBufferStr(creaQry, " LC_COLLATE = "); appendStringLiteralAH(creaQry, collate, fout); } - else + if (strlen(ctype) > 0 && (!locale || strcmp(ctype, locale) != 0)) { - if (strlen(collate) > 0) - { - appendPQExpBufferStr(creaQry, " LC_COLLATE = "); - appendStringLiteralAH(creaQry, collate, fout); - } - if (strlen(ctype) > 0) - { - appendPQExpBufferStr(creaQry, " LC_CTYPE = "); - appendStringLiteralAH(creaQry, ctype, fout); - } + appendPQExpBufferStr(creaQry, " LC_CTYPE = "); + appendStringLiteralAH(creaQry, ctype, fout); } + if (locale) { - appendPQExpBufferStr(creaQry, " ICU_LOCALE = "); + appendPQExpBufferStr(creaQry, " LOCALE = "); appendStringLiteralAH(creaQry, locale, fout); } @@ -13868,7 +13876,9 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) fmtQualifiedDumpable(collinfo)); appendPQExpBufferStr(q, "provider = "); - if (collprovider[0] == 'c') + if (collprovider[0] == 'b') + appendPQExpBufferStr(q, "builtin"); + else if (collprovider[0] == 'c') appendPQExpBufferStr(q, "libc"); else if (collprovider[0] == 'i') appendPQExpBufferStr(q, "icu"); @@ -13889,6 +13899,13 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) /* no locale -- the default collation cannot be reloaded anyway */ } + else if (collprovider[0] == 'b') + 
{ + if (collcollate || collctype || !colllocale || collicurules) + pg_log_warning("invalid collation \"%s\"", qcollname); + appendPQExpBufferStr(q, ", locale = "); + appendStringLiteralAH(q, colllocale ? colllocale : "", fout); + } else if (collprovider[0] == 'i') { if (fout->remoteVersion >= 150000) diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index 41d06d272b..94bf086ba8 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -110,13 +110,16 @@ my $oldversion = int($oldnode->pg_version =~ s/([0-9]*).*/$1/rg); # can test that pg_upgrade copies the locale settings of template0 # from the old to the new cluster. -my $original_encoding = "6"; # UTF-8 -my $original_provider = "c"; -my $original_locale = "C"; -my $original_datlocale = ""; -my $provider_field = "'c' AS datlocprovider"; -my $datlocale_field = "NULL AS datlocale"; -if ($oldversion >= 15 && $ENV{with_icu} eq 'yes') +my %encoding_number = ('UTF-8' => 6, 'SQL_ASCII' => 0); +my $provider_field; +my $datlocale_field; +my $original_encoding; +my $original_provider; +my $original_datcollate = "C"; +my $original_datctype = "C"; +my $original_datlocale; + +if ($oldversion >= 15) { $provider_field = "datlocprovider"; if ($oldversion >= 17) @@ -127,18 +130,52 @@ if ($oldversion >= 15 && $ENV{with_icu} eq 'yes') { $datlocale_field = "daticulocale AS datlocale"; } +} +else +{ + $provider_field = "'c' AS datlocprovider"; + $datlocale_field = "NULL AS datlocale"; +} + +if ($oldversion >= 17) +{ + $original_encoding = "UTF-8"; + $original_provider = "b"; + $original_datlocale = "C.UTF-8"; +} +elsif ($oldversion >= 15 && $ENV{with_icu} eq 'yes') +{ + $original_encoding = "UTF-8"; $original_provider = "i"; $original_datlocale = "fr-CA"; } +else +{ + $original_encoding = "SQL_ASCII"; + $original_provider = "c"; + $original_datlocale = ""; +} my @initdb_params = @custom_opts; -push @initdb_params, ('--encoding', 'UTF-8'); -push @initdb_params, ('--locale', $original_locale); -if 
($original_provider eq "i") +push @initdb_params, ('--encoding', $original_encoding); +push @initdb_params, ('--lc-collate', $original_datcollate); +push @initdb_params, ('--lc-ctype', $original_datctype); + +# add --locale-provider, if supported +my %provider_name = ('b' => 'builtin', 'i' => 'icu', 'c' => 'libc'); +if ($oldnode->pg_version >= 15) { - push @initdb_params, ('--locale-provider', 'icu'); - push @initdb_params, ('--icu-locale', 'fr-CA'); + push @initdb_params, + ('--locale-provider', $provider_name{$original_provider}); + if ($original_provider eq 'b') + { + push @initdb_params, ('--builtin-locale', $original_datlocale); + } + elsif ($original_provider eq 'i') + { + push @initdb_params, ('--icu-locale', $original_datlocale); + } } $node_params{extra} = \@initdb_params; @@ -151,7 +188,7 @@ $result = $oldnode->safe_psql( "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_datlocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check locales in original cluster"); # The default location of the source code is the root of this directory. @@ -327,7 +364,8 @@ if (defined($ENV{oldinstall})) } # Create an invalid database, will be deleted below -$oldnode->safe_psql('postgres', qq( +$oldnode->safe_psql( + 'postgres', qq( CREATE DATABASE regression_invalid; UPDATE pg_database SET datconnlimit = -2 WHERE datname = 'regression_invalid'; )); @@ -370,7 +408,7 @@ command_checks_all( $mode, '--check', ], 1, - [qr/invalid/], # pg_upgrade prints errors on stdout :( + [qr/invalid/], # pg_upgrade prints errors on stdout :( [qr//], 'invalid database causes failure'); rmtree($newnode->data_dir . 
"/pg_upgrade_output.d"); @@ -434,7 +472,7 @@ $result = $newnode->safe_psql( "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_datlocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check that locales in new cluster match original cluster"); # Second dump from the upgraded instance. diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index b943569050..c649477505 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -926,7 +926,7 @@ listAllDbs(const char *pattern, bool verbose) gettext_noop("Encoding")); if (pset.sversion >= 150000) appendPQExpBuffer(&buf, - " CASE d.datlocprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE d.datlocprovider WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Locale Provider")); else appendPQExpBuffer(&buf, @@ -4974,7 +4974,7 @@ listCollations(const char *pattern, bool verbose, bool showSystem) if (pset.sversion >= 100000) appendPQExpBuffer(&buf, - " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Provider")); else appendPQExpBuffer(&buf, diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 14970a6a5f..4af4b98181 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -40,8 +40,9 @@ main(int argc, char *argv[]) {"locale", required_argument, NULL, 'l'}, {"maintenance-db", required_argument, NULL, 3}, {"locale-provider", required_argument, NULL, 4}, - {"icu-locale", required_argument, NULL, 5}, - {"icu-rules", required_argument, NULL, 6}, + 
{"builtin-locale", required_argument, NULL, 5}, + {"icu-locale", required_argument, NULL, 6}, + {"icu-rules", required_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -67,6 +68,7 @@ main(int argc, char *argv[]) char *lc_ctype = NULL; char *locale = NULL; char *locale_provider = NULL; + char *builtin_locale = NULL; char *icu_locale = NULL; char *icu_rules = NULL; @@ -134,9 +136,12 @@ main(int argc, char *argv[]) locale_provider = pg_strdup(optarg); break; case 5: - icu_locale = pg_strdup(optarg); + builtin_locale = pg_strdup(optarg); break; case 6: + icu_locale = pg_strdup(optarg); + break; + case 7: icu_rules = pg_strdup(optarg); break; default: @@ -216,6 +221,11 @@ main(int argc, char *argv[]) appendPQExpBufferStr(&sql, " LOCALE "); appendStringLiteralConn(&sql, locale, conn); } + if (builtin_locale) + { + appendPQExpBufferStr(&sql, " BUILTIN_LOCALE "); + appendStringLiteralConn(&sql, builtin_locale, conn); + } if (lc_collate) { appendPQExpBufferStr(&sql, " LC_COLLATE "); @@ -296,7 +306,7 @@ help(const char *progname) printf(_(" --lc-ctype=LOCALE LC_CTYPE setting for the database\n")); printf(_(" --icu-locale=LOCALE ICU locale setting for the database\n")); printf(_(" --icu-rules=RULES ICU rules setting for the database\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " locale provider for the database's default collation\n")); printf(_(" -O, --owner=OWNER database user to own the new database\n")); printf(_(" -S, --strategy=STRATEGY database creation strategy wal_log or file_copy\n")); diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 37e47b0078..3ba623f9d1 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -105,6 +105,84 @@ else 'create database with ICU fails since no ICU support'); } +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + 'tbuiltin1' + ], + 'create database with provider 
"builtin" fails without --locale'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', 'tbuiltin2' + ], + 'create database with provider "builtin" and locale "C"'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-collate=C', + 'tbuiltin3' + ], + 'create database with provider "builtin" and LC_COLLATE=C'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-ctype=C', + 'tbuiltin4' + ], + 'create database with provider "builtin" and LC_CTYPE=C'); + +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E UTF-8', '--builtin-locale=C.UTF8', + 'tbuiltin5' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.UTF-8', + 'tbuiltin6' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-locale=en', + 'tbuiltin7' + ], + 'create database with provider "builtin" and ICU_LOCALE="en"'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-rules=""', + 'tbuiltin8' + ], + 'create database with provider "builtin" and ICU_RULES=""'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template1', '--locale-provider=builtin', + '--locale=C', 'tbuiltin9' + ], + 'create database with provider "builtin" not matching template'); + $node->command_fails([ 'createdb', 'foobar1' ], 'fails if database already exists'); diff --git a/src/common/wchar.c b/src/common/wchar.c index 7c5ce5ca08..95e6b499e3 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -477,8 +477,8 @@ pg_utf2wchar_with_len(const unsigned char *from, 
pg_wchar *to, int len) /* - * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of - * space allocated. + * Map a Unicode code point to UTF-8. utf8string must have at least + * unicode_utf8len(c) bytes available. */ unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string) diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 7396ff10c4..1e439e6975 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -23,12 +23,15 @@ descr => 'standard POSIX collation', collname => 'POSIX', collprovider => 'c', collencoding => '-1', collcollate => 'POSIX', collctype => 'POSIX' }, -{ oid => '962', descr => 'sorts by Unicode code point', - collname => 'ucs_basic', collprovider => 'c', collencoding => '6', - collcollate => 'C', collctype => 'C' }, +{ oid => '962', descr => 'sorts by Unicode code point, C character semantics', + collname => 'ucs_basic', collprovider => 'b', collencoding => '6', + colllocale => 'C' }, { oid => '963', descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '970', descr => 'sorts by Unicode code point; Unicode & POSIX character semantics', + collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', + colllocale => 'C.UTF8' }, ] diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h index 85cb09c4f8..5ce289d74b 100644 --- a/src/include/catalog/pg_collation.h +++ b/src/include/catalog/pg_collation.h @@ -42,7 +42,7 @@ CATALOG(pg_collation,3456,CollationRelationId) #ifdef CATALOG_VARLEN /* variable-length fields start here */ text collcollate BKI_DEFAULT(_null_); /* LC_COLLATE setting */ text collctype BKI_DEFAULT(_null_); /* LC_CTYPE setting */ - text colllocale BKI_DEFAULT(_null_); /* locale ID */ + text colllocale BKI_DEFAULT(_null_); /* locale ID */ text collicurules BKI_DEFAULT(_null_); 
/* ICU collation rules */ text collversion BKI_DEFAULT(_null_); /* provider-dependent * version of collation @@ -68,6 +68,7 @@ MAKE_SYSCACHE(COLLOID, pg_collation_oid_index, 8); #ifdef EXPOSE_TO_CLIENT_CODE #define COLLPROVIDER_DEFAULT 'd' +#define COLLPROVIDER_BUILTIN 'b' #define COLLPROVIDER_ICU 'i' #define COLLPROVIDER_LIBC 'c' @@ -76,6 +77,8 @@ collprovider_name(char c) { switch (c) { + case COLLPROVIDER_BUILTIN: + return "builtin"; case COLLPROVIDER_ICU: return "icu"; case COLLPROVIDER_LIBC: diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 1d521bea24..fbd9e58ed3 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -562,6 +562,21 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); } +/* + * Number of bytes needed to represent the given char in UTF8. + */ +static inline int +unicode_utf8len(pg_wchar c) +{ + if (c <= 0x7F) + return 1; + else if (c <= 0x7FF) + return 2; + else if (c <= 0xFFFF) + return 3; + else + return 4; +} /* * The functions in this list are exported by libpq, and we need to be sure diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 28c925b5af..ac5948dadd 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -76,6 +76,11 @@ struct pg_locale_struct bool deterministic; union { + struct + { + const char *locale; + bool cclass_posix; + } builtin; locale_t lt; #ifdef USE_ICU struct @@ -112,7 +117,7 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); - +extern const char *builtin_validate_locale(int encoding, const char *loc_str); extern void icu_validate_locale(const char *loc_str); extern char *icu_language_tag(const char *loc_str, int elevel); diff --git a/src/test/icu/t/010_database.pl 
b/src/test/icu/t/010_database.pl index 8a1fc12ec6..5f8ef16803 100644 --- a/src/test/icu/t/010_database.pl +++ b/src/test/icu/t/010_database.pl @@ -27,9 +27,8 @@ CREATE TABLE icu (def text, en text COLLATE "en-x-icu", upfirst text COLLATE upp INSERT INTO icu VALUES ('a', 'a', 'a'), ('b', 'b', 'b'), ('A', 'A', 'A'), ('B', 'B', 'B'); }); -is( $node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT NULL}), - qq(t), - 'ICU unicode version defined'); +is($node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT NULL}), + qq(t), 'ICU unicode version defined'); is( $node1->safe_psql('dbicu', q{SELECT def FROM icu ORDER BY def}), qq(A @@ -63,14 +62,13 @@ is( $node1->psql( 0, "C locale works for ICU"); -# Test that LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE -# are specified -is( $node1->psql( - 'postgres', - q{CREATE DATABASE dbicu2 LOCALE_PROVIDER icu LOCALE '@colStrength=primary' - LC_COLLATE='C' LC_CTYPE='C' TEMPLATE template0 ENCODING UTF8} - ), - 0, - "LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE are specified"); +my ($ret, $stdout, $stderr) = $node1->psql('postgres', + q{CREATE DATABASE dbicu LOCALE_PROVIDER builtin LOCALE 'C' TEMPLATE dbicu} +); +isnt($ret, 0, "locale provider must match template: exit code not 0"); +like( + $stderr, + qr/ERROR: new locale provider \(builtin\) does not match locale provider of the template database \(icu\)/, + "locale provider must match template: error message"); done_testing(); diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index 0649564485..ece4a8e99d 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -650,6 +650,26 @@ EXPLAIN (COSTS OFF) (3 rows) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + b +----- + ABD + Abc + abc + bbc 
+(4 rows) + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +ERROR: invalid locale name "en_US" for builtin provider +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails +ERROR: conflicting or redundant options +DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported @@ -754,7 +774,7 @@ DETAIL: FROM cannot be specified together with any other options. -- must get rid of them. -- DROP SCHEMA collate_tests CASCADE; -NOTICE: drop cascades to 19 other objects +NOTICE: drop cascades to 21 other objects DETAIL: drop cascades to table collate_test1 drop cascades to table collate_test_like drop cascades to table collate_test2 @@ -771,6 +791,8 @@ drop cascades to function dup(anyelement) drop cascades to table collate_test20 drop cascades to table collate_test21 drop cascades to table collate_test22 +drop cascades to collation builtin_c +drop cascades to collation builtin_posix drop cascades to collation mycoll2 drop cascades to table collate_test23 drop cascades to view collate_on_int diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..2ef1826d91 --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,109 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test preinstalled PG_C_UTF8 collation. +-- +CREATE TABLE builtin_test ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('ábc DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +----------------+----------------+----------------+----------------+---------+---------------+-----------------+--------------- + abc DEF | abc def | Abc Def | ABC DEF | 7 | 7 | 7 | 7 + ábc DÉF | ábc déf | Ábc Déf | ÁBC DÉF | 9 | 9 | 9 | 9 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE builtin_test; +-- character classes +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; + ?column? 
+---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[α-λ]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1d8a414eea..e48cb4b7a3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index 
c3d40fc195..01d5c69fe4 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -244,6 +244,16 @@ EXPLAIN (COSTS OFF) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); + +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails + CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..584c50f915 --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,54 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test preinstalled PG_C_UTF8 collation. 
+-- + +CREATE TABLE builtin_test ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('ábc DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM builtin_test; + +DROP TABLE builtin_test; + +-- character classes + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT '@' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; +SELECT 'Δ' ~* '[α-λ]' COLLATE PG_C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed -- 2.34.1