From c48d91ab80e151093dbae0ada6a9691a2f112c5a Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 8 May 2023 13:48:01 -0700 Subject: [PATCH v10 2/4] ICU: for locale "C", automatically use "builtin" provider instead. Postgres expects locale C to be optimizable to simple locale-unaware byte operations; while ICU does not recognize the locale "C" at all, and falls back to the root locale. If the user specifies locale "C" when creating a new collation or a new database with the ICU provider, automatically switch it to the "builtin" provider. If provider is libc, behavior is unchanged. Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com --- doc/src/sgml/charset.sgml | 6 +++ doc/src/sgml/ref/create_collation.sgml | 6 +++ doc/src/sgml/ref/create_database.sgml | 5 +++ doc/src/sgml/ref/createdb.sgml | 5 +++ doc/src/sgml/ref/initdb.sgml | 23 ++++++----- src/backend/commands/collationcmds.c | 20 ++++++++++ src/backend/commands/dbcommands.c | 21 ++++++++++ src/bin/initdb/initdb.c | 17 ++++++++ src/bin/initdb/t/001_initdb.pl | 39 +++++++++++++++++++ src/bin/scripts/t/020_createdb.pl | 18 +++++++++ .../regress/expected/collate.icu.utf8.out | 14 +++++-- src/test/regress/sql/collate.icu.utf8.sql | 6 +++ 12 files changed, 168 insertions(+), 12 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index b38cf82f83..9a7f9263ec 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -407,6 +407,12 @@ initdb --locale-provider=icu --icu-locale=en change in results. LC_COLLATE and LC_CTYPE can be set independently of the ICU locale. + + The ICU provider does not accept the C + locale. Commands that create collations or database with the + icu provider and ICU locale C use + the provider builtin instead. + For the ICU provider, results may depend on the version of the ICU diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index c63a350c1e..67ea247b39 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -131,6 +131,12 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM libc is the default. See for details. + + If the provider is icu and the locale is + C or POSIX, the provider is + automatically set to builtin; as the ICU provider + doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 5655d6c823..17550c19bb 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -196,6 +196,11 @@ CREATE DATABASE name Specifies the ICU locale ID if the ICU locale provider is used. + + If specified as C or POSIX, the + provider is automatically set to builtin, as the + ICU provider doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index ee4644818d..5352cce744 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -154,6 +154,11 @@ PostgreSQL documentation Specifies the ICU locale ID to be used in this database, if the ICU locale provider is selected. + + If specified as C or POSIX, the + provider is automatically set to builtin, as the + ICU provider doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 96f84dc8ca..d9620e5931 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -94,13 +94,14 @@ PostgreSQL documentation By default, initdb uses the ICU library to provide - locale services if the server was built with ICU support; otherwise it uses - the libc locale provider (see ). To choose the specific ICU locale ID to - apply, use the option . Note that for - implementation reasons and to support legacy code, - initdb will still select and initialize libc locale - settings when the ICU locale provider is used. + locale services if the server was built with ICU support. To choose the + specific ICU locale ID to apply, use the option + . Note that for implementation reasons and to + support legacy code, initdb will still select and + initialize libc locale settings when the ICU locale provider is used. If + --icu-locale is C (or equivalently + POSIX), the builtin provider will be + selected instead of ICU. @@ -109,8 +110,7 @@ PostgreSQL documentation --locale-provider=libc, or build the server without ICU support. The libc locale provider takes the locale settings from the environment, and determines the encoding from the locale - settings. This is almost always sufficient, unless there are special - requirements. + settings. @@ -250,6 +250,11 @@ PostgreSQL documentation Specifies the ICU locale when the ICU provider is used. Locale support is described in . + + If specified as C or POSIX, the + provider is automatically set to builtin, as the + ICU provider doesn't support an ICU locale of C. + If this option is not specified, the locale is inherited from the environment in which initdb runs. The environment's diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 4748946499..3b2ba38250 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -252,6 +252,26 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); + /* + * Postgres defines the "C" (and equivalently, "POSIX") locales to be + * optimizable to byte operations (memcmp(), pg_ascii_tolower(), + * etc.); use the builtin provider instead of ICU if the locale is C. + * + * Don't change the provider during binary upgrade. + */ + if (!IsBinaryUpgrade && collprovider == COLLPROVIDER_ICU && + colliculocale && + (strcmp(colliculocale, "C") == 0 || + strcmp(colliculocale, "POSIX") == 0)) + { + ereport(NOTICE, + (errmsg("using locale provider \"builtin\" for ICU locale \"%s\"", + colliculocale))); + builtin_locale = colliculocale; + colliculocale = NULL; + collprovider = COLLPROVIDER_BUILTIN; + } + if (collprovider == COLLPROVIDER_BUILTIN) { if (!builtin_locale) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 016852644f..5c8caeac2d 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1043,6 +1043,27 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + /* + * Postgres defines the "C" (and equivalently, "POSIX") locales to be + * optimizable to byte operations (memcmp(), pg_ascii_tolower(), etc.); + * use the builtin provider instead of ICU if the locale is C. + * + * Don't change the provider during binary upgrade or when both the + * provider and ICU locale are unchanged from the template. + */ + if (!IsBinaryUpgrade && dblocprovider == COLLPROVIDER_ICU && + dbiculocale && + (dblocprovider != src_locprovider || + strcmp(dbiculocale, src_iculocale) != 0) && + (strcmp(dbiculocale, "C") == 0 || strcmp(dbiculocale, "POSIX") == 0)) + { + ereport(NOTICE, + (errmsg("using locale provider \"builtin\" for ICU locale \"%s\"", + dbiculocale))); + dbiculocale = NULL; + dblocprovider = COLLPROVIDER_BUILTIN; + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 6fc19c8d64..897521cf35 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2394,6 +2394,23 @@ setlocales(void) lc_messages = locale; } + /* + * Postgres defines the "C" (and equivalently, "POSIX") locales to be + * optimizable to byte operations (memcmp(), pg_ascii_tolower(), etc.); + * use the builtin provider instead of ICU if the locale is C. + */ + if (icu_locale && locale_provider == COLLPROVIDER_ICU && + (strcmp(icu_locale, "C") == 0 || + strcmp(icu_locale, "POSIX") == 0 || + strncmp(icu_locale, "C.", 2) == 0 || + strncmp(icu_locale, "POSIX.", 6) == 0)) + { + pg_log_info("using locale provider \"builtin\" for ICU locale \"%s\"", + icu_locale); + icu_locale = NULL; + locale_provider = COLLPROVIDER_BUILTIN; + } + /* * canonicalize locale names, and obtain any missing values from our * current environment diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 157d6acfd4..f3f832a413 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -111,6 +111,45 @@ if ($ENV{with_icu} eq 'yes') ], 'option --icu-locale'); + # transformed to provider=builtin + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + "$tempdir/data4a" + ], + 'option --icu-locale=C'); + + # transformed to provider=builtin + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--locale=C', + "$tempdir/data4b" + ], + 'option --icu-locale=C --locale=C'); + + # transformed to provider=builtin + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--lc-collate=C', + "$tempdir/data4c" + ], + 'option --icu-locale=C --lc-collate=C'); + + # transformed to provider=builtin + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--lc-ctype=C', + "$tempdir/data4d" + ], + 'option --icu-locale=C --lc-ctype=C'); + command_fails_like( [ 'initdb', '--no-sync', diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index f1d6db0f48..121ad5112b 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -86,6 +86,24 @@ if ($ENV{with_icu} eq 'yes') ], 'create database with icu locale from template database with icu provider' ); + + # transformed into provider "builtin" + $node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=icu', + '--icu-locale=C', 'test_builtin_icu1' + ], + 'create database with provider "icu" and ICU_LOCALE="C"' + ); + + # transformed into provider "builtin" + $node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=icu', + '--icu-locale=C', '--lc-ctype=C', 'test_builtin_icu2' + ], + 'create database with provider "icu" and ICU_LOCALE="C" and LC_CTYPE=C' + ); } else { diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 00dee24549..5ba8f75558 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1035,6 +1035,9 @@ BEGIN END $$; RESET client_min_messages; +-- uses "builtin" provider instead +CREATE COLLATION testc (provider = icu, locale='C'); +NOTICE: using locale provider "builtin" for ICU locale "C" CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" ERROR: parameter "locale" must be specified SET icu_validation_level = ERROR; @@ -1058,8 +1061,11 @@ SELECT collname FROM pg_collation WHERE collname LIKE 'test%' ORDER BY 1; test0 test1 test5 -(3 rows) + testc +(4 rows) +DROP COLLATION test1; +CREATE COLLATION test1 (provider = icu, locale = 'und'); ALTER COLLATION test1 RENAME TO test11; ALTER COLLATION test0 RENAME TO test11; -- fail ERROR: collation "test11" already exists in schema "collate_tests" @@ -1079,7 +1085,8 @@ SELECT collname, nspname, obj_description(pg_collation.oid, 'pg_collation') test0 | collate_tests | US English test11 | test_schema | test5 | collate_tests | -(3 rows) + testc | collate_tests | +(4 rows) DROP COLLATION test0, test_schema.test11, test5; DROP COLLATION test0; -- fail @@ -1089,7 +1096,8 @@ NOTICE: collation "test0" does not exist, skipping SELECT collname FROM pg_collation WHERE collname LIKE 'test%'; collname ---------- -(0 rows) + testc +(1 row) DROP SCHEMA test_schema; DROP ROLE regress_test_role; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index a8001b4b8e..122c166966 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -375,6 +375,9 @@ $$; RESET client_min_messages; +-- uses "builtin" provider instead +CREATE COLLATION testc (provider = icu, locale='C'); + CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" SET icu_validation_level = ERROR; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails @@ -388,6 +391,9 @@ CREATE COLLATION test5 FROM test0; SELECT collname FROM pg_collation WHERE collname LIKE 'test%' ORDER BY 1; +DROP COLLATION test1; +CREATE COLLATION test1 (provider = icu, locale = 'und'); + ALTER COLLATION test1 RENAME TO test11; ALTER COLLATION test0 RENAME TO test11; -- fail ALTER COLLATION test1 RENAME TO test22; -- fail -- 2.34.1