From b6cbf986e1f0b32009ed060ad52a145a01e999d0 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 24 Apr 2023 15:46:17 -0700 Subject: [PATCH v6 1/4] ICU: support locale "C" with the same behavior as libc. The "C" locale doesn't actually use a provider at all, it's a special locale that uses memcmp() and built-in character classification. Make it behave the same in ICU as libc (even though it doesn't actually make use of either provider). Discussion: https://postgr.es/m/87v8hoexdv.fsf@news-spur.riddles.org.uk --- src/backend/commands/collationcmds.c | 43 ++++++---- src/backend/commands/dbcommands.c | 42 ++++++---- src/backend/utils/adt/pg_locale.c | 83 ++++++++++++++----- .../regress/expected/collate.icu.utf8.out | 6 ++ src/test/regress/sql/collate.icu.utf8.sql | 4 + 5 files changed, 127 insertions(+), 51 deletions(-) diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 2969a2bb21..dd6cd2682f 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -264,26 +264,39 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("parameter \"locale\" must be specified"))); - /* - * During binary upgrade, preserve the locale string. Otherwise, - * canonicalize to a language tag. - */ - if (!IsBinaryUpgrade) + if (strcmp(colliculocale, "C") == 0 || + strcmp(colliculocale, "POSIX") == 0) { - char *langtag = icu_language_tag(colliculocale, - icu_validation_level); - - if (langtag && strcmp(colliculocale, langtag) != 0) + if (!collisdeterministic) + ereport(ERROR, + (errmsg("nondeterministic collations not supported for C or POSIX locale"))); + if (collicurules != NULL) + ereport(ERROR, + (errmsg("RULES not supported for C or POSIX locale"))); + } + else + { + /* + * During binary upgrade, preserve the locale + * string. Otherwise, canonicalize to a language tag. + */ + if (!IsBinaryUpgrade) { - ereport(NOTICE, - (errmsg("using standard form \"%s\" for locale \"%s\"", - langtag, colliculocale))); + char *langtag = icu_language_tag(colliculocale, + icu_validation_level); + + if (langtag && strcmp(colliculocale, langtag) != 0) + { + ereport(NOTICE, + (errmsg("using standard form \"%s\" for locale \"%s\"", + langtag, colliculocale))); - colliculocale = langtag; + colliculocale = langtag; + } } - } - icu_validate_locale(colliculocale); + icu_validate_locale(colliculocale); + } } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 99d4080ea9..bfce8dc348 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1058,27 +1058,37 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); - /* - * During binary upgrade, or when the locale came from the template - * database, preserve locale string. Otherwise, canonicalize to a - * language tag. - */ - if (!IsBinaryUpgrade && dbiculocale != src_iculocale) + if (strcmp(dbiculocale, "C") == 0 || + strcmp(dbiculocale, "POSIX") == 0) { - char *langtag = icu_language_tag(dbiculocale, - icu_validation_level); - - if (langtag && strcmp(dbiculocale, langtag) != 0) + if (dbicurules != NULL) + ereport(ERROR, + (errmsg("ICU_RULES not supported for C or POSIX locale"))); + } + else + { + /* + * During binary upgrade, or when the locale came from the + * template database, preserve locale string. Otherwise, + * canonicalize to a language tag. + */ + if (!IsBinaryUpgrade && dbiculocale != src_iculocale) { - ereport(NOTICE, - (errmsg("using standard form \"%s\" for locale \"%s\"", - langtag, dbiculocale))); + char *langtag = icu_language_tag(dbiculocale, + icu_validation_level); + + if (langtag && strcmp(dbiculocale, langtag) != 0) + { + ereport(NOTICE, + (errmsg("using standard form \"%s\" for locale \"%s\"", + langtag, dbiculocale))); - dbiculocale = langtag; + dbiculocale = langtag; + } } - } - icu_validate_locale(dbiculocale); + icu_validate_locale(dbiculocale); + } } else { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 31e3b16ae0..986dcbd2a7 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1246,8 +1246,15 @@ lookup_collation_cache(Oid collation, bool set_flags) } else { - cache_entry->collate_is_c = false; - cache_entry->ctype_is_c = false; + Datum datum; + const char *colliculocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale); + colliculocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = ((strcmp(colliculocale, "C") == 0) || + (strcmp(colliculocale, "POSIX") == 0)); + cache_entry->ctype_is_c = cache_entry->collate_is_c; } cache_entry->flags_valid = true; @@ -1279,16 +1286,27 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); + + if (default_locale.provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU + localeptr = default_locale.info.icu.locale; +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif + } + else + { + localeptr = setlocale(LC_COLLATE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_COLLATE setting"); + } if (strcmp(localeptr, "C") == 0) result = true; @@ -1332,16 +1350,27 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); + + if (default_locale.provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU + localeptr = default_locale.info.icu.locale; +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif + } + else + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } if (strcmp(localeptr, "C") == 0) result = true; @@ -1375,7 +1404,13 @@ make_icu_collator(const char *iculocstr, #ifdef USE_ICU UCollator *collator; - collator = pg_ucol_open(iculocstr); + if (strcmp(iculocstr, "C") == 0 || strcmp(iculocstr, "POSIX") == 0) + { + Assert(icurules == NULL); + collator = NULL; + } + else + collator = pg_ucol_open(iculocstr); /* * If rules are specified, we extract the rules of the standard collation, @@ -1650,6 +1685,9 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + if (strcmp("C", collcollate) == 0 || strcmp("POSIX", collcollate) == 0) + return NULL; + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -1668,9 +1706,7 @@ get_collation_actual_version(char collprovider, const char *collcollate) else #endif if (collprovider == COLLPROVIDER_LIBC && - pg_strcasecmp("C", collcollate) != 0 && - pg_strncasecmp("C.", collcollate, 2) != 0 && - pg_strcasecmp("POSIX", collcollate) != 0) + pg_strncasecmp("C.", collcollate, 2) != 0) { #if defined(__GLIBC__) /* Use the glibc version because we don't have anything better. */ @@ -2457,6 +2493,13 @@ pg_ucol_open(const char *loc_str) if (loc_str == NULL) elog(ERROR, "opening default collator is not supported"); + /* + * Must never open special values C or POSIX, which are treated specially + * and not passed to the provider. + */ + if (strcmp(loc_str, "C") == 0 || strcmp(loc_str, "POSIX") == 0) + elog(ERROR, "unexpected ICU locale string: %s", loc_str); + /* * In ICU versions 54 and earlier, "und" is not a recognized spelling of * the root locale. If the first component of the locale is "und", replace diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index c658ee1404..bfc28ecfcf 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1043,12 +1043,18 @@ ERROR: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails ERROR: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR +CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails +ERROR: nondeterministic collations not supported for C or POSIX locale +CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails +ERROR: RULES not supported for C or POSIX locale RESET icu_validation_level; CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; WARNING: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; WARNING: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. +CREATE COLLATION testx (provider = icu, locale = 'C'); DROP COLLATION testx; +CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; ERROR: collation "nonsense" for encoding "UTF8" does not exist CREATE COLLATION test5 FROM test0; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 7bd0901281..572dc5a50a 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -379,9 +379,13 @@ CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, nee SET icu_validation_level = ERROR; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails +CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails +CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails RESET icu_validation_level; CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; +CREATE COLLATION testx (provider = icu, locale = 'C'); DROP COLLATION testx; +CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; CREATE COLLATION test5 FROM test0; -- 2.34.1