From b33dc56960378a1047ccf9c0387a1fe333912140 Mon Sep 17 00:00:00 2001
From: Jeff Davis
Date: Fri, 28 Apr 2023 12:22:41 -0700
Subject: [PATCH v3 3/4] ICU: fix up old libc-style locale strings.

Before transforming a locale string into a language tag, fix up old
libc-style locale strings such as 'de__PHONEBOOK' or
'fr_FR@EURO'. Older ICU versions did this automatically, but ICU
version 64 removed that support.
---
 src/backend/utils/adt/pg_locale.c | 59 ++++++++++++++++-
 src/bin/initdb/initdb.c | 63 ++++++++++++++++++-
 .../regress/expected/collate.icu.utf8.out | 11 ++++
 src/test/regress/sql/collate.icu.utf8.sql | 7 +++
 4 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 3e19b21122..9f2c139b0b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2812,6 +2812,60 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
 	pfree(lower_str);
 }
 
+static const char *icu_variant_map[][2] = {	/* { old '@VARIANT', ICU keyword form } */
+	{ "@EURO", "@currency=EUR" },
+	{ "@PINYIN", "@collation=pinyin" },
+	{ "@STROKE", "@collation=stroke" },
+};
+
+#define ICU_VARIANT_MAP_SIZE \
+	(sizeof(icu_variant_map)/sizeof(icu_variant_map[0]))
+
+/*
+ * ICU version 64 removed the ability to transform locale strings of the form
+ * '...@VARIANT' into proper language tags, so do it here for all ICU versions.
+ * A trailing variant listed in icu_variant_map is replaced by its keyword form.
+ * Returns a palloc'd copy in every case; the caller is expected to pfree it.
+ */
+static char *
+icu_fix_variants(const char *loc_str)
+{
+	const char *old_variant = strrchr(loc_str, '@');	/* last '@', or NULL */
+
+	/*
+	 * Everything from the last '@' onward is the candidate variant; if it
+	 * matches a map entry, splice the replacement onto the prefix before it.
+	 */
+	if (old_variant)
+	{
+		size_t prefix_len = old_variant - loc_str; /* bytes before the '@' */
+
+		for (int i = 0; i < ICU_VARIANT_MAP_SIZE; i++)
+		{
+			const char *map_variant = icu_variant_map[i][0];
+			const char *map_replacement = icu_variant_map[i][1];
+
+			if (pg_strcasecmp(old_variant, map_variant) == 0)	/* whole tail must match */
+			{
+				size_t replacement_len = strlen(map_replacement);
+				size_t result_len;
+				char *result;
+
+				result_len = prefix_len + replacement_len + 1;	/* + 1 for the NUL */
+				result = palloc(result_len);
+
+				memcpy(result, loc_str, prefix_len);
+				memcpy(result + prefix_len, map_replacement, replacement_len);
+				result[prefix_len + replacement_len] = '\0';
+
+				return result;
+			}
+		}
+	}
+
+	return pstrdup(loc_str);	/* nothing to fix up: hand back a plain copy */
+}
+
 #endif
 
 /*
@@ -2828,6 +2882,7 @@ icu_language_tag(const char *loc_str, int elevel)
 {
 #ifdef USE_ICU
 	UErrorCode status;
+	char *fixed_loc_str = icu_fix_variants(loc_str);	/* freed after the loop below */
 	char *langtag;
 	size_t buflen = 32; /* arbitrary starting buffer size */
 	const bool strict = true;
@@ -2844,7 +2899,7 @@ icu_language_tag(const char *loc_str, int elevel)
 		int32_t len;
 
 		status = U_ZERO_ERROR;
-		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+		len = uloc_toLanguageTag(fixed_loc_str, langtag, buflen, strict, &status);
 
 		/*
 		 * If the result fits in the buffer exactly (len == buflen),
@@ -2864,6 +2919,8 @@ icu_language_tag(const char *loc_str, int elevel)
 		break;
 	}
 
+	pfree(fixed_loc_str);	/* the fixed-up copy is not used past this point */
+
 	if (U_FAILURE(status))
 	{
 		pfree(langtag);
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 4086834458..600c8d93f3 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2229,6 +2229,64 @@ check_icu_locale_encoding(int user_enc)
 	return true;
 }
 
+#ifdef USE_ICU
+
+static const char *icu_variant_map[][2] = {	/* { old '@VARIANT', ICU keyword form } */
+	{ "@EURO", "@currency=EUR" },
+	{ "@PINYIN", "@collation=pinyin" },
+	{ "@STROKE", "@collation=stroke" },
+};
+
+#define ICU_VARIANT_MAP_SIZE \
+	(sizeof(icu_variant_map)/sizeof(icu_variant_map[0]))
+
+/*
+ * ICU version 64 removed the ability to transform locale strings of the form
+ * '...@VARIANT' into proper language tags, so do it here for all ICU versions.
+ * A trailing variant listed in icu_variant_map is replaced by its keyword form.
+ * Returns an allocated copy in every case; the caller is expected to pg_free it.
+ */
+static char *
+icu_fix_variants(const char *loc_str)
+{
+	const char *old_variant = strrchr(loc_str, '@');	/* last '@', or NULL */
+
+	/*
+	 * Everything from the last '@' onward is the candidate variant; if it
+	 * matches a map entry, splice the replacement onto the prefix before it.
+	 */
+	if (old_variant)
+	{
+		size_t prefix_len = old_variant - loc_str; /* bytes before the '@' */
+
+		for (int i = 0; i < ICU_VARIANT_MAP_SIZE; i++)
+		{
+			const char *map_variant = icu_variant_map[i][0];
+			const char *map_replacement = icu_variant_map[i][1];
+
+			if (pg_strcasecmp(old_variant, map_variant) == 0)	/* whole tail must match */
+			{
+				size_t replacement_len = strlen(map_replacement);
+				size_t result_len;
+				char *result;
+
+				result_len = prefix_len + replacement_len + 1;	/* + 1 for the NUL */
+				result = pg_malloc(result_len);
+
+				memcpy(result, loc_str, prefix_len);
+				memcpy(result + prefix_len, map_replacement, replacement_len);
+				result[prefix_len + replacement_len] = '\0';
+
+				return result;
+			}
+		}
+	}
+
+	return pg_strdup(loc_str);	/* nothing to fix up: hand back a plain copy */
+}
+
+#endif
+
 /*
  * Convert to canonical BCP47 language tag. Must be consistent with
  * icu_language_tag().
@@ -2238,6 +2296,7 @@ icu_language_tag(const char *loc_str)
 {
 #ifdef USE_ICU
 	UErrorCode status;
+	char *fixed_loc_str = icu_fix_variants(loc_str);	/* freed after the loop below */
 	char *langtag;
 	size_t buflen = 32; /* arbitrary starting buffer size */
 	const bool strict = true;
@@ -2254,7 +2313,7 @@ icu_language_tag(const char *loc_str)
 		int32_t len;
 
 		status = U_ZERO_ERROR;
-		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+		len = uloc_toLanguageTag(fixed_loc_str, langtag, buflen, strict, &status);
 
 		/*
 		 * If the result fits in the buffer exactly (len == buflen),
@@ -2273,6 +2332,8 @@ icu_language_tag(const char *loc_str)
 		break;
 	}
 
+	pg_free(fixed_loc_str);	/* the fixed-up copy is not used past this point */
+
 	if (U_FAILURE(status))
 	{
 		pg_free(langtag);
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 53ab496bfe..5f5b61d036 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1048,15 +1048,26 @@ CREATE COLLATION testx (provider = icu, locale = 'c', rules = '&V << w <<< W');
 ERROR: RULES not supported for C or POSIX locale
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
 ERROR: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
+CREATE COLLATION testx (provider = icu, locale = '@ASDF'); -- fails
+ERROR: could not convert locale name "@ASDF" to language tag: U_ILLEGAL_ARGUMENT_ERROR
 SET icu_validation_level = WARNING;
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
 WARNING: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
 CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
 WARNING: ICU locale "nonsense-nowhere" has unknown language "nonsense"
 HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED.
+CREATE COLLATION testx (provider = icu, locale = '@ASDF'); DROP COLLATION testx;
+WARNING: could not convert locale name "@ASDF" to language tag: U_ILLEGAL_ARGUMENT_ERROR
 RESET icu_validation_level;
 CREATE COLLATION testx (provider = icu, locale = 'c'); DROP COLLATION testx;
 CREATE COLLATION testx (provider = icu, locale = 'posix'); DROP COLLATION testx;
+-- test special variants
+CREATE COLLATION testx (provider = icu, locale = '@EURO'); DROP COLLATION testx;
+NOTICE: using standard form "und-u-cu-eur" for ICU locale "@EURO"
+CREATE COLLATION testx (provider = icu, locale = '@pinyin'); DROP COLLATION testx;
+NOTICE: using standard form "und-u-co-pinyin" for ICU locale "@pinyin"
+CREATE COLLATION testx (provider = icu, locale = '@stroke'); DROP COLLATION testx;
+NOTICE: using standard form "und-u-co-stroke" for ICU locale "@stroke"
 CREATE COLLATION test4 FROM nonsense;
 ERROR: collation "nonsense" for encoding "UTF8" does not exist
 CREATE COLLATION test5 FROM test0;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 63d5352ee6..e4bbd2c009 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -382,14 +382,21 @@ CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
 CREATE COLLATION testx (provider = icu, locale = 'c', deterministic = false); -- fails
 CREATE COLLATION testx (provider = icu, locale = 'c', rules = '&V << w <<< W'); -- fails
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
+CREATE COLLATION testx (provider = icu, locale = '@ASDF'); -- fails
 SET icu_validation_level = WARNING;
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
 CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = '@ASDF'); DROP COLLATION testx;
 RESET icu_validation_level;
 
 CREATE COLLATION testx (provider = icu, locale = 'c'); DROP COLLATION testx;
 CREATE COLLATION testx (provider = icu, locale = 'posix'); DROP COLLATION testx;
 
+-- test special variants
+CREATE COLLATION testx (provider = icu, locale = '@EURO'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = '@pinyin'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = '@stroke'); DROP COLLATION testx;
+
 CREATE COLLATION test4 FROM nonsense;
 CREATE COLLATION test5 FROM test0;
 
-- 
2.34.1