From c7405ecc07e552fa9bdf4cf535b4757c2de7e9e4 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Sun, 26 Oct 2025 14:51:47 -0700 Subject: [PATCH v7 2/9] Define char_tolower()/char_toupper() for all locale providers. The behavior is defined for each locale provider rather than unconditionally depending on the global LC_CTYPE setting. Needed as an alternative for tolower()/toupper() for some callers. --- src/backend/utils/adt/like.c | 4 +-- src/backend/utils/adt/pg_locale.c | 32 ++++++++++++++++------- src/backend/utils/adt/pg_locale_builtin.c | 18 +++++++++++++ src/backend/utils/adt/pg_locale_icu.c | 23 ++++++++++++++++ src/backend/utils/adt/pg_locale_libc.c | 21 +++++++++++++-- src/include/utils/pg_locale.h | 10 +++---- 6 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 4216ac17f43..37c1c86aee8 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -209,9 +209,7 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) * way. */ - if (locale->ctype_is_c || - (char_tolower_enabled(locale) && - pg_database_encoding_max_length() == 1)) + if (locale->ctype_is_c || locale->ctype->pattern_casefold_char) { p = VARDATA_ANY(pat); plen = VARSIZE_ANY_EXHDR(pat); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index b14c7837938..9631d274611 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1564,25 +1564,39 @@ char_is_cased(char ch, pg_locale_t locale) } /* - * char_tolower_enabled() + * char_tolower() * - * Does the provider support char_tolower()? + * Convert single-byte char to lowercase. Not correct for multibyte encodings, + * but needed for historical compatibility purposes. 
*/ -bool -char_tolower_enabled(pg_locale_t locale) +char +char_tolower(unsigned char ch, pg_locale_t locale) { - return (locale->ctype->char_tolower != NULL); + if (locale->ctype == NULL) + { + if (ch >= 'A' && ch <= 'Z') + return ch + ('a' - 'A'); + return ch; + } + return locale->ctype->char_tolower(ch, locale); } /* - * char_tolower() + * char_toupper() * - * Convert char (single-byte encoding) to lowercase. + * Convert single-byte char to uppercase. Not correct for multibyte encodings, + * but needed for historical compatibility purposes. */ char -char_tolower(unsigned char ch, pg_locale_t locale) +char_toupper(unsigned char ch, pg_locale_t locale) { - return locale->ctype->char_tolower(ch, locale); + if (locale->ctype == NULL) + { + if (ch >= 'a' && ch <= 'z') + return ch - ('a' - 'A'); + return ch; + } + return locale->ctype->char_toupper(ch, locale); } /* diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 1021e0d129b..5059b2bb59a 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -185,6 +185,22 @@ wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale) return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full); } +static char +char_tolower_builtin(unsigned char ch, pg_locale_t locale) +{ + if (ch >= 'A' && ch <= 'Z') + return ch + ('a' - 'A'); + return ch; +} + +static char +char_toupper_builtin(unsigned char ch, pg_locale_t locale) +{ + if (ch >= 'a' && ch <= 'z') + return ch - ('a' - 'A'); + return ch; +} + static bool char_is_cased_builtin(char ch, pg_locale_t locale) { @@ -219,6 +235,8 @@ static const struct ctype_methods ctype_methods_builtin = { .wc_ispunct = wc_ispunct_builtin, .wc_isspace = wc_isspace_builtin, .wc_isxdigit = wc_isxdigit_builtin, + .char_tolower = char_tolower_builtin, + .char_toupper = char_toupper_builtin, .char_is_cased = char_is_cased_builtin, .wc_tolower = wc_tolower_builtin, .wc_toupper = wc_toupper_builtin, diff 
--git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index f5a0cc8fe41..449e3bbb7a6 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -121,6 +121,27 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +/* + * ICU still depends on libc for compatibility with certain historical + * behavior for single-byte encodings. XXX: consider fixing by decoding the + * single byte into a code point, and using u_tolower(). + */ +static char +char_tolower_icu(unsigned char ch, pg_locale_t locale) +{ + if (isupper(ch)) + return tolower(ch); + return ch; +} + +static char +char_toupper_icu(unsigned char ch, pg_locale_t locale) +{ + if (islower(ch)) + return toupper(ch); + return ch; +} + static bool char_is_cased_icu(char ch, pg_locale_t locale) { @@ -238,6 +259,8 @@ static const struct ctype_methods ctype_methods_icu = { .wc_ispunct = wc_ispunct_icu, .wc_isspace = wc_isspace_icu, .wc_isxdigit = wc_isxdigit_icu, + .char_tolower = char_tolower_icu, + .char_toupper = char_toupper_icu, .char_is_cased = char_is_cased_icu, .wc_toupper = toupper_icu, .wc_tolower = tolower_icu, diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 716f005066a..b0428ad288e 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -251,8 +251,21 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale) static char char_tolower_libc(unsigned char ch, pg_locale_t locale) { - Assert(pg_database_encoding_max_length() == 1); - return tolower_l(ch, locale->lt); + locale_t loc = locale->lt; + + if (isupper_l(ch, loc)) + return tolower_l(ch, loc); + return ch; +} + +static char +char_toupper_libc(unsigned char ch, pg_locale_t locale) +{ + locale_t loc = locale->lt; + + if (islower_l(ch, loc)) + return toupper_l(ch, loc); + return ch; } static bool @@ -338,9 +351,11 @@ static const 
struct ctype_methods ctype_methods_libc_sb = { .wc_isxdigit = wc_isxdigit_libc_sb, .char_is_cased = char_is_cased_libc, .char_tolower = char_tolower_libc, + .char_toupper = char_toupper_libc, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, .max_chr = UCHAR_MAX, + .pattern_casefold_char = true, }; /* @@ -363,6 +378,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = { .wc_isxdigit = wc_isxdigit_libc_sb, .char_is_cased = char_is_cased_libc, .char_tolower = char_tolower_libc, + .char_toupper = char_toupper_libc, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, .max_chr = UCHAR_MAX, @@ -384,6 +400,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = { .wc_isxdigit = wc_isxdigit_libc_mb, .char_is_cased = char_is_cased_libc, .char_tolower = char_tolower_libc, + .char_toupper = char_toupper_libc, .wc_toupper = toupper_libc_mb, .wc_tolower = tolower_libc_mb, }; diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 683e1a0eef8..790db566e91 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -113,13 +113,13 @@ struct ctype_methods /* required */ bool (*char_is_cased) (char ch, pg_locale_t locale); + char (*char_tolower) (unsigned char ch, pg_locale_t locale); + char (*char_toupper) (unsigned char ch, pg_locale_t locale); /* - * Optional. If defined, will only be called for single-byte encodings. If - * not defined, or if the encoding is multibyte, will fall back to - * pg_strlower(). + * If true, use byte-at-a-time case folding for case-insensitive patterns. 
*/ - char (*char_tolower) (unsigned char ch, pg_locale_t locale); + bool pattern_casefold_char; /* * For regex and pattern matching efficiency, the maximum char value @@ -177,8 +177,8 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern bool char_is_cased(char ch, pg_locale_t locale); -extern bool char_tolower_enabled(pg_locale_t locale); extern char char_tolower(unsigned char ch, pg_locale_t locale); +extern char char_toupper(unsigned char ch, pg_locale_t locale); extern size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); -- 2.43.0