From 94a0e519f13cfc8554d11cf46ed7bbef8aad2ed3 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Fri, 21 Nov 2025 12:14:21 -0800 Subject: [PATCH v10 08/11] Use multibyte-aware extraction of pattern prefixes. Previously, like_fixed_prefix() used char-at-a-time logic, which forced it to be too conservative for case-insensitive matching. Now, use a pg_wchar-at-a-time loop for text types, along with proper detection of cased characters; and preserve the char-at-a-time logic for bytea. Removes the pg_locale_t char_is_cased() single-byte method and replaces it with a proper multibyte pg_iswcased() method. Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com --- src/backend/utils/adt/like_support.c | 111 +++++++++++++--------- src/backend/utils/adt/pg_locale.c | 26 +++-- src/backend/utils/adt/pg_locale_builtin.c | 7 +- src/backend/utils/adt/pg_locale_icu.c | 15 ++- src/backend/utils/adt/pg_locale_libc.c | 23 +++-- src/include/utils/pg_locale.h | 5 +- 6 files changed, 103 insertions(+), 84 deletions(-) diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 0debccfa67b..e7255fa652a 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -987,12 +987,11 @@ static Pattern_Prefix_Status like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec) { - char *match; char *patt; int pattlen; Oid typeid = patt_const->consttype; - int pos, - match_pos; + int pos; + int match_pos = 0; pg_locale_t locale = 0; /* the right-hand const is type text or bytea */ @@ -1020,67 +1019,91 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, locale = pg_newlocale_from_collation(collation); } + /* for text types, use pg_wchar; for BYTEA, use char */ if (typeid != BYTEAOID) { - patt = TextDatumGetCString(patt_const->constvalue); - pattlen = strlen(patt); + text *val =
DatumGetTextPP(patt_const->constvalue); + pg_wchar *wpatt; + pg_wchar *wmatch; + char *match; + + patt = VARDATA_ANY(val); + pattlen = VARSIZE_ANY_EXHDR(val); + wpatt = palloc((pattlen + 1) * sizeof(pg_wchar)); + wmatch = palloc((pattlen + 1) * sizeof(pg_wchar)); + pg_mb2wchar_with_len(patt, wpatt, pattlen); + + match = palloc(pattlen + 1); + for (pos = 0; pos < pattlen; pos++) + { + /* % and _ are wildcard characters in LIKE */ + if (wpatt[pos] == '%' || + wpatt[pos] == '_') + break; + + /* Backslash escapes the next character */ + if (wpatt[pos] == '\\') + { + pos++; + if (pos >= pattlen) + break; + } + + /* + * For ILIKE, stop if it's a case-varying character (it's sort of + * a wildcard). + */ + if (case_insensitive && pg_iswcased(wpatt[pos], locale)) + break; + + wmatch[match_pos++] = wpatt[pos]; + } + + wmatch[match_pos] = '\0'; + + pg_wchar2mb_with_len(wmatch, match, pattlen); + + pfree(wpatt); + pfree(wmatch); + + *prefix_const = string_to_const(match, typeid); } else { bytea *bstr = DatumGetByteaPP(patt_const->constvalue); + char *match; + patt = VARDATA_ANY(bstr); pattlen = VARSIZE_ANY_EXHDR(bstr); - patt = (char *) palloc(pattlen); - memcpy(patt, VARDATA_ANY(bstr), pattlen); - Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); - } - match = palloc(pattlen + 1); - match_pos = 0; - for (pos = 0; pos < pattlen; pos++) - { - /* % and _ are wildcard characters in LIKE */ - if (patt[pos] == '%' || - patt[pos] == '_') - break; - - /* Backslash escapes the next character */ - if (patt[pos] == '\\') + match = palloc(pattlen + 1); + for (pos = 0; pos < pattlen; pos++) { - pos++; - if (pos >= pattlen) + /* % and _ are wildcard characters in LIKE */ + if (patt[pos] == '%' || + patt[pos] == '_') break; - } - /* - * Stop if case-varying character (it's sort of a wildcard). - * - * In multibyte character sets or with non-libc providers, we can't - * use isalpha, and it does not seem worth trying to convert to - * wchar_t or char32_t. 
Instead, just pass the single byte to the - * provider, which will assume any non-ASCII char is potentially - * case-varying. - */ - if (case_insensitive && char_is_cased(patt[pos], locale)) - break; - - match[match_pos++] = patt[pos]; - } + /* Backslash escapes the next character */ + if (patt[pos] == '\\') + { + pos++; + if (pos >= pattlen) + break; + } - match[match_pos] = '\0'; + match[match_pos++] = patt[pos]; + } - if (typeid != BYTEAOID) - *prefix_const = string_to_const(match, typeid); - else *prefix_const = string_to_bytea_const(match, match_pos); + pfree(match); + } + if (rest_selec != NULL) *rest_selec = like_selectivity(&patt[pos], pattlen - pos, case_insensitive); - pfree(patt); - pfree(match); - /* in LIKE, an empty pattern is an exact match! */ if (pos == pattlen) return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 5aba277ba99..c4e89502f85 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1588,6 +1588,17 @@ pg_iswxdigit(pg_wchar wc, pg_locale_t locale) return locale->ctype->wc_isxdigit(wc, locale); } +bool +pg_iswcased(pg_wchar wc, pg_locale_t locale) +{ + /* for the C locale, Cased and Alpha are equivalent */ + if (locale->ctype == NULL) + return (wc <= (pg_wchar) 127 && + (pg_char_properties[wc] & PG_ISALPHA)); + else + return locale->ctype->wc_iscased(wc, locale); +} + pg_wchar pg_towupper(pg_wchar wc, pg_locale_t locale) { @@ -1614,21 +1625,6 @@ pg_towlower(pg_wchar wc, pg_locale_t locale) return locale->ctype->wc_tolower(wc, locale); } -/* - * char_is_cased() - * - * Fuzzy test of whether the given char is case-varying or not. The argument - * is a single byte, so in a multibyte encoding, just assume any non-ASCII - * char is case-varying.
- */ -bool -char_is_cased(char ch, pg_locale_t locale) -{ - if (locale->ctype == NULL) - return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); - return locale->ctype->char_is_cased(ch, locale); -} - /* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 1021e0d129b..0c2920112bb 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -186,10 +186,9 @@ wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale) } static bool -char_is_cased_builtin(char ch, pg_locale_t locale) +wc_iscased_builtin(pg_wchar wc, pg_locale_t locale) { - return IS_HIGHBIT_SET(ch) || - (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); + return pg_u_prop_cased(to_char32(wc)); } static pg_wchar @@ -219,7 +218,7 @@ static const struct ctype_methods ctype_methods_builtin = { .wc_ispunct = wc_ispunct_builtin, .wc_isspace = wc_isspace_builtin, .wc_isxdigit = wc_isxdigit_builtin, - .char_is_cased = char_is_cased_builtin, + .wc_iscased = wc_iscased_builtin, .wc_tolower = wc_tolower_builtin, .wc_toupper = wc_toupper_builtin, }; diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index f5a0cc8fe41..18d026deda8 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -121,13 +121,6 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); -static bool -char_is_cased_icu(char ch, pg_locale_t locale) -{ - return IS_HIGHBIT_SET(ch) || - (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); -} - /* * XXX: many of the functions below rely on casts directly from pg_wchar to * UChar32, which is correct for the UTF-8 encoding, but not in general. 
@@ -223,6 +216,12 @@ wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale) return u_isxdigit(wc); } +static bool +wc_iscased_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_hasBinaryProperty(wc, UCHAR_CASED); +} + static const struct ctype_methods ctype_methods_icu = { .strlower = strlower_icu, .strtitle = strtitle_icu, @@ -238,7 +237,7 @@ static const struct ctype_methods ctype_methods_icu = { .wc_ispunct = wc_ispunct_icu, .wc_isspace = wc_isspace_icu, .wc_isxdigit = wc_isxdigit_icu, - .char_is_cased = char_is_cased_icu, + .wc_iscased = wc_iscased_icu, .wc_toupper = toupper_icu, .wc_tolower = tolower_icu, }; diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 545ee9a3099..fa419863fa7 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -184,6 +184,13 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale) #endif } +static bool +wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->lt) || + islower_l((unsigned char) wc, locale->lt); +} + static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) { @@ -249,14 +256,10 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale) } static bool -char_is_cased_libc(char ch, pg_locale_t locale) +wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale) { - bool is_multibyte = pg_database_encoding_max_length() > 1; - - if (is_multibyte && IS_HIGHBIT_SET(ch)) - return true; - else - return isalpha_l((unsigned char) ch, locale->lt); + return iswupper_l((wint_t) wc, locale->lt) || + iswlower_l((wint_t) wc, locale->lt); } static pg_wchar @@ -331,7 +334,7 @@ static const struct ctype_methods ctype_methods_libc_sb = { .wc_ispunct = wc_ispunct_libc_sb, .wc_isspace = wc_isspace_libc_sb, .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, + .wc_iscased = wc_iscased_libc_sb, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, }; @@ -356,7 +359,7 @@ static const struct 
ctype_methods ctype_methods_libc_other_mb = { .wc_ispunct = wc_ispunct_libc_sb, .wc_isspace = wc_isspace_libc_sb, .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, + .wc_iscased = wc_iscased_libc_sb, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, }; @@ -377,7 +380,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = { .wc_ispunct = wc_ispunct_libc_mb, .wc_isspace = wc_isspace_libc_mb, .wc_isxdigit = wc_isxdigit_libc_mb, - .char_is_cased = char_is_cased_libc, + .wc_iscased = wc_iscased_libc_mb, .wc_toupper = toupper_libc_mb, .wc_tolower = tolower_libc_mb, }; diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 50520e50127..01f891def7a 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -122,11 +122,9 @@ struct ctype_methods bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale); bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale); bool (*wc_isxdigit) (pg_wchar wc, pg_locale_t locale); + bool (*wc_iscased) (pg_wchar wc, pg_locale_t locale); pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale); pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale); - - /* required */ - bool (*char_is_cased) (char ch, pg_locale_t locale); }; /* @@ -214,6 +212,7 @@ extern bool pg_iswprint(pg_wchar wc, pg_locale_t locale); extern bool pg_iswpunct(pg_wchar wc, pg_locale_t locale); extern bool pg_iswspace(pg_wchar wc, pg_locale_t locale); extern bool pg_iswxdigit(pg_wchar wc, pg_locale_t locale); +extern bool pg_iswcased(pg_wchar wc, pg_locale_t locale); extern pg_wchar pg_towupper(pg_wchar wc, pg_locale_t locale); extern pg_wchar pg_towlower(pg_wchar wc, pg_locale_t locale); -- 2.43.0