From 8161ca49ae2044e004d3f36c04f60b03e97f4071 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 19 Nov 2025 13:24:38 -0800 Subject: [PATCH v13 1/2] fuzzystrmatch: use pg_ascii_toupper(). fuzzystrmatch is designed for ASCII, so no need to rely on the global LC_CTYPE setting. TODO: what about \xc7 case? Also, what should the behavior be for soundex()? Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com --- contrib/fuzzystrmatch/dmetaphone.c | 45 +++++++++++++++++++++++++-- contrib/fuzzystrmatch/fuzzystrmatch.c | 43 ++++++++++++++----------- 2 files changed, 67 insertions(+), 21 deletions(-) diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c index 227d8b11ddc..9a4e5ae7e0e 100644 --- a/contrib/fuzzystrmatch/dmetaphone.c +++ b/contrib/fuzzystrmatch/dmetaphone.c @@ -98,6 +98,7 @@ The remaining code is authored by Andrew Dunstan and #include "postgres.h" +#include "mb/pg_wchar.h" #include "utils/builtins.h" /* turn off assertions for embedded function */ @@ -116,6 +117,9 @@ The remaining code is authored by Andrew Dunstan and #include <assert.h> #include <ctype.h> +#define SMALL_LETTER_C_WITH_CEDILLA '\xe7' +#define CAPITAL_LETTER_C_WITH_CEDILLA '\xc7' + /* prototype for the main function we got from the perl module */ static void DoubleMetaphone(char *str, char **codes); @@ -282,9 +286,46 @@ static void MakeUpper(metastring *s) { char *i; + bool c_with_cedilla; + + /* + * C WITH CEDILLA should be uppercased, as well. + * + * XXX: Only works in single-byte encodings that encode lowercase C WITH + * CEDILLA as \xe7. Should have proper multibyte support. + * + * NB: WIN1256 encodes only the lowercase C WITH CEDILLA, but for the + * purposes of metaphone, we can still "uppercase" it to \xc7 here so that + * it's recognized later. 
+ */ + switch (GetDatabaseEncoding()) + { + case PG_LATIN1: + case PG_LATIN2: + case PG_LATIN3: + case PG_LATIN5: + case PG_LATIN8: + case PG_LATIN9: + case PG_LATIN10: + case PG_WIN1250: + case PG_WIN1252: + case PG_WIN1254: + case PG_WIN1256: + case PG_WIN1258: + c_with_cedilla = true; + break; + default: + c_with_cedilla = false; + break; + } for (i = s->str; *i; i++) - *i = toupper((unsigned char) *i); + { + if (c_with_cedilla && *i == SMALL_LETTER_C_WITH_CEDILLA) + *i = CAPITAL_LETTER_C_WITH_CEDILLA; + else + *i = pg_ascii_toupper((unsigned char) *i); + } } @@ -463,7 +504,7 @@ DoubleMetaphone(char *str, char **codes) current += 1; break; - case '\xc7': /* C with cedilla */ + case CAPITAL_LETTER_C_WITH_CEDILLA: MetaphAdd(primary, "S"); MetaphAdd(secondary, "S"); current += 1; diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index e7cc314b763..319302af0e4 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202"; static char soundex_code(char letter) { - letter = toupper((unsigned char) letter); + letter = pg_ascii_toupper((unsigned char) letter); /* Defend against non-ASCII letters */ if (letter >= 'A' && letter <= 'Z') return soundex_table[letter - 'A']; @@ -122,16 +122,21 @@ static const char _codes[26] = { static int getcode(char c) { - if (isalpha((unsigned char) c)) - { - c = toupper((unsigned char) c); - /* Defend against non-ASCII letters */ - if (c >= 'A' && c <= 'Z') - return _codes[c - 'A']; - } + c = pg_ascii_toupper((unsigned char) c); + /* Defend against non-ASCII letters */ + if (c >= 'A' && c <= 'Z') + return _codes[c - 'A']; + return 0; } +static bool +ascii_isalpha(char c) +{ + return (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z'); +} + #define isvowel(c) (getcode(c) & 1) /* AEIOU */ /* These letters are passed through unchanged */ @@ -301,18 +306,18 @@ 
metaphone(PG_FUNCTION_ARGS) * accessing the array directly... */ /* Look at the next letter in the word */ -#define Next_Letter (toupper((unsigned char) word[w_idx+1])) +#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1])) /* Look at the current letter in the word */ -#define Curr_Letter (toupper((unsigned char) word[w_idx])) +#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx])) /* Go N letters back. */ #define Look_Back_Letter(n) \ - (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') + (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ #define Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ #define After_Next_Letter \ - (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') -#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n)) + (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0') +#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n)) /* Allows us to safely look ahead an arbitrary # of letters */ @@ -340,7 +345,7 @@ Lookahead(char *word, int how_far) #define Phone_Len (p_idx) /* Note is a letter is a 'break' in the word */ -#define Isbreak(c) (!isalpha((unsigned char) (c))) +#define Isbreak(c) (!ascii_isalpha((unsigned char) (c))) static void @@ -379,7 +384,7 @@ _metaphone(char *word, /* IN */ /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ - for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++) + for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++) { /* On the off chance we were given nothing but crap... 
*/ if (Curr_Letter == '\0') @@ -478,7 +483,7 @@ _metaphone(char *word, /* IN */ */ /* Ignore non-alphas */ - if (!isalpha((unsigned char) (Curr_Letter))) + if (!ascii_isalpha((unsigned char) (Curr_Letter))) continue; /* Drop duplicates, except CC */ @@ -731,7 +736,7 @@ _soundex(const char *instr, char *outstr) Assert(outstr); /* Skip leading non-alphabetic characters */ - while (*instr && !isalpha((unsigned char) *instr)) + while (*instr && !ascii_isalpha((unsigned char) *instr)) ++instr; /* If no string left, return all-zeroes buffer */ @@ -742,12 +747,12 @@ _soundex(const char *instr, char *outstr) } /* Take the first letter as is */ - *outstr++ = (char) toupper((unsigned char) *instr++); + *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++); count = 1; while (*instr && count < SOUNDEX_LEN) { - if (isalpha((unsigned char) *instr) && + if (ascii_isalpha((unsigned char) *instr) && soundex_code(*instr) != soundex_code(*(instr - 1))) { *outstr = soundex_code(*instr); -- 2.43.0