From 604737aff7fb805ec91e283f0c8b9a257f9039bb Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 21 Aug 2023 16:06:42 +0900 Subject: [PATCH] unaccent: Add support for quoted translated characters --- doc/src/sgml/unaccent.sgml | 16 +++++ contrib/unaccent/Makefile | 2 +- contrib/unaccent/custom_unaccent.rules | 6 ++ contrib/unaccent/expected/unaccent.out | 74 ++++++++++++++++++++ contrib/unaccent/generate_unaccent_rules.py | 4 ++ contrib/unaccent/sql/unaccent.sql | 15 +++++ contrib/unaccent/unaccent--1.1.sql | 5 ++ contrib/unaccent/unaccent.c | 75 ++++++++++++++++++--- contrib/unaccent/unaccent.rules | 56 +++++++-------- 9 files changed, 216 insertions(+), 37 deletions(-) create mode 100644 contrib/unaccent/custom_unaccent.rules diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml index f3ddc64bbc..94100ed260 100644 --- a/doc/src/sgml/unaccent.sgml +++ b/doc/src/sgml/unaccent.sgml @@ -84,6 +84,22 @@ + + + Some characters, like numeric symbols, may require whitespace in their + translation rule. It is possible to use double quotes around the translated + characters in this case. A double quote needs to be escaped with a second + double quote when including one in the translated characters.
For example: + +¼ " 1/4" +½ " 1/2" +¾ " 3/4" +“ """" +” """" + + + + As with other PostgreSQL text search configuration files, diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile index 652a3e774c..27c7b2ca6e 100644 --- a/contrib/unaccent/Makefile +++ b/contrib/unaccent/Makefile @@ -7,7 +7,7 @@ OBJS = \ EXTENSION = unaccent DATA = unaccent--1.1.sql unaccent--1.0--1.1.sql -DATA_TSEARCH = unaccent.rules +DATA_TSEARCH = unaccent.rules custom_unaccent.rules PGFILEDESC = "unaccent - text search dictionary that removes accents" REGRESS = unaccent diff --git a/contrib/unaccent/custom_unaccent.rules b/contrib/unaccent/custom_unaccent.rules new file mode 100644 index 0000000000..d8791dd902 --- /dev/null +++ b/contrib/unaccent/custom_unaccent.rules @@ -0,0 +1,6 @@ +¼ " ""1/4"" " +½ """1/2"" " +¾ " ""3/4""" +ʺ " a """" " +“ " t """ +” """ b " diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index f080707c4a..64402010ce 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -51,6 +51,18 @@ SELECT unaccent('℗'); -- sound recording copyright (P) (1 row) +SELECT unaccent('1½'); -- math expression with whitespace + unaccent +---------- + 1 1/2 +(1 row) + +SELECT unaccent('〝'); -- quote + unaccent +---------- + " +(1 row) + SELECT unaccent('unaccent', 'foobar'); unaccent ---------- @@ -93,6 +105,56 @@ SELECT unaccent('unaccent', '℗'); (P) (1 row) +SELECT unaccent('unaccent', '1½'); + unaccent +---------- + 1 1/2 +(1 row) + +SELECT unaccent('unaccent', '〝'); + unaccent +---------- + " +(1 row) + +-- XXX: Remove later. +-- Just for the sake of checking the parsing logic. 
+SELECT unaccent('custom_unaccent', '1¼'); + unaccent +------------ + 1 "1/4" +(1 row) + +SELECT unaccent('custom_unaccent', '1½'); + unaccent +------------ + 1"1/2" +(1 row) + +SELECT unaccent('custom_unaccent', '1¾'); + unaccent +---------- + 1 "3/4" +(1 row) + +SELECT unaccent('custom_unaccent', 'ʺ'); + unaccent +---------- + a "" +(1 row) + +SELECT unaccent('custom_unaccent', '“'); + unaccent +---------- + t " +(1 row) + +SELECT unaccent('custom_unaccent', '”'); + unaccent +---------- + " b +(1 row) + SELECT ts_lexize('unaccent', 'foobar'); ts_lexize ----------- @@ -135,6 +197,18 @@ SELECT ts_lexize('unaccent', '℗'); {(P)} (1 row) +SELECT ts_lexize('unaccent', '1½'); + ts_lexize +----------- + {"1 1/2"} +(1 row) + +SELECT ts_lexize('unaccent', '〝'); + ts_lexize +----------- + {"\""} +(1 row) + -- Controversial case. Black-Letter Capital H (U+210C) is translated by -- Latin-ASCII.xml as 'x', but it should be 'H'. SELECT unaccent('ℌ'); diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index b4b4c38beb..cffb7db7ce 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -58,6 +58,10 @@ COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA def print_record(codepoint, letter): if letter: + # If the letter has whitespace or double quotes, escape double + # quotes and apply more quotes around it. 
+ if (' ' in letter) or ('"' in letter): + letter = '"' + letter.replace('"', '""') + '"' output = chr(codepoint) + "\t" + letter else: output = chr(codepoint) diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index 663646c1ac..9ef2fc5010 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -20,6 +20,8 @@ SELECT unaccent('˃˖˗˜'); SELECT unaccent('À'); -- Remove combining diacritical 0x0300 SELECT unaccent('℃℉'); -- degree signs SELECT unaccent('℗'); -- sound recording copyright +SELECT unaccent('1½'); -- math expression with whitespace +SELECT unaccent('〝'); -- quote SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'ёлка'); @@ -28,6 +30,17 @@ SELECT unaccent('unaccent', '˃˖˗˜'); SELECT unaccent('unaccent', 'À'); SELECT unaccent('unaccent', '℃℉'); SELECT unaccent('unaccent', '℗'); +SELECT unaccent('unaccent', '1½'); +SELECT unaccent('unaccent', '〝'); + +-- XXX: Remove later. +-- Just for the sake of checking the parsing logic. +SELECT unaccent('custom_unaccent', '1¼'); +SELECT unaccent('custom_unaccent', '1½'); +SELECT unaccent('custom_unaccent', '1¾'); +SELECT unaccent('custom_unaccent', 'ʺ'); +SELECT unaccent('custom_unaccent', '“'); +SELECT unaccent('custom_unaccent', '”'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); @@ -36,6 +49,8 @@ SELECT ts_lexize('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', 'À'); SELECT ts_lexize('unaccent', '℃℉'); SELECT ts_lexize('unaccent', '℗'); +SELECT ts_lexize('unaccent', '1½'); +SELECT ts_lexize('unaccent', '〝'); -- Controversial case. Black-Letter Capital H (U+210C) is translated by -- Latin-ASCII.xml as 'x', but it should be 'H'. 
diff --git a/contrib/unaccent/unaccent--1.1.sql b/contrib/unaccent/unaccent--1.1.sql index ecc8651780..a821e1c37d 100644 --- a/contrib/unaccent/unaccent--1.1.sql +++ b/contrib/unaccent/unaccent--1.1.sql @@ -32,3 +32,8 @@ CREATE TEXT SEARCH DICTIONARY unaccent ( TEMPLATE = unaccent, RULES = 'unaccent' ); + +CREATE TEXT SEARCH DICTIONARY custom_unaccent ( + TEMPLATE = unaccent, + RULES = 'custom_unaccent' +); diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 64c879e547..75fb4032b2 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -127,14 +127,16 @@ initTrie(const char *filename) * src and trg are sequences of one or more non-whitespace * characters, separated by whitespace. Whitespace at start * or end of line is ignored. If trg is omitted, an empty - * string is used as the replacement. + * string is used as the replacement. trg can be optionally + * quoted, in which case whitespace is included in it. * * We use a simple state machine, with states * 0 initial (before src) * 1 in src * 2 in whitespace after src - * 3 in trg - * 4 in whitespace after trg + * 3 in trg (non-quoted) + * 4 in trg (quoted) + * 5 in whitespace after trg * -1 syntax error detected *---------- */ @@ -142,9 +144,12 @@ initTrie(const char *filename) char *ptr; char *src = NULL; char *trg = NULL; + char *trgstore = NULL; int ptrlen; int srclen = 0; int trglen = 0; + int trgstorelen = 0; + bool trgquoted = false; state = 0; for (ptr = line; *ptr; ptr += ptrlen) @@ -156,8 +161,10 @@ initTrie(const char *filename) if (state == 1) state = 2; else if (state == 3) - state = 4; - continue; + state = 5; + /* whitespace is OK in quoted area */ + if (state != 4) + continue; } switch (state) { @@ -173,14 +180,41 @@ initTrie(const char *filename) break; case 2: /* start of trg */ + if (*ptr == '"') + { + trgquoted = true; + state = 4; + } + else + state = 3; + trg = ptr; trglen = ptrlen; - state = 3; break; case 3: - /* continue
non-quoted trg */ trglen += ptrlen; break; + case 4: + /* continue quoted trg */ + trglen += ptrlen; + + /* + * If this is a quote, consider it as the end of + * trg except if the follow-up character is itself + * a quote. + */ + if (*ptr == '"') + { + if (*(ptr + 1) == '"') + { + ptr++; + trglen += 1; + } + else + state = 5; + } + break; default: /* bogus line format */ state = -1; @@ -195,10 +229,35 @@ initTrie(const char *filename) trglen = 0; } + /* If still in a quoted area, fall back to an error */ + if (state == 4) + state = -1; + + /* If a valid trg was quoted, remove its quotes and unescape it */ + if (trgquoted && state > 0) + { + /* Ignore first and end quotes, hence trglen - 2 bytes */ + trgstore = palloc0(sizeof(char) * (trglen - 2)); + trgstorelen = 0; + for (int i = 1; i < trglen - 1; i++) + { + trgstore[trgstorelen] = trg[i]; + trgstorelen++; + /* skip second double quotes */ + if (trg[i] == '"' && trg[i + 1] == '"') + i++; + } + } + else + { + trgstore = trg; + trgstorelen = trglen; + } + if (state > 0) rootTrie = placeChar(rootTrie, (unsigned char *) src, srclen, - trg, trglen); + trgstore, trgstorelen); else if (state < 0) ereport(WARNING, (errcode(ERRCODE_CONFIG_FILE_ERROR), diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 3030166ed6..ca6caa51f5 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -5,9 +5,9 @@ ® (R) ± +/- » >> -¼ 1/4 -½ 1/2 -¾ 3/4 +¼ " 1/4" +½ " 1/2" +¾ " 3/4" ¿ ? À A Á A @@ -403,7 +403,7 @@ ʪ ls ʫ lz ʹ ' -ʺ " +ʺ """" ʻ ' ʼ ' ʽ ' @@ -1058,15 +1058,15 @@ ’ ' ‚ , ‛ ' -“ " -” " +“ """" +” """" „ ,, -‟ " +‟ """" ․ . ‥ .. … ... ′ ' -″ " +″ """" ‹ < › > ‼ !!
@@ -1134,22 +1134,22 @@ ⅇ e ⅈ i ⅉ j -⅐ 1/7 -⅑ 1/9 -⅒ 1/10 -⅓ 1/3 -⅔ 2/3 -⅕ 1/5 -⅖ 2/5 -⅗ 3/5 -⅘ 4/5 -⅙ 1/6 -⅚ 5/6 -⅛ 1/8 -⅜ 3/8 -⅝ 5/8 -⅞ 7/8 -⅟ 1/ +⅐ " 1/7" +⅑ " 1/9" +⅒ " 1/10" +⅓ " 1/3" +⅔ " 2/3" +⅕ " 1/5" +⅖ " 2/5" +⅗ " 3/5" +⅘ " 4/5" +⅙ " 1/6" +⅚ " 5/6" +⅛ " 1/8" +⅜ " 3/8" +⅝ " 5/8" +⅞ " 7/8" +⅟ " 1/" Ⅰ I Ⅱ II Ⅲ III @@ -1182,7 +1182,7 @@ ⅽ c ⅾ d ⅿ m -↉ 0/3 +↉ " 0/3" − - ∕ / ∖ \ @@ -1296,8 +1296,8 @@ 〙 ] 〚 [ 〛 ] -〝 " -〞 " +〝 """" +〞 """" ㍱ hPa ㍲ da ㍳ AU @@ -1512,7 +1512,7 @@ ﹪ % ﹫ @ ! ! -" " +" """" # # $ $ % % -- 2.40.1