From 7b3bc97b7dfae464321c8f4682f5eaf571214045 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 3 Mar 2017 14:25:09 +0900 Subject: [PATCH 09/10] Set of fixes for SASLprep Some variable renames, as well as calculation of Hangul characters are adjusted. Per review from Kyotaro Horiguchi. --- src/common/scram-common.c | 8 +++- src/common/utf_norm.c | 106 +++++++++++++++++++++++++--------------------- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/src/common/scram-common.c b/src/common/scram-common.c index 041cf58f20..b262356325 100644 --- a/src/common/scram-common.c +++ b/src/common/scram-common.c @@ -121,7 +121,9 @@ pg_utf_mblen(const unsigned char *s) * Check validity of the given null-terminated string for UTF-8. * * This routine uses pg_utf_mblen() and pg_utf8_islegal() to check each - * character of the string. + * character of the string. Strings made only of ASCII characters do not + * need to go through SASLprep, so let the caller know as well that the + * string is eligible in this case. */ static bool pg_utf8_check_string(const unsigned char *source) @@ -139,6 +141,10 @@ pg_utf8_check_string(const unsigned char *source) p += l; } + /* ASCII-only strings have no need to go through SASLprep */ + if (l == strlen((const char*) source)) + return false; + return true; } diff --git a/src/common/utf_norm.c b/src/common/utf_norm.c index c953b66e5f..2e7d6264fd 100644 --- a/src/common/utf_norm.c +++ b/src/common/utf_norm.c @@ -4,7 +4,8 @@ * Unicode strings (NFKC, NFKD, NFC and NFD). * * This contains the common low-level routines to perform normalizations - * per documentation here: http://www.unicode.org/reports/tr15/. + * per documentation here: http://www.unicode.org/reports/tr15/, using the + * composition version 3.0. 
* * Portions Copyright (c) 2017, PostgreSQL Global Development Group * @@ -23,6 +24,17 @@ #include "common/utf_norm_table.h" #include "mb/pg_wchar.h" +/* Constants for calculations with Hangul characters */ +#define SBASE 0xAC00 +#define LBASE 0x1100 +#define VBASE 0x1161 +#define TBASE 0x11A7 +#define LCOUNT 19 +#define VCOUNT 21 +#define TCOUNT 28 +#define NCOUNT VCOUNT * TCOUNT +#define SCOUNT LCOUNT * NCOUNT + /* * utf_to_array * @@ -298,14 +310,14 @@ get_decomposed_size(pg_wchar code) * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details * on the matter. */ - if (code >= 0xAC00 && code < 0xD7A4) + if (code >= SBASE && code < SBASE + SCOUNT) { - uint32 t, hindex; + uint32 tindex, sindex; - hindex = code - 0xAC00; - t = hindex % 28; + sindex = code - SBASE; + tindex = sindex % TCOUNT; - if (t != 0) + if (tindex != 0) return 3; return 2; } @@ -344,21 +356,21 @@ static bool recompose_code(uint32 start, uint32 code, uint32 *result) { /* No need to care about ascii characters */ - if (start <= 0xef || code <= 0xef) + if (start <= 0x7f || code <= 0x7f) return false; /* Hangul characters go here */ - if (start >= 0x1100 && start < 0x1113 && - code >= 0x1161 && code < 0x1176) + if (start >= LBASE && start < LBASE + LCOUNT && + code >= VBASE && code < VBASE + VCOUNT) { - *result = ((start - 0x1100) * 21 + code - 0x1161) * 28 + 0xAC00; + *result = ((start - LBASE) * VCOUNT + code - VBASE) * TCOUNT + SBASE; return true; } - else if (start >= 0xAC00 && start < 0xD7A4 && - !((start - 0xAC00) % 28) && - code >= 0x11A8 && code < 0x11C3) + else if (start >= SBASE && start < (SBASE + SCOUNT) && + ((start - SBASE) % TCOUNT) == 0 && + code >= TBASE && code < (TBASE + TCOUNT)) { - *result = start + code - 0x11A7; + *result = start + code - TBASE; return true; } else @@ -406,24 +418,24 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details * on the matter. 
*/ - if (code >= 0xAC00 && code < 0xD7A4) + if (code >= SBASE && code < SBASE + SCOUNT) { - uint32 l, v, t, hindex; + uint32 l, v, tindex, sindex; pg_wchar *res = *result; - hindex = code - 0xAC00; - l = 0x1100 + hindex / (21 * 28); - v = 0x1161 + (hindex % (21 * 28)) / 28; - t = hindex % 28; + sindex = code - SBASE; + l = LBASE + sindex / (VCOUNT * TCOUNT); + v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; + tindex = sindex % TCOUNT; res[*current] = l; (*current)++; res[*current] = v; (*current)++; - if (t != 0) + if (tindex != 0) { - res[*current] = 0x11A7 + t; + res[*current] = TBASE + tindex; (*current)++; } @@ -475,11 +487,10 @@ utf_sasl_prepare(const char *input) int count; char *result; /* variables for recomposition */ - int lastClass; - int starterPos; - int sourceLength; - int targetPos; - uint32 starterCh; + int last_class; + int starter_pos; + int target_pos; + uint32 starter_ch; /* Convert input string into a manipulable array of character integers */ input_chars = utf_to_array((char *) input, &input_size); @@ -527,8 +538,8 @@ utf_sasl_prepare(const char *input) Assert(decomp_size == current_size); /* - * Now that the decomposition is done, apply the combining class - * for each multibyte character. + * Now end the decomposition by applying the combining class for + * each multibyte character. */ for (count = 1; count < decomp_size; count++) { @@ -577,40 +588,39 @@ utf_sasl_prepare(const char *input) * make the allocation of the recomposed string based on that assumption. 
*/ recomp_chars = (pg_wchar *) malloc(decomp_size * sizeof(int)); - lastClass = -1; /* this eliminates a special check */ - starterPos = 0; - sourceLength = decomp_size; - targetPos = 1; - starterCh = recomp_chars[0] = decomp_chars[0]; + last_class = -1; /* this eliminates a special check */ + starter_pos = 0; + target_pos = 1; + starter_ch = recomp_chars[0] = decomp_chars[0]; for (count = 1; count < decomp_size; count++) { pg_wchar ch = decomp_chars[count]; - pg_utf_decomposition *chEntry = get_code_entry(ch); - int chClass = chEntry == NULL ? 0 : chEntry->class; + pg_utf_decomposition *ch_entry = get_code_entry(ch); + int ch_class = ch_entry == NULL ? 0 : ch_entry->class; pg_wchar composite; - bool found_match = recompose_code(starterCh, ch, &composite); + bool found_match = recompose_code(starter_ch, ch, &composite); - if (found_match && lastClass < chClass) + if (found_match && last_class < ch_class) { - recomp_chars[starterPos] = composite; - starterCh = composite; + recomp_chars[starter_pos] = composite; + starter_ch = composite; } - else if (chClass == 0) + else if (ch_class == 0) { - starterPos = targetPos; - starterCh = ch; - lastClass = -1; - recomp_chars[targetPos++] = ch; + starter_pos = target_pos; + starter_ch = ch; + last_class = -1; + recomp_chars[target_pos++] = ch; } else { - lastClass = chClass; - recomp_chars[targetPos++] = ch; + last_class = ch_class; + recomp_chars[target_pos++] = ch; } } - recomp_size = targetPos; + recomp_size = target_pos; /* * Convert the decomposition back to a string, which is the final -- 2.12.0