From 29249d17b5580e87365c008b18c83db5528db37e Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 9 Nov 2022 08:58:08 -0800 Subject: [PATCH v3 1/5] Introduce pg_strcoll() and pg_strncoll(). Hide the special cases of the platform, collation provider, and database encoding in pg_locale.c. Simplify varlena.c. --- src/backend/utils/adt/pg_locale.c | 263 ++++++++++++++++++++++++++++++ src/backend/utils/adt/varlena.c | 230 +------------------------- src/include/utils/pg_locale.h | 3 + 3 files changed, 268 insertions(+), 228 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 2b42d9ccd8..94dc64c2d0 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -79,6 +79,12 @@ #include <shlwapi.h> #endif +/* + * This should be large enough that most strings will fit, but small enough + * that we feel comfortable putting it on the stack + */ +#define TEXTBUFLEN 1024 + #define MAX_L10N_DATA 80 @@ -1731,6 +1737,263 @@ get_collation_actual_version(char collprovider, const char *collcollate) return collversion; } +/* + * win32_utf8_wcscoll + * + * Convert UTF8 arguments to wide characters and invoke wcscoll() or + * wcscoll_l(). 
+ */ +#ifdef WIN32 +static int +win32_utf8_wcscoll(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + char a1buf[TEXTBUFLEN]; + char a2buf[TEXTBUFLEN]; + char *a1p, + *a2p; + int a1len; + int a2len; + int r; + int result; + + if (len1 >= TEXTBUFLEN / 2) + { + a1len = len1 * 2 + 2; + a1p = palloc(a1len); + } + else + { + a1len = TEXTBUFLEN; + a1p = a1buf; + } + if (len2 >= TEXTBUFLEN / 2) + { + a2len = len2 * 2 + 2; + a2p = palloc(a2len); + } + else + { + a2len = TEXTBUFLEN; + a2p = a2buf; + } + + /* API does not work for zero-length input */ + if (len1 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, + (LPWSTR) a1p, a1len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a1p)[r] = 0; + + if (len2 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, + (LPWSTR) a2p, a2len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a2p)[r] = 0; + + errno = 0; +#ifdef HAVE_LOCALE_T + if (locale) + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + else +#endif + result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); + if (result == 2147483647) /* _NLSCMPERROR; missing from mingw + * headers */ + ereport(ERROR, + (errmsg("could not compare Unicode strings: %m"))); + + if (a1p != a1buf) + pfree(a1p); + if (a2p != a2buf) + pfree(a2p); + + return result; +} +#endif /* WIN32 */ + +/* + * Collate using the libc provider. Arguments must be nul-terminated. 
+ */ +static int +pg_collate_libc(const char *arg1, const char *arg2, pg_locale_t locale) +{ + int result; + +#ifdef WIN32 + /* Win32 does not have UTF-8, so we need to map to UTF-16 */ + if (GetDatabaseEncoding() == PG_UTF8) + { + size_t len1 = strlen(arg1); + size_t len2 = strlen(arg2); + result = win32_utf8_wcscoll(arg1, len1, arg2, len2, locale); + } + else +#endif /* WIN32 */ + if (locale) + { +#ifdef HAVE_LOCALE_T + result = strcoll_l(arg1, arg2, locale->info.lt); +#else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +#endif + } + else + result = strcoll(arg1, arg2); + + return result; +} + +/* + * Collate using the icu provider. + */ +static int +pg_collate_icu(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ +#ifdef USE_ICU + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); + +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else +#endif + { + int32_t ulen1, + ulen2; + UChar *uchar1, + *uchar2; + + ulen1 = icu_to_uchar(&uchar1, arg1, len1); + ulen2 = icu_to_uchar(&uchar2, arg2, len2); + + result = ucol_strcoll(locale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + + pfree(uchar1); + pfree(uchar2); + } + + return result; +#else /* not USE_ICU */ + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +#endif /* not USE_ICU */ +} + +/* + * pg_strcoll + * + * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), + * or wcscoll_l() as appropriate for the given locale, platform, and database + * encoding. If the locale is not specified, use the database collation. + * + * Arguments must be encoded in the database encoding and nul-terminated. 
+ * + * If the collation is deterministic, break ties with strcmp(). + */ +int +pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) +{ + int result; + + if (locale && locale->provider == COLLPROVIDER_ICU) + { + size_t len1 = strlen(arg1); + size_t len2 = strlen(arg2); + result = pg_collate_icu(arg1, len1, arg2, len2, locale); + } + else + { + result = pg_collate_libc(arg1, arg2, locale); + } + + /* Break tie if necessary. */ + if (result == 0 && (!locale || locale->deterministic)) + result = strcmp(arg1, arg2); + + return result; +} + +/* + * pg_strncoll + * + * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), + * or wcscoll_l() as appropriate for the given locale, platform, and database + * encoding. If the locale is not specified, use the database collation. + * + * Arguments must be encoded in the database encoding. + * + * If the collation is deterministic, break ties with memcmp(), and then with + * the string length. + */ +int +pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + int result; + + if (locale && locale->provider == COLLPROVIDER_ICU) + { + result = pg_collate_icu(arg1, len1, arg2, len2, locale); + } + else + { + char *arg1n = palloc(len1 + 1); + char *arg2n = palloc(len2 + 1); + + /* nul-terminate arguments */ + memcpy(arg1n, arg1, len1); + arg1n[len1] = '\0'; + memcpy(arg2n, arg2, len2); + arg2n[len2] = '\0'; + + result = pg_collate_libc(arg1n, arg2n, locale); + + pfree(arg1n); + pfree(arg2n); + } + + /* Break tie if necessary. */ + if (result == 0 && (!locale || locale->deterministic)) + { + result = memcmp(arg1, arg2, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? 
-1 : 1; + } + + return result; +} #ifdef USE_ICU /* diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index c5e7ee7ca2..c904bc0825 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1535,10 +1535,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) } else { - char a1buf[TEXTBUFLEN]; - char a2buf[TEXTBUFLEN]; - char *a1p, - *a2p; pg_locale_t mylocale; mylocale = pg_newlocale_from_collation(collid); @@ -1555,171 +1551,7 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) return 0; -#ifdef WIN32 - /* Win32 does not have UTF-8, so we need to map to UTF-16 */ - if (GetDatabaseEncoding() == PG_UTF8 - && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC)) - { - int a1len; - int a2len; - int r; - - if (len1 >= TEXTBUFLEN / 2) - { - a1len = len1 * 2 + 2; - a1p = palloc(a1len); - } - else - { - a1len = TEXTBUFLEN; - a1p = a1buf; - } - if (len2 >= TEXTBUFLEN / 2) - { - a2len = len2 * 2 + 2; - a2p = palloc(a2len); - } - else - { - a2len = TEXTBUFLEN; - a2p = a2buf; - } - - /* stupid Microsloth API does not work for zero-length input */ - if (len1 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, - (LPWSTR) a1p, a1len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a1p)[r] = 0; - - if (len2 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, - (LPWSTR) a2p, a2len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a2p)[r] = 0; - - errno = 0; -#ifdef HAVE_LOCALE_T - if (mylocale) - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt); - else -#endif - result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); - if (result == 2147483647) /* _NLSCMPERROR; missing from mingw - * headers */ 
- ereport(ERROR, - (errmsg("could not compare Unicode strings: %m"))); - - /* Break tie if necessary. */ - if (result == 0 && - (!mylocale || mylocale->deterministic)) - { - result = memcmp(arg1, arg2, Min(len1, len2)); - if ((result == 0) && (len1 != len2)) - result = (len1 < len2) ? -1 : 1; - } - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); - - return result; - } -#endif /* WIN32 */ - - if (len1 >= TEXTBUFLEN) - a1p = (char *) palloc(len1 + 1); - else - a1p = a1buf; - if (len2 >= TEXTBUFLEN) - a2p = (char *) palloc(len2 + 1); - else - a2p = a2buf; - - memcpy(a1p, arg1, len1); - a1p[len1] = '\0'; - memcpy(a2p, arg2, len2); - a2p[len2] = '\0'; - - if (mylocale) - { - if (mylocale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, arg1, len1); - ulen2 = icu_to_uchar(&uchar2, arg2, len2); - - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(a1p, a2p, mylocale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif - } - } - else - result = strcoll(a1p, a2p); - - /* Break tie if necessary. 
*/ - if (result == 0 && - (!mylocale || mylocale->deterministic)) - result = strcmp(a1p, a2p); - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); + result = pg_strncoll(arg1, len1, arg2, len2, mylocale); } return result; @@ -2377,65 +2209,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) return sss->last_returned; } - if (sss->locale) - { - if (sss->locale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, a1p, len1); - ulen2 = icu_to_uchar(&uchar2, a2p, len2); - - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif - } - } - else - result = strcoll(sss->buf1, sss->buf2); - - /* Break tie if necessary. 
*/ - if (result == 0 && - (!sss->locale || sss->locale->deterministic)) - result = strcmp(sss->buf1, sss->buf2); + result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); /* Cache result, perhaps saving an expensive strcoll() call next time */ sss->cache_blob = false; diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index a875942123..bf70ae08ca 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -100,6 +100,9 @@ extern void make_icu_collator(const char *iculocstr, extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); +extern int pg_strncoll(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); -- 2.34.1