src/common/wchar.c | 116 ++++++++++++++++++++++++++------- src/include/port/pg_utf8.h | 74 +++++++++++++++++++++ src/port/Makefile | 13 +++- src/port/pg_utf8_choose.c | 65 +++++++++++++++++++ src/port/pg_utf8_fallback.c | 153 ++++++++++++++++++++++++++++++++++++++++++++ src/port/pg_utf8_sse42.c | 29 +++++++++ 6 files changed, 425 insertions(+), 25 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 6e7d731e02..742957e67e 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -13,6 +13,7 @@ #include "c.h" #include "mb/pg_wchar.h" +#include "port/pg_utf8.h" /* @@ -1189,6 +1190,15 @@ pg_eucjp_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1247,6 +1257,15 @@ pg_euckr_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1330,6 +1349,15 @@ pg_euctw_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1383,6 +1411,15 @@ pg_johab_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. 
*/ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1433,6 +1470,15 @@ pg_mule_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1502,6 +1548,15 @@ pg_sjis_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1551,6 +1606,15 @@ pg_big5_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1600,6 +1664,15 @@ pg_gbk_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1649,6 +1722,15 @@ pg_uhc_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1709,6 +1791,15 @@ pg_gb18030_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. 
*/ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1760,30 +1851,7 @@ pg_utf8_verifychar(const unsigned char *s, int len) static int pg_utf8_verifystr(const unsigned char *s, int len) { - const unsigned char *start = s; - - while (len > 0) - { - int l; - - /* fast path for ASCII-subset characters */ - if (!IS_HIGHBIT_SET(*s)) - { - if (*s == '\0') - break; - l = 1; - } - else - { - l = pg_utf8_verifychar(s, len); - if (l == -1) - break; - } - s += l; - len -= l; - } - - return s - start; + return pg_validate_utf8(s, len); } /* diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h new file mode 100644 index 0000000000..b0e0939e43 --- /dev/null +++ b/src/include/port/pg_utf8.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * pg_utf8.h + * Routines for fast validation of UTF-8 text. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/pg_utf8.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_UTF8_H +#define PG_UTF8_H + + +#if defined(USE_SSE42_CRC32C) +/* Use Intel SSE4.2 instructions. */ +extern int pg_validate_utf8_sse42(const unsigned char *s, int len); + +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) +/* + * Use Intel SSE 4.2 instructions, but perform a runtime check first + * to check that they are available. 
+ */
+extern int (*pg_validate_utf8) (const unsigned char *s, int len);
+extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+#else
+extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+#endif							/* USE_SSE42_CRC32C */
+
+
+/* from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
+#define HAS_ZERO(chunk) ( \
+	((chunk) - UINT64CONST(0x0101010101010101)) & \
+	~(chunk) & \
+	UINT64CONST(0x8080808080808080))
+
+/*
+ * Fast path for runs of ASCII: if at least 2 * sizeof(uint64) bytes remain
+ * at "s" and every byte of that chunk is nonzero with the high bit clear,
+ * return the chunk length. Otherwise return 0, which tells the caller to
+ * verify a single character the slow way.
+ */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	const int	chunk_len = (int) (2 * sizeof(uint64));
+	uint64		half1, half2;
+
+	/* Compare as signed ints so that a negative len cannot pass. */
+	if (len < chunk_len)
+		return 0;
+
+	/* "s" may be unaligned, so load both halves with memcpy. */
+	memcpy(&half1, s, sizeof(uint64));
+	memcpy(&half2, s + sizeof(uint64), sizeof(uint64));
+
+	/* If there are any zero bytes, let the slow path handle them. */
+	if (HAS_ZERO(half1) || HAS_ZERO(half2))
+		return 0;
+
+	/* Likewise if any byte in this chunk has the high bit set. */
+	if ((half1 | half2) & UINT64CONST(0x8080808080808080))
+		return 0;
+
+	return chunk_len;
+}
+
+#endif							/* PG_UTF8_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index e41b005c4f..bd33d500c5 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -59,7 +59,13 @@ OBJS = \
 	snprintf.o \
 	strerror.o \
 	tar.o \
-	thread.o
+	thread.o \
+	pg_utf8_sse42.o \
+	pg_utf8_fallback.o \
+	pg_utf8_choose.o
+
+# FIXME --^
+# we need something like $(PG_SSE42_OBJS)
 
 # libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files
 # foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c
@@ -88,6 +94,11 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
+# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
+pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_choose.c b/src/port/pg_utf8_choose.c
new file mode 100644
index 0000000000..4dd80c2189
--- /dev/null
+++ b/src/port/pg_utf8_choose.c
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_choose.c
+ *	  Choose between Intel SSE 4.2 and fallback implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel SSE
+ * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
+ * fall back to the pure C implementation which has a fast path for ASCII
+ * text.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_utf8.h"
+
+static bool
+pg_utf8_sse42_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 20)) != 0;	/* CPUID leaf 1, ECX bit 20: SSE 4.2 */
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static int
+pg_validate_utf8_choose(const unsigned char *s, int len)
+{
+	if (pg_utf8_sse42_available())
+		/* FIXME: install pg_validate_utf8_sse42 once it is implemented */
+		pg_validate_utf8 = pg_validate_utf8_fallback;
+	else
+		pg_validate_utf8 = pg_validate_utf8_fallback;
+
+	return pg_validate_utf8(s, len);
+}
+
+int			(*pg_validate_utf8) (const unsigned char *s, int len) = pg_validate_utf8_choose;
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c
new file mode 100644
index 0000000000..113534c2ec
--- /dev/null
+++ b/src/port/pg_utf8_fallback.c
@@ -0,0 +1,168 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_fallback.c
+ *	  Validate UTF-8 with a fast path for the ASCII subset.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_fallback.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include "port/pg_utf8.h"
+
+
+/* Hex masks are used below because 0b binary literals are not standard C99. */
+#define IS_CONTINUATION_BYTE(c)	(((c) & 0xC0) == 0x80)
+
+/*
+ * Validate up to "len" bytes at "s" as UTF-8 and return the length of the
+ * valid prefix. Validation stops at the first zero byte and at the first
+ * malformed, overlong, surrogate, out-of-range, or truncated sequence.
+ *
+ * See the comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_fallback(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	unsigned char b1, b2, b3, b4;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else if ((*s & 0xE0) == 0xC0)
+		{
+			/* 2-byte sequence: 110x.xxxx 10xx.xxxx */
+			l = 2;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+
+			if (!IS_CONTINUATION_BYTE(b2))
+				break;
+
+			/* check 2-byte overlong: 1100.000x.10xx.xxxx */
+			if (b1 < 0xC2)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			{
+				uint32		code_point = (b1 & 0x1F) << 6 |
+				(b2 & 0x3F);
+
+				Assert(code_point >= 0x80 && code_point <= 0x7FF);
+			}
+#endif
+		}
+		else if ((*s & 0xF0) == 0xE0)
+		{
+			/* 3-byte sequence: 1110.xxxx 10xx.xxxx 10xx.xxxx */
+			l = 3;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3))
+				break;
+
+			/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+			if (b1 == 0xE0 && b2 < 0xA0)
+				break;
+
+			/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+			if (b1 == 0xED && b2 > 0x9F)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			{
+				uint32		code_point = (b1 & 0x0F) << 12 |
+				(b2 & 0x3F) << 6 |
+				(b3 & 0x3F);
+
+				Assert((code_point >= 0x0800 && code_point <= 0xD7FF) ||
+					   (code_point >= 0xE000 && code_point <= 0xFFFF));
+			}
+#endif
+		}
+		else if ((*s & 0xF8) == 0xF0)
+		{
+			/* 4-byte sequence: 1111.0xxx 10xx.xxxx 10xx.xxxx 10xx.xxxx */
+			l = 4;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+			b4 = *(s + 3);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3) ||
+				!IS_CONTINUATION_BYTE(b4))
+				break;
+
+			/*
+			 * check 4-byte overlong:
+			 * 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+			 */
+			if (b1 == 0xF0 && b2 < 0x90)
+				break;
+
+			/*
+			 * check too large (above U+10FFFF): first byte 0xF4 with a
+			 * second byte above 0x8F, or any first byte above 0xF4
+			 */
+			if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			{
+				uint32		code_point = (b1 & 0x07) << 18 |
+				(b2 & 0x3F) << 12 |
+				(b3 & 0x3F) << 6 |
+				(b4 & 0x3F);
+
+				Assert(code_point >= 0x010000 && code_point <= 0x10FFFF);
+			}
+#endif
+		}
+		else
+			/* We may have a bare continuation or large byte. */
+			break;
+
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_sse42.c
new file mode 100644
index 0000000000..30bd9769b6
--- /dev/null
+++ b/src/port/pg_utf8_sse42.c
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42.c
+ *	  Validate UTF-8 with Intel SSE 4.2 instructions.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_sse42.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+#include "mb/pg_wchar.h"
+#include "port/pg_utf8.h"
+
+/*
+ * TODO: placeholder until the SSE 4.2 implementation is written. The
+ * chooser in pg_utf8_choose.c installs the fallback on every CPU for now,
+ * so this must not be reached; without assertions it reports 0 valid bytes.
+ */
+int
+pg_validate_utf8_sse42(const unsigned char *s, int len)
+{
+	Assert(0);
+	return 0;
+}