From 0e223a8702491d6bcd06e55eab4d83be7455b537 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 2 Jun 2021 11:48:12 -0400 Subject: [PATCH v10 1/2] Rewrite pg_utf8_verifystr() for speed Instead of relying on pg_utf8_verifychar() and pg_utf8_isvalid(), rewrite this function in a manner loosely based on the fallback that is part of the simdjson library. Verifying multibyte UTF-8 text is modestly faster, but the biggest improvement is in verifying ASCII, which is now about 7 times faster. --- src/common/wchar.c | 125 ++++++++++++++++++++++- src/test/regress/expected/conversion.out | 52 ++++++++++ src/test/regress/sql/conversion.sql | 28 +++++ 3 files changed, 202 insertions(+), 3 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 6e7d731e02..29eeeae859 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -15,6 +15,47 @@ #include "mb/pg_wchar.h" +/* for UTF-8 */ +#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80) +#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0) +#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0) +#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0) + +/* from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */ +#define HAS_ZERO(chunk) ( \ + ((chunk) - UINT64CONST(0x0101010101010101)) & \ + ~(chunk) & \ + UINT64CONST(0x8080808080808080)) + +/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */ +static inline int +check_ascii(const unsigned char *s, int len) +{ + uint64 half1, + half2, + highbits_set; + + if (len >= 2 * sizeof(uint64)) + { + memcpy(&half1, s, sizeof(uint64)); + memcpy(&half2, s + sizeof(uint64), sizeof(uint64)); + + /* If there are zero bytes, bail and let the slow path handle it. */ + if (HAS_ZERO(half1) || HAS_ZERO(half2)) + return 0; + + /* Check if any bytes in this chunk have the high bit set. 
*/ + highbits_set = ((half1 | half2) & UINT64CONST(0x8080808080808080)); + + if (!highbits_set) + return 2 * sizeof(uint64); + else + return 0; + } + else + return 0; +} + /* * Operations on multi-byte encodings are driven by a table of helper * functions. @@ -1761,24 +1802,102 @@ static int pg_utf8_verifystr(const unsigned char *s, int len) { const unsigned char *start = s; + unsigned char b1, + b2, + b3, + b4; while (len > 0) { int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Chunk too short, or has non-ASCII or zero: verify one character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') break; l = 1; } - else + /* code points U+0080 through U+07FF */ + else if (IS_TWO_BYTE_LEAD(*s)) { - l = pg_utf8_verifychar(s, len); - if (l == -1) + l = 2; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + + if (!IS_CONTINUATION_BYTE(b2)) + break; + + /* check 2-byte overlong: 1100.000x 10xx.xxxx */ + if (b1 < 0xC2) break; } + /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ + else if (IS_THREE_BYTE_LEAD(*s)) + { + l = 3; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3)) + break; + + /* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */ + if (b1 == 0xE0 && b2 < 0xA0) + break; + + /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ + if (b1 == 0xED && b2 > 0x9F) + break; + } + /* code points U+010000 through U+10FFFF */ + else if (IS_FOUR_BYTE_LEAD(*s)) + { + l = 4; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + b4 = *(s + 3); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3) || + !IS_CONTINUATION_BYTE(b4)) + break; + + /* + * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx + */ + if (b1 == 0xF0 && b2 < 0x90) + break; + + /* check too large (> U+10FFFF): 1111.0100 1001.xxxx and above */ + if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) +
break; + } + else + /* invalid byte */ + break; + s += l; len -= l; } diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index 04fdcba496..07ad2577ff 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -72,6 +72,58 @@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + description | result | errorat | error +------------------------------------+------------+--------------+---------------------------------------------------------------- + bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf + missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5 + smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80 + largest 2-byte overlong | \x | \xc1bf | invalid byte 
sequence for encoding "UTF8": 0xc1 0xbf + next 2-byte after overlongs | \xc280 | | + largest 2-byte | \xdfbf | | + missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf + smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80 + largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf + next 3-byte after overlong | \xe0a080 | | + last before surrogates | \xed9fbf | | + smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80 + largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf + next after surrogates | \xee8080 | | + largest 3-byte | \xefbfbf | | + missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf + smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80 + largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf + next 4-byte after overlong | \xf0908080 | | + largest 4-byte | \xf48fbfbf | | + smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80 + 5 byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa +(22 rows) + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 8358682432..cb35112901 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -74,6 +74,34 @@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + 
('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); + +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), -- 2.31.1