From 482df5ff99a40c64f1d87b08b41eac0206082eef Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Sun, 18 Jul 2021 17:14:32 -0400
Subject: [PATCH v18 1/6] Use pure DFA

Based on https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
---
 src/common/wchar.c                       | 197 +++++++++++++++++++++--
 src/test/regress/expected/conversion.out |  85 +++++++++++
 src/test/regress/sql/conversion.sql      |  57 +++++++
 3 files changed, 326 insertions(+), 13 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..aafc602bcd 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1757,32 +1757,203 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 	return l;
 }
 
+/*
+ * Possible transition states for the UTF-8 DFA. Each state is represented
+ * by its bit offset (a multiple of DFA_BITS_PER_STATE) into a 64-bit
+ * transition word, so the next state is obtained with a shift and a mask.
+ */
+
+#define DFA_BITS_PER_STATE 6
+#define DFA_MASK ((1 << DFA_BITS_PER_STATE) - 1)
+
+/* Start */
+#define BGN UINT64CONST(0)
+/* Invalid sequence */
+#define ERR (UINT64CONST(1) * DFA_BITS_PER_STATE)
+/* Continuation states */
+#define CS1 (UINT64CONST(2) * DFA_BITS_PER_STATE)
+#define CS2 (UINT64CONST(3) * DFA_BITS_PER_STATE)
+#define CS3 (UINT64CONST(4) * DFA_BITS_PER_STATE)
+/* Partial 3-byte sequence states */
+#define P3A (UINT64CONST(5) * DFA_BITS_PER_STATE)
+#define P3B (UINT64CONST(6) * DFA_BITS_PER_STATE)
+/* Partial 4-byte sequence states */
+#define P4A (UINT64CONST(7) * DFA_BITS_PER_STATE)
+#define P4B (UINT64CONST(8) * DFA_BITS_PER_STATE)
+/* Start and End are the same state */
+#define END BGN
+
+/*
+ * The DFA transition table would look like this if encoded as an array
+ * (ERR is lower case for readability).
+ *
+ * ILL  NZA  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C  CLASS / STATE
+ * =========================================================================
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, // BGN|END
+ * err, err, err, err, err, err, err, err, err, err, err, err, // ERR
+ *
+ * err, err, END, END, END, err, err, err, err, err, err, err, // CS1
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, // CS2
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, // CS3
+ *
+ * err, err, err, err, CS1, err, err, err, err, err, err, err, // P3A
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err, // P3B
+ *
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err, // P4A
+ * err, err, CS2, err, err, err, err, err, err, err, err, err, // P4B
+ */
+
+/* Encode each transition within DFA_BITS_PER_STATE-sized sequences of bits. */
+
+#define ERR_ON_ALL_NON_BGN_STATES ((ERR << ERR) | (ERR << CS1) | (ERR << CS2) | (ERR << CS3) | (ERR << P3A) | (ERR << P3B) | (ERR << P4A) | (ERR << P4B))
+
+/* 00, C0..C1, F5..FF Invalid bytes that never appear in a UTF-8 sequence */
+#define ILL (ERR | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 01..7F Non-zero ASCII */
+#define NZA (END | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 80..8F Continuation range 1 */
+#define CR1 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (ERR << P4A) | (CS2 << P4B))
+
+/* 90..9F Continuation range 2 */
+#define CR2 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (CS2 << P4A) | (ERR << P4B))
+
+/* A0..BF Continuation range 3 */
+#define CR3 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (ERR << P3B) | (CS2 << P4A) | (ERR << P4B))
+
+/* C2..DF 2-byte lead */
+#define L2A (CS1 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* E0 3-byte lead range A */
+#define L3A (P3A | ERR_ON_ALL_NON_BGN_STATES)
+
+/* E1..EC, EE..EF 3-byte lead range B */
+#define L3B (CS2 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* ED 3-byte lead range C */
+#define L3C (P3B | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F0 4-byte lead range A */
+#define L4A (P4A | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F1..F3 4-byte lead range B */
+#define L4B (CS3 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F4 4-byte lead range C */
+#define L4C (P4B | ERR_ON_ALL_NON_BGN_STATES)
+
+/*
+ * Maps an input byte to a 64-bit integer that encodes, for every current
+ * state, the state the DFA moves to on that byte.
+ */
+#define REP16(a) a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a
+static const uint64 ByteCategory[256] =
+{
+	/* ASCII */
+
+	ILL, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	NZA, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+
+	/* continuation bytes */
+
+	/* 80..8F */
+	REP16(CR1),
+
+	/* 90..9F */
+	REP16(CR2),
+
+	/* A0..BF */
+	REP16(CR3), REP16(CR3),
+
+	/* leading bytes */
+
+	/* C0..CF */
+	ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* D0..DF */
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* E0..EF */
+	L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
+	L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
+
+	/* F0..FF */
+	L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
+	ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL,
+};
+
+
+/*
+ * Run the DFA over a single character starting at s.
+ *
+ * Returns the character's length in bytes, or -1 if the bytes do not form
+ * a valid UTF-8 sequence (a NUL byte is treated as invalid). No length is
+ * passed in: up to 4 bytes may be read, so the caller must ensure that
+ * many are available.
+ */
+static inline int
+utf8_advance(const unsigned char *s)
+{
+	uint64		class;
+	uint64		state = BGN;
+	int			l = 0;
+
+	do
+	{
+		class = ByteCategory[*s++];
+		state = (class >> state) & DFA_MASK;
+		l++;
+	} while (state > ERR);
+
+	if (state == ERR)
+		return -1;
+
+	Assert(l <= 4);
+	return l;
+}
+
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 
+	/*
+	 * fast path when we have enough bytes left in the string to cover all
+	 * valid UTF-8 sequences
+	 */
+	while (len >= 4)
+	{
+		int			l;
+
+		l = utf8_advance(s);
+		if (l == -1)
+			goto end;
+
+		s += l;
+		len -= l;
+	}
+
+	/* handle last few bytes */
 	while (len > 0)
 	{
 		int			l;
 
-		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
-		{
-			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
-		}
+		l = pg_utf8_verifychar(s, len);
+		if (l == -1)
+			goto end;
+
 		s += l;
 		len -= l;
 	}
 
+end:
 	return s - start;
 }
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..e4ab9fe765 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+  ('\xaf', 'bare continuation'),
+  ('\xc5', 'missing second byte in 2-byte char'),
+  ('\xc080', 'smallest 2-byte overlong'),
+  ('\xc1bf', 'largest 2-byte overlong'),
+  ('\xc280', 'next 2-byte after overlongs'),
+  ('\xdfbf', 'largest 2-byte'),
+  ('\xe9af', 'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080', 'smallest 4-byte overlong'),
+  ('\xf08fbfbf', 'largest 4-byte overlong'),
+  ('\xf0908080', 'next 4-byte after overlong'),
+  ('\xf48fbfbf', 'largest 4-byte'),
+  ('\xf4908080', 'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte'),
+  ('\x66006f', 'NUL byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+ NUL byte                           | \x66       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs values
   ('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 8358682432..e5a7e47958 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+  ('\xaf', 'bare continuation'),
+  ('\xc5', 'missing second byte in 2-byte char'),
+  ('\xc080', 'smallest 2-byte overlong'),
+  ('\xc1bf', 'largest 2-byte overlong'),
+  ('\xc280', 'next 2-byte after overlongs'),
+  ('\xdfbf', 'largest 2-byte'),
+  ('\xe9af', 'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080', 'smallest 4-byte overlong'),
+  ('\xf08fbfbf', 'largest 4-byte overlong'),
+  ('\xf0908080', 'next 4-byte after overlong'),
+  ('\xf48fbfbf', 'largest 4-byte'),
+  ('\xf4908080', 'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte'),
+  ('\x66006f', 'NUL byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs values
   ('\x666f6f', 'valid, pure ASCII'),
-- 
2.31.1