From ccacdfe30614f10a79038df36fab228428335fe1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 15 Dec 2020 11:12:45 +0200 Subject: [PATCH 2/5] Replace pg_utf8_verifystr() with a faster implementation. This inlines the pg_utf8_verifychar() function into the loop. We could do a lot more - there are much faster SIMD and lookup table based algorithms out there - but I'll leave that for another patch. In passing, remove remnants of support for 5- and 6-byte UTF-8 characters. They were considered in very early Unicode versions, but the current Unicode standard limits the number of code points to 17 planes, which are representable in 4 bytes in UTF-8, and there are no plans to ever go beyond that. --- src/common/wchar.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 5ab29bcbc39..403974629f7 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -558,12 +558,6 @@ pg_utf_mblen(const unsigned char *s) len = 3; else if ((*s & 0xf8) == 0xf0) len = 4; -#ifdef NOT_USED - else if ((*s & 0xfc) == 0xf8) - len = 5; - else if ((*s & 0xfe) == 0xfc) - len = 6; -#endif else len = 1; return len; @@ -1764,28 +1758,37 @@ static int pg_utf8_verifystr(const unsigned char *s, int len) { const unsigned char *start = s; + const unsigned char *end = s + len; - while (len > 0) + while (s < end) { - int l; + int l; - /* fast path for ASCII-subset characters */ - if (!IS_HIGHBIT_SET(*s)) + if ((*s & 0x80) == 0) { if (*s == '\0') break; - l = 1; + + s++; + continue; } + else if ((*s & 0xe0) == 0xc0) + l = 2; + else if ((*s & 0xf0) == 0xe0) + l = 3; + else if ((*s & 0xf8) == 0xf0) + l = 4; else - { - l = pg_utf8_verifychar(s, len); - if (l == -1) - break; - } + l = 1; + + if (s + l > end) + break; + + if (!pg_utf8_islegal(s, l)) + break; + s += l; - len -= l; } - return s - start; } @@ -1810,9 +1813,6 @@ pg_utf8_islegal(const unsigned char *source, int length) switch 
(length) { - default: - /* reject lengths 5 and 6 for now */ - return false; case 4: a = source[3]; if (a < 0x80 || a > 0xBF) -- 2.20.1