diff -ruN postgresql-7.5-old/src/backend/utils/mb/wchar.c postgresql-7.5/src/backend/utils/mb/wchar.c
--- postgresql-7.5-old/src/backend/utils/mb/wchar.c	2004-08-06 22:44:26.000000000 +1000
+++ postgresql-7.5/src/backend/utils/mb/wchar.c	2004-08-07 01:19:39.000000000 +1000
@@ -406,8 +406,14 @@
 		len = 1;
 	else if ((*s & 0xe0) == 0xc0)
 		len = 2;
-	else if ((*s & 0xe0) == 0xe0)
-		len = 3;
+	else if ((*s & 0xf0) == 0xe0)
+		len = 3;
+	else if ((*s & 0xf8) == 0xf0)
+		len = 4;
+	else if ((*s & 0xfc) == 0xf8)
+		len = 5;
+	else if ((*s & 0xfe) == 0xfc)
+		len = 6;
 	return (len);
 }
 
@@ -801,6 +801,93 @@
 
 #ifndef FRONTEND
 
+/* --------------------------------------------------------------------- */
+
+/*
+ * isLegalUTF8
+ *		Tell whether the bytes at "source" start with one legal UTF-8
+ *		encoded character.
+ *
+ * source points at the first byte of the character; srclen is the number
+ * of bytes available there.  Returns true when the character is complete
+ * and well formed, false otherwise.
+ *
+ * Besides checking that every trailing byte is in 0x80..0xBF, this rejects
+ * overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0
+ * followed by < 0x90), UTF-16 surrogates (0xED followed by > 0x9F), code
+ * points above U+10FFFF (0xF4 followed by > 0x8F, or a lead byte > 0xF4),
+ * lead bytes in 0x80..0xBF, and sequences that pg_utf_mblen reports as
+ * longer than 4 bytes or longer than srclen.
+ */
+static unsigned char
+isLegalUTF8(const unsigned char *source, int srclen)
+{
+	const unsigned char *srcptr;
+	unsigned char a;
+	int			length;
+
+	/* don't look at *source unless there is at least one byte */
+	if (srclen <= 0)
+		return false;
+
+	length = pg_utf_mblen(source);
+	if (length > srclen || length > 4)
+		return false;
+
+	/* walk backwards from the last byte of the character */
+	srcptr = source + length;
+	switch (length)
+	{
+		default:
+			return false;
+			/* Everything else falls through when "true"... */
+		case 4:
+			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+				return false;
+			/* FALLTHROUGH */
+		case 3:
+			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+				return false;
+			/* FALLTHROUGH */
+		case 2:
+			if ((a = (*--srcptr)) > 0xBF)
+				return false;
+			/* a is the second byte; its legal range depends on the first */
+			switch (*source)
+			{
+					/* no fall-through in this inner switch */
+				case 0xE0:
+					if (a < 0xA0)
+						return false;
+					break;
+				case 0xED:
+					/* U+D800..U+DFFF are surrogates, not characters */
+					if (a > 0x9F)
+						return false;
+					break;
+				case 0xF0:
+					if (a < 0x90)
+						return false;
+					break;
+				case 0xF4:
+					if (a > 0x8F)
+						return false;
+					break;
+				default:
+					if (a < 0x80)
+						return false;
+					break;
+			}
+			/* FALLTHROUGH */
+		case 1:
+			if (*source >= 0x80 && *source < 0xC2)
+				return false;
+			if (*source > 0xF4)
+				return false;
+	}
+	return true;
+}
+
 /*
  * Verify mbstr to make sure that it has a valid character sequence.
  * mbstr is not necessarily NULL terminated; length of mbstr is
@@ -825,13 +863,13 @@
 	while (len > 0 && *mbstr)
 	{
 		/* special UTF-8 check */
-		if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
+		if (encoding == PG_UTF8 && !isLegalUTF8(mbstr, len))
 		{
 			if (noError)
 				return false;
 			ereport(ERROR,
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
+					 errmsg("Invalid UNICODE byte sequence detected")));
 		}
 
 		l = pg_mblen(mbstr);