diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c97bb367f8..864591e086 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -60,6 +60,7 @@ #include "parser/gramparse.h" #include "parser/parser.h" #include "parser/parse_expr.h" +#include "parser/scansup.h" #include "storage/lmgr.h" #include "utils/date.h" #include "utils/datetime.h" @@ -188,6 +189,9 @@ static void processCASbits(int cas_bits, int location, const char *constrType, bool *deferrable, bool *initdeferred, bool *not_valid, bool *no_inherit, core_yyscan_t yyscanner); static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); +static char * str_udeescape(char *str, int str_pos, char *esc_str, int esc_pos, core_yyscan_t yyscanner); +static bool check_uescapechar(unsigned char escape); +static bool check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner); %} @@ -528,6 +532,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type Iconst SignedIconst %type Sconst comment_text notify_payload +%type Ident %type RoleId opt_boolean_or_string %type var_list %type ColId ColLabel var_name type_function_name param_name @@ -599,7 +604,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); * DOT_DOT is unused in the core SQL grammar, and so will always provoke * parse errors. It is needed by PL/pgSQL. */ -%token IDENT FCONST SCONST BCONST XCONST Op +%token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op %token ICONST PARAM %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS @@ -689,7 +694,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P TYPES_P - UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED + UESCAPE UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED UNTIL UPDATE USER USING VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING @@ -718,6 +723,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); /* Precedence: lowest to highest */ +%nonassoc UIDENT /* UESCAPE must be higher than UIDENT */ +%nonassoc UESCAPE %nonassoc SET /* see relation_expr_opt_alias */ %left UNION EXCEPT %left INTERSECT @@ -1048,7 +1055,7 @@ AlterOptRoleElem: { $$ = makeDefElem("rolemembers", (Node *)$2, @1); } - | IDENT + | Ident { /* * We handle identifiers that aren't parser keywords with @@ -1602,14 +1609,14 @@ opt_boolean_or_string: * - an integer or floating point number * - a time interval per SQL99 * ColId gives reduce/reduce errors against ConstInterval and LOCAL, - * so use IDENT (meaning we reject anything that is a key word). + * so use Ident (meaning we reject anything that is a key word). */ zone_value: Sconst { $$ = makeStringConst($1, @1); } - | IDENT + | Ident { $$ = makeStringConst($1, @1); } @@ -3871,7 +3878,7 @@ PartitionSpec: PARTITION BY part_strategy '(' part_params ')' } ; -part_strategy: IDENT { $$ = $1; } +part_strategy: Ident { $$ = $1; } | unreserved_keyword { $$ = pstrdup($1); } ; @@ -5262,7 +5269,7 @@ RowSecurityOptionalToRole: ; RowSecurityDefaultPermissive: - AS IDENT + AS Ident { if (strcmp($2, "permissive") == 0) $$ = true; @@ -5831,11 +5838,11 @@ old_aggr_list: old_aggr_elem { $$ = list_make1($1); } ; /* - * Must use IDENT here to avoid reduce/reduce conflicts; fortunately none of + * Must use Ident here to avoid reduce/reduce conflicts; fortunately none of * the item names needed in old aggregate definitions are likely to become * SQL keywords. */ -old_aggr_elem: IDENT '=' def_arg +old_aggr_elem: Ident '=' def_arg { $$ = makeDefElem($1, (Node *)$3, @1); } @@ -10113,7 +10120,7 @@ createdb_opt_item: /* * Ideally we'd use ColId here, but that causes shift/reduce conflicts against * the ALTER DATABASE SET/RESET syntaxes. Instead call out specific keywords - * we need, and allow IDENT so that database option names don't have to be + * we need, and allow Ident so that database option names don't have to be * parser keywords unless they are already keywords for other reasons. * * XXX this coding technique is fragile since if someone makes a formerly @@ -10122,7 +10129,7 @@ createdb_opt_item: * exercising every such option, at least at the syntax level. */ createdb_opt_name: - IDENT { $$ = $1; } + Ident { $$ = $1; } | CONNECTION LIMIT { $$ = pstrdup("connection_limit"); } | ENCODING { $$ = pstrdup($1); } | LOCATION { $$ = pstrdup($1); } @@ -12424,7 +12431,7 @@ xmltable_column_option_list: ; xmltable_column_option_el: - IDENT b_expr + Ident b_expr { $$ = makeDefElem($1, $2, @1); } | DEFAULT b_expr { $$ = makeDefElem("default", $2, @1); } @@ -14412,7 +14419,7 @@ extract_list: * - thomas 2001-04-12 */ extract_arg: - IDENT { $$ = $1; } + Ident { $$ = $1; } | YEAR_P { $$ = "year"; } | MONTH_P { $$ = "month"; } | DAY_P { $$ = "day"; } @@ -14655,7 +14662,7 @@ target_el: a_expr AS ColLabel * as an infix expression, which we accomplish by assigning * IDENT a precedence higher than POSTFIXOP. */ - | a_expr IDENT + | a_expr Ident { $$ = makeNode(ResTarget); $$->name = $2; @@ -14874,13 +14881,53 @@ AexprConst: Iconst ; Iconst: ICONST { $$ = $1; }; -Sconst: SCONST { $$ = $1; }; +Sconst: SCONST + { + $$ = $1; + } + | UCONST + { + $$ = str_udeescape($1, @1, "\\", 0, yyscanner); + } + | UCONST UESCAPE SCONST + { + $$ = str_udeescape($1, @1, $3, @3, yyscanner); + } + ; SignedIconst: Iconst { $$ = $1; } | '+' Iconst { $$ = + $2; } | '-' Iconst { $$ = - $2; } ; +Ident: IDENT + { + $$ = $1; + } + | UIDENT + { + char *ident; + int identlen; + + ident = str_udeescape($1, @1, "\\", 0, yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + $$ = ident; + } + | UIDENT UESCAPE SCONST + { + char *ident; + int identlen; + + ident = str_udeescape($1, @1, $3, @3, yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + $$ = ident; + } + ; + /* Role specifications */ RoleId: RoleSpec { @@ -14971,21 +15018,21 @@ role_list: RoleSpec /* Column identifier --- names that can be column, table, etc names. */ -ColId: IDENT { $$ = $1; } +ColId: Ident { $$ = $1; } | unreserved_keyword { $$ = pstrdup($1); } | col_name_keyword { $$ = pstrdup($1); } ; /* Type/function identifier --- names that can be type or function names. */ -type_function_name: IDENT { $$ = $1; } +type_function_name: Ident { $$ = $1; } | unreserved_keyword { $$ = pstrdup($1); } | type_func_name_keyword { $$ = pstrdup($1); } ; /* Any not-fully-reserved word --- these names can be, eg, role names. */ -NonReservedWord: IDENT { $$ = $1; } +NonReservedWord: Ident { $$ = $1; } | unreserved_keyword { $$ = pstrdup($1); } | col_name_keyword { $$ = pstrdup($1); } | type_func_name_keyword { $$ = pstrdup($1); } @@ -14994,7 +15041,7 @@ NonReservedWord: IDENT { $$ = $1; } /* Column label --- allowed labels in "AS" clauses. * This presently includes *all* Postgres keywords. */ -ColLabel: IDENT { $$ = $1; } +ColLabel: Ident { $$ = $1; } | unreserved_keyword { $$ = pstrdup($1); } | col_name_keyword { $$ = pstrdup($1); } | type_func_name_keyword { $$ = pstrdup($1); } @@ -15282,6 +15329,7 @@ unreserved_keyword: | TRUSTED | TYPE_P | TYPES_P + | UESCAPE | UNBOUNDED | UNCOMMITTED | UNENCRYPTED @@ -16351,3 +16399,202 @@ parser_init(base_yy_extra_type *yyext) { yyext->parsetree = NIL; /* in case grammar forgets to set it */ } + +/* handle unicode escapes */ +static char * +str_udeescape(char *str, int str_pos, + char *esc_str, int esc_pos, + core_yyscan_t yyscanner) +{ + char *new, + *in, + *out; + int str_length, + esc_length; + pg_wchar pair_first = 0; + unsigned char escape; + + esc_length = strlen(esc_str); + escape = esc_str[0]; + + str_length = strlen(str); + + if (esc_length != 1 || + !check_uescapechar(escape)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape character \"%s\"", esc_str), + parser_errposition(esc_pos + 1))); /* 1 for the quote */ + + /* + * This relies on the subtle assumption that a UTF-8 expansion cannot be + * longer than its escaped representation. + */ + new = palloc(str_length + 1); + + in = str; + out = new; + while (*in) + { + if (in[0] == escape) + { + if (in[1] == escape) + { + if (pair_first) + goto invalid_pair; + *out++ = escape; + in += 2; + } + else if (isxdigit((unsigned char) in[1]) && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4])) + { + pg_wchar unicode; + + unicode = (hexval(in[1]) << 12) + + (hexval(in[2]) << 8) + + (hexval(in[3]) << 4) + + hexval(in[4]); + if (!check_unicode_value(unicode, in, yyscanner)) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"), + parser_errposition(str_pos + (in - str) + 3))); /* 3 for U&" */ + } + + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 5; + } + else if (in[1] == '+' && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4]) && + isxdigit((unsigned char) in[5]) && + isxdigit((unsigned char) in[6]) && + isxdigit((unsigned char) in[7])) + { + pg_wchar unicode; + + unicode = (hexval(in[2]) << 20) + + (hexval(in[3]) << 16) + + (hexval(in[4]) << 12) + + (hexval(in[5]) << 8) + + (hexval(in[6]) << 4) + + hexval(in[7]); + if (!check_unicode_value(unicode, in, yyscanner)) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"), + parser_errposition(str_pos + (in - str) + 3))); /* 3 for U&" */ + } + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 8; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"), + parser_errposition(str_pos + (in - str) + 3))); /* 3 for U&" */ + } + else + { + if (pair_first) + goto invalid_pair; + + *out++ = *in++; + } + } + + /* unfinished surrogate pair? */ + if (pair_first) + goto invalid_pair; + + *out = '\0'; + + /* + * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII + * codes; but it's probably not worth the trouble, since this isn't likely + * to be a performance-critical path. + */ + pg_verifymbstr(new, out - new, false); + return new; + +invalid_pair: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode surrogate pair"), + parser_errposition(str_pos + (in - str) + 3))); /* 3 for U&" */ + +} + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + { + return false; + } + else + return true; +} + +/* XXX this not covered in tests as far as I know, so check error position in caller */ +static bool +check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner) +{ + if (GetDatabaseEncoding() == PG_UTF8) + return true; + + if (c > 0x7F) + { + return false; + } + else + return true; +} diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e1cae859e8..97bd54712e 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -110,14 +110,9 @@ const uint16 ScanKeywordTokens[] = { static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner); static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); static char *litbufdup(core_yyscan_t yyscanner); -static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); static int process_integer_literal(const char *token, YYSTYPE *lval); -static bool is_utf16_surrogate_first(pg_wchar c); -static bool is_utf16_surrogate_second(pg_wchar c); -static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); static void addunicode(pg_wchar c, yyscan_t yyscanner); -static bool check_uescapechar(unsigned char escape); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -168,12 +163,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow * Unicode surrogate pair in extended quoted string * * Remember to add an <> case whenever you add a new exclusive state! @@ -185,12 +179,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend %x xeu /* @@ -231,19 +224,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? /* Bit string * It is tempting to scan the string for only those characters @@ -304,21 +296,12 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} -/* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -476,21 +459,10 @@ other . startlit(); addlitchar('b', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng, yyscanner); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -505,13 +477,6 @@ other . startlit(); addlitchar('x', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -568,53 +533,67 @@ other . BEGIN(xus); startlit(); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + +{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. */ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); + yyextra->state_before_str_stop = YYSTATE; + BEGIN(xqs); } -{whitespace} { - /* stay in xusend state over whitespace */ +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(yyextra->state_before_str_stop); } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ +{quotecontinuefail} | +<> | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ yyless(0); BEGIN(INITIAL); - yylval->str = litbuf_udeescape('\\', yyscanner); - return SCONST; - } -{xustop2} { - /* found UESCAPE after the end quote */ - BEGIN(INITIAL); - if (!check_uescapechar(yytext[yyleng - 2])) + + switch (yyextra->state_before_str_stop) { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); + case xb: + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + yylval->str = litbufdup(yyscanner); + return XCONST; + case xq: + /* fallthrough */ + case xe: + /* + * Check that the data remains valid if it + * might have been made invalid by unescaping + * any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + yylval->str = litbufdup(yyscanner); + return UCONST; + default: + yyerror("unhandled previous state in xqs"); } - yylval->str = litbuf_udeescape(yytext[yyleng - 2], - yyscanner); - return SCONST; } + {xqdouble} { addlitchar('\'', yyscanner); } @@ -693,9 +672,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -{quotecontinue} { - /* ignore */ - } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); @@ -770,53 +746,14 @@ other . return IDENT; } {dquote} { - yyless(1); - /* xuiend state looks for possible UESCAPE */ - BEGIN(xuiend); - } -{whitespace} { - /* stay in xuiend state over whitespace */ - } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ - char *ident; - int identlen; - - yyless(0); - - BEGIN(INITIAL); if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); - ident = litbuf_udeescape('\\', yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; - } -{xustop2} { - /* found UESCAPE after the end quote */ - char *ident; - int identlen; BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); - if (!check_uescapechar(yytext[yyleng - 2])) - { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); - } - ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + yylval->str = litbufdup(yyscanner); + return UIDENT; } + {xddouble} { addlitchar('"', yyscanner); } @@ -1288,50 +1225,6 @@ process_integer_literal(const char *token, YYSTYPE *lval) return ICONST; } -static unsigned int -hexval(unsigned char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 0xA; - if (c >= 'A' && c <= 'F') - return c - 'A' + 0xA; - elog(ERROR, "invalid hexadecimal digit"); - return 0; /* not reached */ -} - -static void -check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner) -{ - if (GetDatabaseEncoding() == PG_UTF8) - return; - - if (c > 0x7F) - { - ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */ - yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); - } -} - -static bool -is_utf16_surrogate_first(pg_wchar c) -{ - return (c >= 0xD800 && c <= 0xDBFF); -} - -static bool -is_utf16_surrogate_second(pg_wchar c) -{ - return (c >= 0xDC00 && c <= 0xDFFF); -} - -static pg_wchar -surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) -{ - return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); -} - static void addunicode(pg_wchar c, core_yyscan_t yyscanner) { @@ -1349,172 +1242,6 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner) addlit(buf, pg_mblen(buf), yyscanner); } -/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ -static bool -check_uescapechar(unsigned char escape) -{ - if (isxdigit(escape) - || escape == '+' - || escape == '\'' - || escape == '"' - || scanner_isspace(escape)) - { - return false; - } - else - return true; -} - -/* like litbufdup, but handle unicode escapes */ -static char * -litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) -{ - char *new; - char *litbuf, - *in, - *out; - pg_wchar pair_first = 0; - - /* Make literalbuf null-terminated to simplify the scanning loop */ - litbuf = yyextra->literalbuf; - litbuf[yyextra->literallen] = '\0'; - - /* - * This relies on the subtle assumption that a UTF-8 expansion cannot be - * longer than its escaped representation. - */ - new = palloc(yyextra->literallen + 1); - - in = litbuf; - out = new; - while (*in) - { - if (in[0] == escape) - { - if (in[1] == escape) - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = escape; - in += 2; - } - else if (isxdigit((unsigned char) in[1]) && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4])) - { - pg_wchar unicode; - - unicode = (hexval(in[1]) << 12) + - (hexval(in[2]) << 8) + - (hexval(in[3]) << 4) + - hexval(in[4]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 5; - } - else if (in[1] == '+' && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4]) && - isxdigit((unsigned char) in[5]) && - isxdigit((unsigned char) in[6]) && - isxdigit((unsigned char) in[7])) - { - pg_wchar unicode; - - unicode = (hexval(in[2]) << 20) + - (hexval(in[3]) << 16) + - (hexval(in[4]) << 12) + - (hexval(in[5]) << 8) + - (hexval(in[6]) << 4) + - hexval(in[7]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 8; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode escape value"); - } - } - else - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = *in++; - } - } - - /* unfinished surrogate pair? */ - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - - *out = '\0'; - - /* - * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII - * codes; but it's probably not worth the trouble, since this isn't likely - * to be a performance-critical path. - */ - pg_verifymbstr(new, out - new, false); - return new; -} - static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner) { diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index ce20936339..eba7490078 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -114,12 +114,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow * * Note: we intentionally don't mimic the backend's state; we have * no need to distinguish it from state, and no good way to get out @@ -132,12 +131,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend /* * In order to make the world safe for Windows and Mac clients as well as @@ -177,19 +175,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? /* Bit string * It is tempting to scan the string for only those characters @@ -250,21 +247,12 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} -/* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -438,20 +426,10 @@ other . BEGIN(xb); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } {xhinside} | {xbinside} { ECHO; } -{quotecontinue} | -{quotecontinue} { - ECHO; - } {xhstart} { /* Hexadecimal bit type. @@ -463,12 +441,6 @@ other . BEGIN(xh); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } {xnstart} { yyless(1); /* eat only 'n' this time */ @@ -490,32 +462,38 @@ other . BEGIN(xus); ECHO; } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - ECHO; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - BEGIN(xusend); + +{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + cur_state->state_before_str_stop = YYSTATE; + BEGIN(xqs); ECHO; } -{whitespace} { +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(cur_state->state_before_str_stop); ECHO; } -{other} | -{xustop1} { +{quotecontinuefail} | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote. + */ yyless(0); BEGIN(INITIAL); - ECHO; - } -{xustop2} { - BEGIN(INITIAL); - ECHO; } + {xqdouble} { ECHO; } @@ -540,9 +518,6 @@ other . {xehexesc} { ECHO; } -{quotecontinue} { - ECHO; - } . { /* This is only needed for \ just before EOF */ ECHO; @@ -600,23 +575,10 @@ other . ECHO; } {dquote} { - yyless(1); - BEGIN(xuiend); - ECHO; - } -{whitespace} { - ECHO; - } -{other} | -{xustop1} { - yyless(0); - BEGIN(INITIAL); - ECHO; - } -{xustop2} { BEGIN(INITIAL); ECHO; } + {xddouble} { ECHO; } @@ -1084,8 +1046,7 @@ psql_scan(PsqlScanState state, switch (state->start_state) { case INITIAL: - case xuiend: /* we treat these like INITIAL */ - case xusend: + case xqs: /* we treat this like INITIAL */ if (state->paren_depth > 0) { result = PSCAN_INCOMPLETE; @@ -1240,7 +1201,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state) bool psql_scan_in_quote(PsqlScanState state) { - return state->start_state != INITIAL; + return state->start_state != INITIAL && + state->start_state != xqs; } /* diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h index 2acb380078..f53ccbf82e 100644 --- a/src/include/fe_utils/psqlscan_int.h +++ b/src/include/fe_utils/psqlscan_int.h @@ -110,6 +110,7 @@ typedef struct PsqlScanStateData * and updated with its finishing state on exit. */ int start_state; /* yylex's starting/finishing state */ + int state_before_str_stop; /* start cond. before end quote */ int paren_depth; /* depth of nesting in parentheses */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 00ace8425e..5893d317d8 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -416,6 +416,7 @@ PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD) PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD) PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD) PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD) +PG_KEYWORD("uescape", UESCAPE, UNRESERVED_KEYWORD) PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD) PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD) PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD) diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h index 731a2bd264..571d5e273f 100644 --- a/src/include/parser/scanner.h +++ b/src/include/parser/scanner.h @@ -48,7 +48,7 @@ typedef union core_YYSTYPE * However, those are not defined in this file, because bison insists on * defining them for itself. The token codes used by the core scanner are * the ASCII characters plus these: - * %token IDENT FCONST SCONST BCONST XCONST Op + * %token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op * %token ICONST PARAM * %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER * %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS @@ -99,6 +99,7 @@ typedef struct core_yy_extra_type int literallen; /* actual current string length */ int literalalloc; /* current allocated buffer size */ + int state_before_str_stop; /* start cond. before end quote */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h index fb2980bd17..8370982b47 100644 --- a/src/include/parser/scansup.h +++ b/src/include/parser/scansup.h @@ -15,6 +15,9 @@ #ifndef SCANSUP_H #define SCANSUP_H +#include "mb/pg_wchar.h" + + extern char *scanstr(const char *s); extern char *downcase_truncate_identifier(const char *ident, int len, @@ -27,4 +30,35 @@ extern void truncate_identifier(char *ident, int len, bool warn); extern bool scanner_isspace(char ch); +static inline unsigned int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +static inline bool +is_utf16_surrogate_first(pg_wchar c) +{ + return (c >= 0xD800 && c <= 0xDBFF); +} + +static inline bool +is_utf16_surrogate_second(pg_wchar c) +{ + return (c >= 0xDC00 && c <= 0xDFFF); +} + +static inline pg_wchar +surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) +{ + return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); +} + #endif /* SCANSUP_H */ diff --git a/src/interfaces/ecpg/preproc/ecpg.tokens b/src/interfaces/ecpg/preproc/ecpg.tokens index 1d613af02f..749a9146ba 100644 --- a/src/interfaces/ecpg/preproc/ecpg.tokens +++ b/src/interfaces/ecpg/preproc/ecpg.tokens @@ -24,4 +24,4 @@ S_TYPEDEF %token CSTRING CVARIABLE CPP_LINE IP -%token DOLCONST ECONST NCONST UCONST UIDENT +%token DOLCONST ECONST NCONST diff --git a/src/interfaces/ecpg/preproc/ecpg.trailer b/src/interfaces/ecpg/preproc/ecpg.trailer index b303a9cbd0..dbf1abb5fb 100644 --- a/src/interfaces/ecpg/preproc/ecpg.trailer +++ b/src/interfaces/ecpg/preproc/ecpg.trailer @@ -1812,7 +1812,6 @@ ecpg_sconst: $$[strlen($1)+3]='\0'; free($1); } - | UCONST { $$ = $1; } | DOLCONST { $$ = $1; } ; @@ -1820,7 +1819,6 @@ ecpg_xconst: XCONST { $$ = make_name(); } ; ecpg_ident: IDENT { $$ = make_name(); } | CSTRING { $$ = make3_str(mm_strdup("\""), $1, mm_strdup("\"")); } - | UIDENT { $$ = $1; } ; quoted_ident_stringvar: name diff --git a/src/interfaces/ecpg/preproc/parse.pl b/src/interfaces/ecpg/preproc/parse.pl index 3619706cdc..dc40b2974c 100644 --- a/src/interfaces/ecpg/preproc/parse.pl +++ b/src/interfaces/ecpg/preproc/parse.pl @@ -218,8 +218,8 @@ sub main if ($a eq 'IDENT' && $prior eq '%nonassoc') { - # add two more tokens to the list - $str = $str . "\n%nonassoc CSTRING\n%nonassoc UIDENT"; + # add one more tokens to the list + $str = $str . "\n%nonassoc CSTRING"; } $prior = $a; } diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y index dea95f4230..f0533d8407 100644 --- a/src/pl/plpgsql/src/pl_gram.y +++ b/src/pl/plpgsql/src/pl_gram.y @@ -232,7 +232,7 @@ static void check_raise_parameters(PLpgSQL_stmt_raise *stmt); * Some of these are not directly referenced in this file, but they must be * here anyway. */ -%token IDENT FCONST SCONST BCONST XCONST Op +%token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op %token ICONST PARAM %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 486c00b3b3..4d48d33688 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -48,17 +48,40 @@ SELECT 'tricky' AS U&"\" UESCAPE '!'; (1 row) SELECT U&'wrong: \061'; -ERROR: invalid Unicode escape value at or near "\061'" +ERROR: invalid Unicode escape value LINE 1: SELECT U&'wrong: \061'; ^ SELECT U&'wrong: \+0061'; -ERROR: invalid Unicode escape value at or near "\+0061'" +ERROR: invalid Unicode escape value LINE 1: SELECT U&'wrong: \+0061'; ^ SELECT U&'wrong: +0061' UESCAPE '+'; -ERROR: invalid Unicode escape character at or near "+'" +ERROR: invalid Unicode escape character "+" LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; ^ +-- handling of Unicode surrogate pairs +SELECT U&'\d83d\de04\d83d\dc36' as correct_in_utf8; + correct_in_utf8 +----------------- + 😄🐶 +(1 row) + +SELECT U&'\d83d\d83d'; -- 2 high surrogates in a row +ERROR: invalid Unicode surrogate pair +LINE 1: SELECT U&'\d83d\d83d'; + ^ +SELECT U&'\de04\d83d'; -- surrogates in wrong order +ERROR: invalid Unicode surrogate pair +LINE 1: SELECT U&'\de04\d83d'; + ^ +SELECT U&'\d83dX'; -- orphan high surrogate +ERROR: invalid Unicode surrogate pair +LINE 1: SELECT U&'\d83dX'; + ^ +SELECT U&'\de04X'; -- orphan low surrogate +ERROR: invalid Unicode surrogate pair +LINE 1: SELECT U&'\de04X'; + ^ SET standard_conforming_strings TO off; SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; ERROR: unsafe use of string constant with Unicode escapes diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 5744c9f800..9d81096aa7 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -29,6 +29,13 @@ SELECT U&'wrong: \061'; SELECT U&'wrong: \+0061'; SELECT U&'wrong: +0061' UESCAPE '+'; +-- handling of Unicode surrogate pairs +SELECT U&'\d83d\de04\d83d\dc36' as correct_in_utf8; +SELECT U&'\d83d\d83d'; -- 2 high surrogates in a row +SELECT U&'\de04\d83d'; -- surrogates in wrong order +SELECT U&'\d83dX'; -- orphan high surrogate +SELECT U&'\de04X'; -- orphan low surrogate + SET standard_conforming_strings TO off; SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";