adjust chr()/ascii() to prevent invalidly encoded data - Mailing list pgsql-patches
From | Andrew Dunstan |
---|---|
Subject | adjust chr()/ascii() to prevent invalidly encoded data |
Date | |
Msg-id | 46EAAF0E.5090501@dunslane.net Whole thread Raw |
Responses |
Re: adjust chr()/ascii() to prevent invalidly encoded
data
|
List | pgsql-patches |
The attached patch is intended to ensure that chr() does not produce invalidly encoded data, as recently discussed on -hackers. For UTF8, we treat its argument as a Unicode code point; for all other multi-byte encodings, we raise an error on any argument greater than 127. For all encodings we raise an error if the argument is 0 (we don't allow null bytes in text data). The ascii() function is adjusted so that it remains the inverse of chr() - i.e. for UTF8 it returns the Unicode code point, and it raises an error for any other multi-byte encoding if the aregument is outside the ASCII range. I have tested thius inverse property across the entire Unicode code point range, 0x01 .. 0x1ffff. TODO: alter docs to reflect changes. cheers andrew Index: src/backend/utils/adt/oracle_compat.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v retrieving revision 1.70 diff -c -r1.70 oracle_compat.c *** src/backend/utils/adt/oracle_compat.c 27 Feb 2007 23:48:08 -0000 1.70 --- src/backend/utils/adt/oracle_compat.c 14 Sep 2007 15:44:41 -0000 *************** *** 1246,1251 **** --- 1246,1258 ---- * * Returns the decimal representation of the first character from * string. + * If the string is empty we return 0. + * If the database encoding is UTF8, we return the Unicode codepoint. + * If the database encoding is any other multi-byte encoding, we + * return the value of the first byte if it is an ASCII character + * (range 1 .. 127), or raise an error. + * For all other encodings we return the value of the first byte, + * (range 1..255). * ********************************************************************/ *************** *** 1253,1263 **** ascii(PG_FUNCTION_ARGS) { text *string = PG_GETARG_TEXT_P(0); if (VARSIZE(string) <= VARHDRSZ) PG_RETURN_INT32(0); ! PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string))); } /******************************************************************** --- 1260,1316 ---- ascii(PG_FUNCTION_ARGS) { text *string = PG_GETARG_TEXT_P(0); + int encoding = GetDatabaseEncoding(); + unsigned char *data; if (VARSIZE(string) <= VARHDRSZ) PG_RETURN_INT32(0); ! data = (unsigned char *) VARDATA(string); ! ! if (encoding == PG_UTF8 && *data > 127) ! { ! /* return the code point for Unicode */ ! ! int result = 0, tbytes = 0, i; ! ! if (*data >= 0xF0) ! { ! result = *data & 0x07; ! tbytes = 3; ! } ! else if (*data >= 0xE0) ! { ! result = *data & 0x0F; ! tbytes = 2; ! } ! else ! { ! Assert (*data > 0xC0); ! result = *data & 0x1f; ! tbytes = 1; ! } ! ! Assert (tbytes > 0); ! ! for (i = 1; i <= tbytes; i++) ! { ! Assert ((data[i] & 0xC0) == 0x80); ! result = (result << 6) + (data[i] & 0x3f); ! } ! ! PG_RETURN_INT32(result); ! } ! else ! { ! if (pg_encoding_max_length(encoding) > 1 && *data > 127) ! ereport(ERROR, ! (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ! errmsg("requested character too large"))); ! ! ! PG_RETURN_INT32((int32) *data); ! } } /******************************************************************** *************** *** 1270,1288 **** * * Purpose: * ! * Returns the character having the binary equivalent to val * ********************************************************************/ Datum chr(PG_FUNCTION_ARGS) { ! int32 cvalue = PG_GETARG_INT32(0); text *result; ! result = (text *) palloc(VARHDRSZ + 1); ! SET_VARSIZE(result, VARHDRSZ + 1); ! *VARDATA(result) = (char) cvalue; PG_RETURN_TEXT_P(result); } --- 1323,1418 ---- * * Purpose: * ! * Returns the character having the binary equivalent to val. ! * ! * For UTF8 we treat the argumwent as a Unicode code point. ! * For other multi-byte encodings we raise an error for arguments ! * outside the strict ASCII range (1..127). ! * ! * It's important that we don't ever return a value that is not valid ! * in the database encoding, so that this doesn't become a way for ! * invalid data to enter the database. * ********************************************************************/ Datum chr(PG_FUNCTION_ARGS) { ! uint32 cvalue = PG_GETARG_UINT32(0); text *result; + int encoding = GetDatabaseEncoding(); + + if (encoding == PG_UTF8 && cvalue > 127) + { + /* for Unicode we treat the argument as a code point */ + int bytes ; + char *wch; ! /* We only allow valid Unicode code points */ ! if (cvalue > 0x001fffff) ! ereport(ERROR, ! (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ! errmsg("requested character too large for encoding: %d", ! cvalue))); ! ! if (cvalue > 0xffff) ! bytes = 4; ! else if (cvalue > 0x07ff) ! bytes = 3; ! else ! bytes = 2; ! ! result = (text *) palloc(VARHDRSZ + bytes); ! SET_VARSIZE(result, VARHDRSZ + bytes); ! wch = VARDATA(result); ! ! if (bytes == 2) ! { ! wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F); ! wch[1] = 0x80 | (cvalue & 0x3F);; ! } ! else if (bytes == 3) ! { ! wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F); ! wch[1] = 0x80 | ((cvalue >> 6) & 0x3F); ! wch[2] = 0x80 | (cvalue & 0x3F); ! } ! else ! { ! wch[0] = 0xF0 | ((cvalue >> 18) & 0x07); ! wch[1] = 0x80 | ((cvalue >> 12) & 0x3F); ! wch[2] = 0x80 | ((cvalue >> 6) & 0x3F); ! wch[3] = 0x80 | (cvalue & 0x3F); ! } ! ! } ! ! else ! { ! bool is_mb; ! ! /* Error out on arguments that make no sense or that we ! * can't validly represent in the encoding. ! */ ! ! if (cvalue == 0) ! ereport(ERROR, ! (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ! errmsg("null character not permitted"))); ! ! is_mb = pg_encoding_max_length(encoding) > 1; ! ! if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127))) ! ereport(ERROR, ! (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ! errmsg("requested character too large for encoding: %d", ! cvalue))); ! ! ! result = (text *) palloc(VARHDRSZ + 1); ! SET_VARSIZE(result, VARHDRSZ + 1); ! *VARDATA(result) = (char) cvalue; ! } PG_RETURN_TEXT_P(result); }
pgsql-patches by date: