From 00321a601847ad1c45f6e9fba4ae47bf64e1deb0 Mon Sep 17 00:00:00 2001 From: Vignesh C Date: Mon, 17 Aug 2020 17:48:09 +0530 Subject: [PATCH v5 2/2] Add header matching mode to "COPY FROM" COPY FROM supports the HEADER option to silently discard the header from a CSV or text file. It is possible to load by mistake a file that matches the expected format, for example if two text columns have been swapped, resulting in garbage in the database. This option adds the possibility to actually check the header to make sure it matches what is expected and exit immediatly if it does not. Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com --- contrib/file_fdw/input/file_fdw.source | 6 ++ contrib/file_fdw/output/file_fdw.source | 9 ++- doc/src/sgml/ref/copy.sgml | 12 ++- src/backend/commands/copy.c | 128 +++++++++++++++++++++++++++++--- src/test/regress/input/copy.source | 25 +++++++ src/test/regress/output/copy.source | 17 +++++ 6 files changed, 183 insertions(+), 14 deletions(-) diff --git a/contrib/file_fdw/input/file_fdw.source b/contrib/file_fdw/input/file_fdw.source index 83edb71..7a3983c 100644 --- a/contrib/file_fdw/input/file_fdw.source +++ b/contrib/file_fdw/input/file_fdw.source @@ -79,6 +79,12 @@ CREATE FOREIGN TABLE agg_bad ( OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null ''); ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0); +-- test header matching +CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server +OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match'); +CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server +OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match'); -- ERROR + -- per-column options tests CREATE FOREIGN TABLE text_csv ( word1 text OPTIONS (force_not_null 'true'), diff --git a/contrib/file_fdw/output/file_fdw.source b/contrib/file_fdw/output/file_fdw.source index 547b81f..ebe826b 100644 --- a/contrib/file_fdw/output/file_fdw.source +++ b/contrib/file_fdw/output/file_fdw.source @@ -93,6 +93,11 @@ CREATE FOREIGN TABLE agg_bad ( ) SERVER file_server OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null ''); ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0); +-- test header matching +CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server +OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match'); +CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server +OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match'); -- ERROR -- per-column options tests CREATE FOREIGN TABLE text_csv ( word1 text OPTIONS (force_not_null 'true'), @@ -439,12 +444,14 @@ SET ROLE regress_file_fdw_superuser; -- cleanup RESET ROLE; DROP EXTENSION file_fdw CASCADE; -NOTICE: drop cascades to 7 other objects +NOTICE: drop cascades to 9 other objects DETAIL: drop cascades to server file_server drop cascades to user mapping for regress_file_fdw_superuser on server file_server drop cascades to user mapping for regress_no_priv_user on server file_server drop cascades to foreign table agg_text drop cascades to foreign table agg_csv drop cascades to foreign table agg_bad +drop cascades to foreign table header_match +drop cascades to foreign table header_dont_match drop cascades to foreign table text_csv DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index c628a69..cb8232d 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -36,7 +36,7 @@ COPY { table_name [ ( boolean ] DELIMITER 'delimiter_character' NULL 'null_string' - HEADER [ boolean ] + HEADER [ match | true | false ] QUOTE 'quote_character' ESCAPE 'escape_character' FORCE_QUOTE { ( column_name [, ...] ) | * } @@ -268,9 +268,13 @@ COPY { table_name [ ( Specifies that the file contains a header line with the names of each column in the file. On output, the first line contains the column - names from the table, and on input, the first line is ignored. - This option is allowed only when using CSV or - text format. + names from the table. On input, the first line is discarded when + header is set to true or required + to match the column names if set to match. If the + number of columns in the header is not correct, their order differs + from the one expected, or the name or case do not match, the copy will + be aborted with an error. This option is allowed only when using + CSV or text format. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5d5ad43..0625090 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -96,6 +96,16 @@ typedef enum CopyInsertMethod } CopyInsertMethod; /* + * Represents whether the header must be absent, present or present and match. + */ +typedef enum CopyHeader +{ + COPY_HEADER_ABSENT, + COPY_HEADER_PRESENT, + COPY_HEADER_MATCH +} CopyHeader; + +/* * This struct contains all the state variables used throughout a COPY * operation. For simplicity, we use the same struct for all variants of COPY, * even though some fields are used in only some cases. @@ -136,7 +146,7 @@ typedef struct CopyStateData bool binary; /* binary format? */ bool freeze; /* freeze rows on loading? */ bool csv_mode; /* Comma Separated Value format? */ - bool header_line; /* CSV or text header line? */ + CopyHeader header_line; /* CSV or text header line? */ char *null_print; /* NULL marker string (server encoding!) */ int null_print_len; /* length of same */ char *null_print_client; /* same converted to file encoding */ @@ -1136,6 +1146,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, } /* + * Extract a CopyHeader value from a DefElem. + */ +static CopyHeader +DefGetCopyHeader(DefElem *def) +{ + /* + * If no parameter given, assume "true" is meant. + */ + if (def->arg == NULL) + return COPY_HEADER_PRESENT; + + /* + * Allow 0, 1, "true", "false", "on", "off" or "match". + */ + switch (nodeTag(def->arg)) + { + case T_Integer: + switch (intVal(def->arg)) + { + case 0: + return COPY_HEADER_ABSENT; + case 1: + return COPY_HEADER_PRESENT; + default: + /* otherwise, error out below */ + break; + } + break; + default: + { + char *sval = defGetString(def); + + /* + * The set of strings accepted here should match up with the + * grammar's opt_boolean_or_string production. + */ + if (pg_strcasecmp(sval, "true") == 0) + return COPY_HEADER_PRESENT; + if (pg_strcasecmp(sval, "false") == 0) + return COPY_HEADER_ABSENT; + if (pg_strcasecmp(sval, "on") == 0) + return COPY_HEADER_PRESENT; + if (pg_strcasecmp(sval, "off") == 0) + return COPY_HEADER_ABSENT; + if (pg_strcasecmp(sval, "match") == 0) + return COPY_HEADER_MATCH; + + } + break; + } + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("%s requires a boolean or \"match\"", + def->defname))); + return COPY_HEADER_ABSENT; /* keep compiler quiet */ +} + +/* * Process the statement option list for COPY. * * Scan the options list (a list of DefElem) and transpose the information @@ -1230,7 +1298,8 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"), parser_errposition(pstate, defel->location))); - cstate->header_line = defGetBoolean(defel); + + cstate->header_line = DefGetCopyHeader(defel); } else if (strcmp(defel->defname, "quote") == 0) { @@ -1411,7 +1480,7 @@ ProcessCopyOptions(ParseState *pstate, errmsg("COPY delimiter cannot be \"%s\"", cstate->delim))); /* Check header */ - if (cstate->binary && cstate->header_line) + if (cstate->binary && cstate->header_line != COPY_HEADER_ABSENT) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY HEADER available only in CSV and text mode"))); @@ -2132,7 +2201,7 @@ CopyTo(CopyState cstate) cstate->file_encoding); /* if a header has been requested send the line */ - if (cstate->header_line) + if (cstate->header_line != COPY_HEADER_ABSENT) { bool hdr_delim = false; @@ -2149,7 +2218,7 @@ CopyTo(CopyState cstate) if (cstate->csv_mode) CopyAttributeOutCSV(cstate, colname, false, - list_length(cstate->attnumlist) == 1); + list_length(cstate->attnumlist) == 1); else CopyAttributeOutText(cstate, colname); } @@ -3647,12 +3716,53 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields) /* only available for text or csv input */ Assert(!cstate->binary); - /* on input just throw the header line away */ - if (cstate->cur_lineno == 0 && cstate->header_line) + /* on input check that the header line is correct if needed */ + if (cstate->cur_lineno == 0 && cstate->header_line != COPY_HEADER_ABSENT) { + ListCell *cur; + TupleDesc tupDesc; + + tupDesc = RelationGetDescr(cstate->rel); + cstate->cur_lineno++; - if (CopyReadLine(cstate)) - return false; /* done */ + done = CopyReadLine(cstate); + + if (cstate->header_line == COPY_HEADER_MATCH) + { + if (cstate->csv_mode) + fldct = CopyReadAttributesCSV(cstate); + else + fldct = CopyReadAttributesText(cstate); + + if (fldct < list_length(cstate->attnumlist)) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("missing header"))); + else if (fldct > list_length(cstate->attnumlist)) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("extra data after last expected header"))); + + foreach(cur, cstate->attnumlist) + { + int attnum = lfirst_int(cur); + char *colName = cstate->raw_fields[attnum - 1]; + Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1); + + if (colName == NULL) + colName = cstate->null_print; + + if (namestrcmp(&attr->attname, colName) != 0) { + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("wrong header for column \"%s\": got \"%s\"", + NameStr(attr->attname), colName))); + } + } + } + + if (done) + return false; } cstate->cur_lineno++; diff --git a/src/test/regress/input/copy.source b/src/test/regress/input/copy.source index 2368649..4d21c7d 100644 --- a/src/test/regress/input/copy.source +++ b/src/test/regress/input/copy.source @@ -213,3 +213,28 @@ select * from parted_copytest where b = 1; select * from parted_copytest where b = 2; drop table parted_copytest; + +-- Test header matching feature +create table header_copytest ( + a int, + b int, + c text +); +copy header_copytest from stdin with (header wrong_choice); +copy header_copytest from stdin with (header match); +a b c +1 2 foo +\. +copy header_copytest from stdin with (header match); +a b +1 2 +\. +copy header_copytest from stdin with (header match); +a b c d +1 2 foo bar +\. +copy header_copytest from stdin with (header match, format csv); +a,b,c +1,2,foo +\. +drop table header_copytest; diff --git a/src/test/regress/output/copy.source b/src/test/regress/output/copy.source index c1f7f99..b792181 100644 --- a/src/test/regress/output/copy.source +++ b/src/test/regress/output/copy.source @@ -173,3 +173,20 @@ select * from parted_copytest where b = 2; (1 row) drop table parted_copytest; +-- Test header matching feature +create table header_copytest ( + a int, + b int, + c text +); +copy header_copytest from stdin with (header wrong_choice); +ERROR: header requires a boolean or "match" +copy header_copytest from stdin with (header match); +copy header_copytest from stdin with (header match); +ERROR: missing header +CONTEXT: COPY header_copytest, line 1: "a b" +copy header_copytest from stdin with (header match); +ERROR: extra data after last expected header +CONTEXT: COPY header_copytest, line 1: "a b c d" +copy header_copytest from stdin with (header match, format csv); +drop table header_copytest; -- 1.8.3.1