From af3be3bd4e77b66f4605393617da0d15ec21e15b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 30 Oct 2020 18:51:10 +0200 Subject: [PATCH 1/1] WIP: Find all line-endings in COPY in chunks. Refactor CopyReadLine and friends to find all the line-endings in the buffer in one go, before splitting the lines further. --- src/backend/commands/copy.c | 972 ++++++++++++++++++++---------------- 1 file changed, 536 insertions(+), 436 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 36ddcdccdb8..fbf11cb2550 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -95,6 +95,18 @@ typedef enum CopyInsertMethod CIM_MULTI_CONDITIONAL /* use table_multi_insert only if valid */ } CopyInsertMethod; + +/* + * Tracks the quote/escape state of the CSV line-boundary scanner across buffers. + */ +typedef enum ParseLinesState +{ + PLSTATE_NORMAL, + PLSTATE_ESCAPE, + PLSTATE_IN_QUOTE, + PLSTATE_ESCAPE_IN_QUOTE +} ParseLinesState; + /* * This struct contains all the state variables used throughout a COPY * operation. For simplicity, we use the same struct for all variants of COPY, @@ -110,6 +122,24 @@ typedef enum CopyInsertMethod * it's faster to make useless comparisons to trailing bytes than it is to * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true * when we have to do it the hard way. + * + * COPY FROM buffers: + * + * In COPY FROM processing, there are three levels of buffers: + * + * raw_buf - contains raw data read from file/client + * converted_buf - contains the data in 'raw_buf', but converted to server encoding + * line_buf - contains "current" line of data, without the end-of-line char + * + * + * In simple cases, no encoding conversion is needed, and converted_buf always + * points to raw_buf. If encoding_embeds_ascii is true, encoding conversion is + * performed on the raw buffer, before splitting it to lines. converted_buf contains + * the converted version in that case. 
+ * + * Usually, line_buf pointer points in the middle of converted_buf, but when a line + * is split by a raw-buffer boundary, the incomplete line is reassembled + * in a separate buffer (split_line_buf), and line_buf points to that. */ typedef struct CopyStateData { @@ -205,16 +235,34 @@ typedef struct CopyStateData char **raw_fields; /* - * Similarly, line_buf holds the whole input line being processed. The + * These variables are used to track state of parsing raw data into + * lines in COPY FROM. + */ + bool last_was_cr; + ParseLinesState parse_lines_state; + + int last_line_no; /* last line in 'endlines', -1 if EOF not reached yet */ + + int nextline; + int *endlines; /* line ending positions within raw_buf */ + int numlines; + + /* split_line_buf holds partial line carried over from previous buf */ + StringInfoData split_line_buf; + + /* + * Similarly, line_buf holds the current input line being processed. The * input cycle is first to read the whole line into line_buf, convert it * to server encoding there, and then extract the individual attribute * fields into attribute_buf. line_buf is preserved unmodified so that we * can display it in error messages if appropriate. (In binary mode, * line_buf is not used.) */ - StringInfoData line_buf; + char *line_buf; + int line_len; bool line_buf_converted; /* converted to server encoding? */ bool line_buf_valid; /* contains the row being processed? 
*/ + bool line_buf_alloced; /* * Finally, raw_buf holds raw data read from the data source (file or @@ -230,6 +278,9 @@ typedef struct CopyStateData int raw_buf_len; /* total # of bytes stored */ /* Shorthand for number of unconsumed bytes available in raw_buf */ #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index) + + char *converted_buf; + int converted_buf_len; } CopyStateData; /* DestReceiver for COPY (query) TO */ @@ -288,72 +339,6 @@ typedef struct CopyMultiInsertInfo int ti_options; /* table insert options */ } CopyMultiInsertInfo; - -/* - * These macros centralize code used to process line_buf and raw_buf buffers. - * They are macros because they often do continue/break control and to avoid - * function call overhead in tight COPY loops. - * - * We must use "if (1)" because the usual "do {...} while(0)" wrapper would - * prevent the continue/break processing from working. We end the "if (1)" - * with "else ((void) 0)" to ensure the "if" does not unintentionally match - * any "else" in the calling code, and to avoid any compiler warnings about - * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros. - */ - -/* - * This keeps the character read at the top of the loop in the buffer - * even if there is more than one read-ahead. 
- */ -#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \ -if (1) \ -{ \ - if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \ - { \ - raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \ - need_data = true; \ - continue; \ - } \ -} else ((void) 0) - -/* This consumes the remainder of the buffer and breaks */ -#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \ -if (1) \ -{ \ - if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \ - { \ - if (extralen) \ - raw_buf_ptr = copy_buf_len; /* consume the partial character */ \ - /* backslash just before EOF, treat as data char */ \ - result = true; \ - break; \ - } \ -} else ((void) 0) - -/* - * Transfer any approved data to line_buf; must do this to be sure - * there is some room in raw_buf. - */ -#define REFILL_LINEBUF \ -if (1) \ -{ \ - if (raw_buf_ptr > cstate->raw_buf_index) \ - { \ - appendBinaryStringInfo(&cstate->line_buf, \ - cstate->raw_buf + cstate->raw_buf_index, \ - raw_buf_ptr - cstate->raw_buf_index); \ - cstate->raw_buf_index = raw_buf_ptr; \ - } \ -} else ((void) 0) - -/* Undo any read-ahead and jump out of the block. 
*/ -#define NO_END_OF_COPY_GOTO \ -if (1) \ -{ \ - raw_buf_ptr = prev_raw_ptr + 1; \ - goto not_end_of_copy; \ -} else ((void) 0) - static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; @@ -371,7 +356,8 @@ static uint64 DoCopyTo(CopyState cstate); static uint64 CopyTo(CopyState cstate); static void CopyOneRowTo(CopyState cstate, TupleTableSlot *slot); static bool CopyReadLine(CopyState cstate); -static bool CopyReadLineText(CopyState cstate); +static void ParseLinesText(CopyState cstate); +static void ParseLinesCSV(CopyState cstate); static int CopyReadAttributesText(CopyState cstate); static int CopyReadAttributesCSV(CopyState cstate); static Datum CopyReadBinaryAttribute(CopyState cstate, FmgrInfo *flinfo, @@ -382,7 +368,7 @@ static void CopyAttributeOutCSV(CopyState cstate, char *string, bool use_quote, bool single_attr); static List *CopyGetAttnums(TupleDesc tupDesc, Relation rel, List *attnamelist); -static char *limit_printout_length(const char *str); +static char *limit_printout_length(const char *str, int slen); /* Low-level communications functions */ static void SendCopyBegin(CopyState cstate); @@ -399,6 +385,7 @@ static bool CopyGetInt32(CopyState cstate, int32 *val); static void CopySendInt16(CopyState cstate, int16 val); static bool CopyGetInt16(CopyState cstate, int16 *val); static bool CopyLoadRawBuf(CopyState cstate); +static bool CopyLoadAndConvertBuf(CopyState cstate); static int CopyReadBinaryData(CopyState cstate, char *dest, int nbytes); @@ -2311,7 +2298,7 @@ CopyFromErrorCallback(void *arg) /* error is relevant to a particular column */ char *attval; - attval = limit_printout_length(cstate->cur_attval); + attval = limit_printout_length(cstate->cur_attval, strlen(cstate->cur_attval)); errcontext("COPY %s, line %s, column %s: \"%s\"", cstate->cur_relname, curlineno_str, cstate->cur_attname, attval); @@ -2341,7 +2328,7 @@ CopyFromErrorCallback(void *arg) { char *lineval; - lineval = limit_printout_length(cstate->line_buf.data); + lineval 
= limit_printout_length(cstate->line_buf, cstate->line_len); errcontext("COPY %s, line %s: \"%s\"", cstate->cur_relname, curlineno_str, lineval); pfree(lineval); @@ -2361,11 +2348,10 @@ CopyFromErrorCallback(void *arg) * Returns a pstrdup'd copy of the input. */ static char * -limit_printout_length(const char *str) +limit_printout_length(const char *str, int slen) { #define MAX_COPY_DATA_DISPLAY 100 - int slen = strlen(str); int len; char *res; @@ -2819,7 +2805,6 @@ CopyFrom(CopyState cstate) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot perform COPY FREEZE because the table was not created or truncated in the current subtransaction"))); - ti_options |= TABLE_INSERT_FROZEN; } @@ -3224,7 +3209,7 @@ CopyFrom(CopyState cstate) /* Add this tuple to the tuple buffer */ CopyMultiInsertInfoStore(&multiInsertInfo, resultRelInfo, myslot, - cstate->line_buf.len, + 100, // FIXME cstate->line_buf.len, cstate->cur_lineno); /* @@ -3387,16 +3372,30 @@ BeginCopyFrom(ParseState *pstate, /* * Set up variables to avoid per-attribute overhead. attribute_buf and - * raw_buf are used in both text and binary modes, but we use line_buf - * only in text mode. + * raw_buf are used in both text and binary modes, but text mode has + * some extra state. */ initStringInfo(&cstate->attribute_buf); cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1); cstate->raw_buf_index = cstate->raw_buf_len = 0; if (!cstate->binary) { - initStringInfo(&cstate->line_buf); + cstate->last_was_cr = false; + cstate->parse_lines_state = PLSTATE_NORMAL; + cstate->last_line_no = -1; + cstate->nextline = 0; + cstate->endlines = palloc((RAW_BUF_SIZE + 1) * sizeof(int)); + cstate->numlines = 0; + + initStringInfo(&cstate->split_line_buf); + + cstate->line_buf = NULL; + cstate->line_len = 0; cstate->line_buf_converted = false; + cstate->line_buf_valid = false; + cstate->line_buf_alloced = false; + + cstate->converted_buf = NULL; } /* Assign range table, we'll need it in CopyFrom. 
*/ @@ -3634,7 +3633,7 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields) * characters, we act as though it was newline followed by EOF, ie, * process the line and then exit loop on next iteration. */ - if (done && cstate->line_buf.len == 0) + if (done && cstate->line_len == 0) return false; /* Parse the line into de-escaped field values */ @@ -3863,451 +3862,550 @@ EndCopyFrom(CopyState cstate) static bool CopyReadLine(CopyState cstate) { - bool result; - - resetStringInfo(&cstate->line_buf); - cstate->line_buf_valid = true; + resetStringInfo(&cstate->split_line_buf); /* Mark that encoding conversion hasn't occurred yet */ cstate->line_buf_converted = false; + cstate->line_buf_valid = false; + + if (cstate->line_buf_alloced) + pfree(cstate->line_buf); - /* Parse data and transfer into line_buf */ - result = CopyReadLineText(cstate); + if (cstate->split_line_buf.data > 0) + resetStringInfo(&cstate->split_line_buf); - if (result) + if (cstate->last_line_no != -1 && cstate->nextline > cstate->last_line_no) + return true; + + /* + * If we processed all lines from previous batch, load more + */ + if (cstate->nextline == cstate->numlines) { - /* - * Reached EOF. In protocol version 3, we should ignore anything - * after \. up to the protocol end of copy data. (XXX maybe better - * not to treat \. as special?) - */ - if (cstate->copy_dest == COPY_NEW_FE) + for (;;) { - do + int endpos; + bool done; + + cstate->nextline = 0; + + /* + * Transfer any remaining data from previous buffer to split_line_buf. + */ + if (cstate->numlines == 0) + { + /* this chunk contained no line-ends at all. 
*/ + endpos = 0; + } + else + { + endpos = cstate->endlines[cstate->numlines - 1]; + } + appendBinaryStringInfo(&cstate->split_line_buf, cstate->raw_buf + endpos, + cstate->raw_buf_len - endpos); + + /* Get next raw (and possibly converted) buf */ + done = !CopyLoadAndConvertBuf(cstate); + + /* Detect line boundaries within the buffer */ + if (cstate->csv_mode) + ParseLinesCSV(cstate); + else + ParseLinesText(cstate); + + /* + * If we reached the EOF, remember it, and add a sentinel end-of-line to + * 'endlines' so that the logic below doesn't need to special case the + * last line. + */ + if (done) { - cstate->raw_buf_index = cstate->raw_buf_len; - } while (CopyLoadRawBuf(cstate)); + cstate->last_line_no = cstate->numlines; + cstate->endlines[cstate->numlines] = cstate->converted_buf_len; + cstate->numlines++; + break; + } + else + cstate->last_line_no = -1; + + if (cstate->numlines > 0) + break; } } + + Assert(cstate->nextline < cstate->numlines); + + /* + * The first line in this buffer could be a continuation of a split line that + * started in the previous buffer. Treat it specially. + */ + if (cstate->nextline == 0) + { + if (cstate->split_line_buf.len > 0) + { + appendBinaryStringInfo(&cstate->split_line_buf, cstate->converted_buf, + cstate->endlines[0]); + cstate->line_buf = cstate->split_line_buf.data; + cstate->line_len = cstate->split_line_buf.len; + } + else + { + cstate->line_buf = cstate->converted_buf; + cstate->line_len = cstate->endlines[0]; + } + } + else + { + int startpos; + int endpos; + + startpos = cstate->endlines[cstate->nextline - 1]; + endpos = cstate->endlines[cstate->nextline]; + + cstate->line_buf = cstate->converted_buf + startpos; + cstate->line_len = endpos - startpos; + } + + if (cstate->nextline == cstate->last_line_no) + { + /* + * EOF at start of line means we're done. If we see EOF after some + * characters, we act as though it was newline followed by EOF, ie, + * process the line and then exit loop on next iteration. 
+ */ + if (cstate->line_len == 0) + return true; + } else { /* * If we didn't hit EOF, then we must have transferred the EOL marker * to line_buf along with the data. Get rid of it. */ - switch (cstate->eol_type) + if (cstate->nextline != cstate->last_line_no) { - case EOL_NL: - Assert(cstate->line_buf.len >= 1); - Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); - cstate->line_buf.len--; - cstate->line_buf.data[cstate->line_buf.len] = '\0'; - break; - case EOL_CR: - Assert(cstate->line_buf.len >= 1); - Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r'); - cstate->line_buf.len--; - cstate->line_buf.data[cstate->line_buf.len] = '\0'; - break; - case EOL_CRNL: - Assert(cstate->line_buf.len >= 2); - Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r'); - Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); - cstate->line_buf.len -= 2; - cstate->line_buf.data[cstate->line_buf.len] = '\0'; - break; - case EOL_UNKNOWN: - /* shouldn't get here */ - Assert(false); - break; + switch (cstate->eol_type) + { + case EOL_NL: + Assert(cstate->line_len >= 1); + Assert(cstate->line_buf[cstate->line_len - 1] == '\n'); + cstate->line_len--; + cstate->line_buf[cstate->line_len] = '\0'; + break; + case EOL_CR: + Assert(cstate->line_len >= 1); + Assert(cstate->line_buf[cstate->line_len - 1] == '\r'); + cstate->line_len--; + cstate->line_buf[cstate->line_len] = '\0'; + break; + case EOL_CRNL: + Assert(cstate->line_len >= 2); + Assert(cstate->line_buf[cstate->line_len - 2] == '\r'); + Assert(cstate->line_buf[cstate->line_len - 1] == '\n'); + cstate->line_len -= 2; + cstate->line_buf[cstate->line_len] = '\0'; + break; + case EOL_UNKNOWN: + /* shouldn't get here */ + Assert(false); + break; + } } } + cstate->nextline++; - /* Done reading the line. Convert it to server encoding. */ - if (cstate->need_transcoding) + cstate->line_buf_valid = true; + cstate->line_buf_alloced = false; + + /* + * Done reading the line. 
Convert it to server encoding. If the encoding was + * one that embeds ASCII, we did it for the whole raw buffer already + */ + if (cstate->need_transcoding && !cstate->encoding_embeds_ascii) { char *cvt; - cvt = pg_any_to_server(cstate->line_buf.data, - cstate->line_buf.len, + cvt = pg_any_to_server(cstate->line_buf, cstate->line_len, cstate->file_encoding); - if (cvt != cstate->line_buf.data) + if (cvt != cstate->line_buf) { /* transfer converted data back to line_buf */ - resetStringInfo(&cstate->line_buf); - appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt)); - pfree(cvt); + cstate->line_buf = cvt; + cstate->line_len = strlen(cvt); + cstate->line_buf_alloced = true; } } /* Now it's safe to use the buffer in error messages */ cstate->line_buf_converted = true; - return result; + return false; } -/* - * CopyReadLineText - inner loop of CopyReadLine for text mode - */ static bool -CopyReadLineText(CopyState cstate) +CopyLoadAndConvertBuf(CopyState cstate) { - char *copy_raw_buf; - int raw_buf_ptr; - int copy_buf_len; - bool need_data = false; - bool hit_eof = false; - bool result = false; - char mblen_str[2]; - - /* CSV variables */ - bool first_char_in_line = true; - bool in_quote = false, - last_was_esc = false; - char quotec = '\0'; - char escapec = '\0'; + bool moredata; - if (cstate->csv_mode) + /* Get next raw buf */ + moredata = CopyLoadRawBuf(cstate); + + /* convert if necessary */ + if (cstate->encoding_embeds_ascii) { - quotec = cstate->quote[0]; - escapec = cstate->escape[0]; - /* ignore special escape processing if it's the same as quotec */ - if (quotec == escapec) - escapec = '\0'; + Assert(cstate->need_transcoding); + + if (cstate->converted_buf && cstate->converted_buf != cstate->raw_buf) + pfree(cstate->converted_buf); + + while (moredata && cstate->raw_buf_len < MAX_CONVERSION_GROWTH) + moredata = CopyLoadRawBuf(cstate); + + if (!moredata) + { + cstate->raw_buf_index = cstate->raw_buf_len; + } + else + { + /* Find length */ + char *p; + 
char *pend; + + p = cstate->raw_buf; + pend = cstate->raw_buf + cstate->raw_buf_len; + while (p < pend - MAX_CONVERSION_GROWTH) + { + if (IS_HIGHBIT_SET(*p)) + { + int mblen; + + mblen = pg_encoding_mblen(cstate->file_encoding, p); + p += mblen; + } + else + p++; + } + cstate->raw_buf_index = pend - p; + } + cstate->converted_buf = pg_any_to_server(cstate->raw_buf, + cstate->raw_buf_index, + cstate->file_encoding); + if (cstate->converted_buf != cstate->raw_buf) + cstate->converted_buf_len = strlen(cstate->converted_buf); + else + cstate->converted_buf_len = cstate->raw_buf_index; + } + else + { + cstate->converted_buf = cstate->raw_buf; + cstate->converted_buf_len = cstate->raw_buf_len; + cstate->raw_buf_index = cstate->raw_buf_len; } - mblen_str[1] = '\0'; + return moredata; +} + +/* + * Find all newlines (or CRs or CRNLs) in the buffer in cstate->converted_buf. + * + * The positions of the newlines are stored in the cstate->endlines array. + * Each position points to the *next* character, after the newline. + * + * A position can also be 0, meaning that there was a newline immediately + * before the current buffer. That case can currently only arise when + * processing the first line in EOL_UNKNOWN mode, and we see a CR at the + * end of a buffer. In that case, we won't know until we see the first + * character of the next buffer, that the CR at the end of the previous + * buffer was really the end-of-line. + */ +static void +ParseLinesText(CopyState cstate) +{ + /* pre-requisites: there is data in converted_buf */ + char *startp; + char *p; + char *endp; + int *endlines; + int nlines; /* - * The objective of this loop is to transfer the entire next input line - * into line_buf. Hence, we only care for detecting newlines (\r and/or - * \n) and the end-of-copy marker (\.). - * - * In CSV mode, \r and \n inside a quoted field are just part of the data - * value and are put in line_buf. We keep just enough state to know if we - * are currently in a quoted field or not. 
- * - * These four characters, and the CSV escape and quote characters, are - * assumed the same in frontend and backend encodings. + * TODO: support multibyte encodings. Plan: * - * For speed, we try to move data from raw_buf to line_buf in chunks - * rather than one character at a time. raw_buf_ptr points to the next - * character to examine; any characters from raw_buf_index to raw_buf_ptr - * have been determined to be part of the line, but not yet transferred to - * line_buf. + * If encoding_embeds_ascii, the caller converts the raw buffer + * before calling this function, scanning through the buffer with + * pg_mblen() to find the multibyte character boundary. Stash any + * remaining bytes for next call. * - * For a little extra speed within the loop, we copy raw_buf and - * raw_buf_len into local variables. + * Otherwise, the conversion can be done separately on each line, after + * calling this function. */ - copy_raw_buf = cstate->raw_buf; - raw_buf_ptr = cstate->raw_buf_index; - copy_buf_len = cstate->raw_buf_len; - for (;;) - { - int prev_raw_ptr; - char c; + p = cstate->converted_buf; + startp = cstate->converted_buf; + endp = cstate->converted_buf + cstate->converted_buf_len; - /* - * Load more data if needed. Ideally we would just force four bytes - * of read-ahead and avoid the many calls to - * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol - * does not allow us to read too far ahead or we might read into the - * next data, so we read-ahead only as far we know we can. One - * optimization would be to read-ahead four byte here if - * cstate->copy_dest != COPY_OLD_FE, but it hardly seems worth it, - * considering the size of the buffer. - */ - if (raw_buf_ptr >= copy_buf_len || need_data) - { - REFILL_LINEBUF; + endlines = cstate->endlines; + nlines = 0; - /* - * Try to read some more data. This will certainly reset - * raw_buf_index to zero, and raw_buf_ptr must go with it. 
- */ - if (!CopyLoadRawBuf(cstate)) - hit_eof = true; - raw_buf_ptr = 0; - copy_buf_len = cstate->raw_buf_len; + if (cstate->eol_type == EOL_UNKNOWN) + { + while (p < endp) + { + char c = *(p++); - /* - * If we are completely out of data, break out of the loop, - * reporting EOF. - */ - if (copy_buf_len <= 0) + if (c == '\n') + { + if (cstate->last_was_cr) + cstate->eol_type = EOL_CRNL; + else + cstate->eol_type = EOL_NL; + endlines[nlines++] = p - startp; + break; + } + else if (cstate->last_was_cr) { - result = true; + /* + * The previous character was \r, and this character is the first + * character of the next line. The line ended just *before* this + * character. + */ + endlines[nlines++] = (p - 1) - startp; + cstate->eol_type = EOL_CR; + cstate->last_was_cr = false; /* not used in EOL_CR mode */ break; } - need_data = false; + else if (c == '\r') + { + cstate->last_was_cr = true; + } } + /* continue processing according to the new 'eol_type' */ + } - /* OK to fetch a character */ - prev_raw_ptr = raw_buf_ptr; - c = copy_raw_buf[raw_buf_ptr++]; - - if (cstate->csv_mode) + if (cstate->eol_type == EOL_NL) + { + while (p < endp) { - /* - * If character is '\\' or '\r', we may need to look ahead below. - * Force fetch of the next character if we don't already have it. - * We need to do this before changing CSV state, in case one of - * these characters is also the quote or escape character. - * - * Note: old-protocol does not like forced prefetch, but it's OK - * here since we cannot validly be at EOF. - */ - if (c == '\\' || c == '\r') + char c = *(p++); + + /* Process \n */ + if (c == '\n') { - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); + endlines[nlines++] = p - startp; } - /* - * Dealing with quotes and escapes here is mildly tricky. If the - * quote char is also the escape char, there's no problem - we - * just use the char as a toggle. 
If they are different, we need - * to ensure that we only take account of an escape inside a - * quoted field and immediately preceding a quote char, and not - * the second in an escape-escape sequence. - */ - if (in_quote && c == escapec) - last_was_esc = !last_was_esc; - if (c == quotec && !last_was_esc) - in_quote = !in_quote; - if (c != escapec) - last_was_esc = false; - - /* - * Updating the line count for embedded CR and/or LF chars is - * necessarily a little fragile - this test is probably about the - * best we can do. (XXX it's arguable whether we should do this - * at all --- is cur_lineno a physical or logical count?) - */ - if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r')) - cstate->cur_lineno++; + /* Process \r */ + if (c == '\r') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal carriage return found in data"), + errhint("Use \"\\r\" to represent carriage return."))); } - - /* Process \r */ - if (c == '\r' && (!cstate->csv_mode || !in_quote)) + } + else if (cstate->eol_type == EOL_CR) + { + while (p < endp) { - /* Check for \r\n on first line, _and_ handle \r\n. */ - if (cstate->eol_type == EOL_UNKNOWN || - cstate->eol_type == EOL_CRNL) - { - /* - * If need more data, go back to loop top to load it. - * - * Note that if we are at EOF, c will wind up as '\0' because - * of the guaranteed pad of raw_buf. 
- */ - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); + char c = *(p++); - /* get next char */ - c = copy_raw_buf[raw_buf_ptr]; + /* Process \r */ + if (c == '\r') + endlines[nlines++] = p - startp; - if (c == '\n') + /* Process \n */ + if (c == '\r') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal newline found in data"), + errhint("Use \"\\n\" to represent carriage return."))); + } + } + else if (cstate->eol_type == EOL_CRNL) + { + while (p < endp) + { + char c = *(p++); + + if (c == '\n') + { + if (cstate->last_was_cr) { - raw_buf_ptr++; /* eat newline */ - cstate->eol_type = EOL_CRNL; /* in case not set yet */ + endlines[nlines++] = p - startp; + cstate->last_was_cr = false; } else - { - /* found \r, but no \n */ - if (cstate->eol_type == EOL_CRNL) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - !cstate->csv_mode ? - errmsg("literal carriage return found in data") : - errmsg("unquoted carriage return found in data"), - !cstate->csv_mode ? - errhint("Use \"\\r\" to represent carriage return.") : - errhint("Use quoted CSV field to represent carriage return."))); - - /* - * if we got here, it is the first line and we didn't find - * \n, so don't consume the peeked character - */ - cstate->eol_type = EOL_CR; - } + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal newline found in data"), + errhint("Use \"\\n\" to represent carriage return."))); } - else if (cstate->eol_type == EOL_NL) + else if (cstate->last_was_cr) + { ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - !cstate->csv_mode ? - errmsg("literal carriage return found in data") : - errmsg("unquoted carriage return found in data"), - !cstate->csv_mode ? 
- errhint("Use \"\\r\" to represent carriage return.") : - errhint("Use quoted CSV field to represent carriage return."))); - /* If reach here, we have found the line terminator */ - break; + errmsg("literal carriage return found in data"), + errhint("Use \"\\r\" to represent carriage return."))); + } + else if (c == '\r') + { + cstate->last_was_cr = true; + } } + } - /* Process \n */ - if (c == '\n' && (!cstate->csv_mode || !in_quote)) - { - if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - !cstate->csv_mode ? - errmsg("literal newline found in data") : - errmsg("unquoted newline found in data"), - !cstate->csv_mode ? - errhint("Use \"\\n\" to represent newline.") : - errhint("Use quoted CSV field to represent newline."))); - cstate->eol_type = EOL_NL; /* in case not set yet */ - /* If reach here, we have found the line terminator */ - break; - } + cstate->numlines = nlines; +} - /* - * In CSV mode, we only recognize \. alone on a line. This is because - * \. is a valid CSV data value. - */ - if (c == '\\' && (!cstate->csv_mode || first_char_in_line)) - { - char c2; +/* + * Like ParseLinesText, but in CSV mode. + */ +static void +ParseLinesCSV(CopyState cstate) +{ + /* pre-requisites: there is data in converted_buf */ + char *startp; + char *p; + char *endp; + int *endlines; + int nlines; + int state = cstate->parse_lines_state; + char quotec = '\0'; + char escapec = '\0'; - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); - IF_NEED_REFILL_AND_EOF_BREAK(0); + quotec = cstate->quote[0]; + escapec = cstate->escape[0]; + /* ignore special escape processing if it's the same as quotec */ + if (quotec == escapec) + escapec = '\0'; - /* ----- - * get next character - * Note: we do not change c so if it isn't \., we can fall - * through and continue processing for file encoding. 
- * ----- - */ - c2 = copy_raw_buf[raw_buf_ptr]; + p = cstate->converted_buf; + startp = cstate->converted_buf; + endp = cstate->converted_buf + cstate->converted_buf_len; - if (c2 == '.') - { - raw_buf_ptr++; /* consume the '.' */ + endlines = cstate->endlines; + nlines = 0; - /* - * Note: if we loop back for more data here, it does not - * matter that the CSV state change checks are re-executed; we - * will come back here with no important state changed. - */ - if (cstate->eol_type == EOL_CRNL) - { - /* Get the next character */ - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); - /* if hit_eof, c2 will become '\0' */ - c2 = copy_raw_buf[raw_buf_ptr++]; + while (p < endp) + { + char c = *(p++); + bool last_was_cr; + + last_was_cr = cstate->last_was_cr; + cstate->last_was_cr = false; - if (c2 == '\n') + switch (state) + { + case PLSTATE_NORMAL: + if (c == '\n') + { + if (cstate->eol_type == EOL_NL) + endlines[nlines++] = p - startp; + else if (cstate->eol_type == EOL_CR) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted newline found in data"), + errhint("Use quoted CSV field to represent newline."))); + else if (cstate->eol_type == EOL_CRNL) { - if (!cstate->csv_mode) + if (last_was_cr) + endlines[nlines++] = p - startp; + else ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker does not match previous newline style"))); + errmsg("unquoted newline found in data"), + errhint("Use quoted CSV field to represent newline."))); + } + else if (cstate->eol_type == EOL_UNKNOWN) + { + if (last_was_cr) + cstate->eol_type = EOL_CRNL; else - NO_END_OF_COPY_GOTO; + cstate->eol_type = EOL_NL; + endlines[nlines++] = p - startp; } - else if (c2 != '\r') + } + else if (c == '\r') + { + if (cstate->eol_type == EOL_NL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in data"), + errhint("Use quoted CSV field to represent carriage return."))); + else if (cstate->eol_type == EOL_CR) 
+ endlines[nlines++] = p - startp; + else if (cstate->eol_type == EOL_CRNL) { - if (!cstate->csv_mode) + if (last_was_cr) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); + errmsg("unquoted carriage return found in data"), + errhint("Use quoted CSV field to represent carriage return."))); + cstate->last_was_cr = true; + } + else if (cstate->eol_type == EOL_UNKNOWN) + { + if (last_was_cr) + { + /* oops, the previous char was actually a line boundary already */ + cstate->eol_type = EOL_CR; + endlines[nlines++] = (p - 1) - startp; + endlines[nlines++] = p - startp; + } else - NO_END_OF_COPY_GOTO; + cstate->last_was_cr = true; } } - - /* Get the next character */ - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); - /* if hit_eof, c2 will become '\0' */ - c2 = copy_raw_buf[raw_buf_ptr++]; - - if (c2 != '\r' && c2 != '\n') + else if (c == escapec) + state = PLSTATE_ESCAPE; + else if (c == quotec) + state = PLSTATE_IN_QUOTE; + else if (last_was_cr) { - if (!cstate->csv_mode) + if (cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); + errmsg("unquoted carriage return found in data"), + errhint("Use quoted CSV field to represent carriage return."))); else - NO_END_OF_COPY_GOTO; + { + Assert(cstate->eol_type == EOL_UNKNOWN); + cstate->eol_type = EOL_CR; + endlines[nlines++] = p - startp; + } } + break; - if ((cstate->eol_type == EOL_NL && c2 != '\n') || - (cstate->eol_type == EOL_CRNL && c2 != '\n') || - (cstate->eol_type == EOL_CR && c2 != '\r')) + case PLSTATE_ESCAPE: + if (quotec == escapec && c != quotec) { - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker does not match previous newline style"))); + /* the escape was actually a quote */ + state = PLSTATE_IN_QUOTE; } - - /* - * Transfer only the data before the \. into line_buf, then - * discard the data and the \. sequence. 
- */ - if (prev_raw_ptr > cstate->raw_buf_index) - appendBinaryStringInfo(&cstate->line_buf, - cstate->raw_buf + cstate->raw_buf_index, - prev_raw_ptr - cstate->raw_buf_index); - cstate->raw_buf_index = raw_buf_ptr; - result = true; /* report EOF */ + else + state = PLSTATE_NORMAL; break; - } - else if (!cstate->csv_mode) - /* - * If we are here, it means we found a backslash followed by - * something other than a period. In non-CSV mode, anything - * after a backslash is special, so we skip over that second - * character too. If we didn't do that \\. would be - * considered an eof-of copy, while in non-CSV mode it is a - * literal backslash followed by a period. In CSV mode, - * backslashes are not special, so we want to process the - * character after the backslash just like a normal character, - * so we don't increment in those cases. - */ - raw_buf_ptr++; - } - - /* - * This label is for CSV cases where \. appears at the start of a - * line, but there is more text after it, meaning it was a data value. - * We are more strict for \. in CSV mode because \. could be a data - * value, while in non-CSV mode, \. cannot be a data value. - */ -not_end_of_copy: - - /* - * Process all bytes of a multi-byte character as a group. - * - * We only support multi-byte sequences where the first byte has the - * high-bit set, so as an optimization we can avoid this block - * entirely if it is not set. - */ - if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) - { - int mblen; - - /* - * It is enough to look at the first byte in all our encodings, to - * get the length. 
(GB18030 is a bit special, but still works for - * our purposes; see comment in pg_gb18030_mblen()) - */ - mblen_str[0] = c; - mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str); + case PLSTATE_IN_QUOTE: + if (c == escapec) + state = PLSTATE_ESCAPE_IN_QUOTE; + else if (c == quotec) + state = PLSTATE_NORMAL; + break; - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1); - IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1); - raw_buf_ptr += mblen - 1; + case PLSTATE_ESCAPE_IN_QUOTE: + if (quotec == escapec && c != quotec) + { + /* the escape was actually the end quote */ + state = PLSTATE_NORMAL; + continue; /* process this byte again, as a normal */ + } + else + state = PLSTATE_IN_QUOTE; + break; } - first_char_in_line = false; - } /* end of outer loop */ - - /* - * Transfer any still-uncopied data to line_buf. - */ - REFILL_LINEBUF; - - return result; + } + cstate->numlines = nlines; + cstate->parse_lines_state = state; } /* @@ -4344,6 +4442,8 @@ GetDecimalFromHex(char hex) static int CopyReadAttributesText(CopyState cstate) { + char *line_buf = cstate->line_buf; + int len = cstate->line_len; char delimc = cstate->delim[0]; int fieldno; char *output_ptr; @@ -4356,7 +4456,7 @@ CopyReadAttributesText(CopyState cstate) */ if (cstate->max_fields <= 0) { - if (cstate->line_buf.len != 0) + if (len != 0) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("extra data after last expected column"))); @@ -4372,13 +4472,13 @@ CopyReadAttributesText(CopyState cstate) * it this way because enlarging attribute_buf mid-stream would invalidate * pointers already stored into cstate->raw_fields[]. 
*/ - if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) - enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); + if (cstate->attribute_buf.maxlen <= len) + enlargeStringInfo(&cstate->attribute_buf, len); output_ptr = cstate->attribute_buf.data; /* set pointer variables for loop */ - cur_ptr = cstate->line_buf.data; - line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; + cur_ptr = line_buf; + line_end_ptr = line_buf + len; /* Outer loop iterates over fields */ fieldno = 0; @@ -4586,7 +4686,7 @@ CopyReadAttributesCSV(CopyState cstate) */ if (cstate->max_fields <= 0) { - if (cstate->line_buf.len != 0) + if (cstate->line_len != 0) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("extra data after last expected column"))); @@ -4602,13 +4702,13 @@ CopyReadAttributesCSV(CopyState cstate) * it this way because enlarging attribute_buf mid-stream would invalidate * pointers already stored into cstate->raw_fields[]. */ - if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) - enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); + if (cstate->attribute_buf.maxlen <= cstate->line_len) + enlargeStringInfo(&cstate->attribute_buf, cstate->line_len); output_ptr = cstate->attribute_buf.data; /* set pointer variables for loop */ - cur_ptr = cstate->line_buf.data; - line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; + cur_ptr = cstate->line_buf; + line_end_ptr = cstate->line_buf + cstate->line_len; /* Outer loop iterates over fields */ fieldno = 0; -- 2.20.1