From af3be3bd4e77b66f4605393617da0d15ec21e15b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 30 Oct 2020 18:51:10 +0200
Subject: [PATCH 1/1] WIP: Find all line-endings in COPY in chunks.

Refactor CopyReadLine and friends to find all the line-endings in the
buffer in one go, before splitting the lines further.
---
 src/backend/commands/copy.c | 972 ++++++++++++++++++++----------------
 1 file changed, 536 insertions(+), 436 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 36ddcdccdb8..fbf11cb2550 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -95,6 +95,18 @@ typedef enum CopyInsertMethod
 	CIM_MULTI_CONDITIONAL		/* use table_multi_insert only if valid */
 } CopyInsertMethod;
 
+
+/*
+ * Line-scanner state for COPY FROM; ParseLinesCSV starts from parse_lines_state.
+ */
+typedef enum ParseLinesState
+{
+	PLSTATE_NORMAL,				/* state in which a bare \n or \r ends a line */
+	PLSTATE_ESCAPE,				/* presumably: just saw the escape char -- TODO confirm */
+	PLSTATE_IN_QUOTE,			/* presumably: inside a quoted field -- TODO confirm */
+	PLSTATE_ESCAPE_IN_QUOTE		/* presumably: escape char inside quotes -- TODO confirm */
+} ParseLinesState;
+
 /*
  * This struct contains all the state variables used throughout a COPY
  * operation. For simplicity, we use the same struct for all variants of COPY,
@@ -110,6 +122,24 @@ typedef enum CopyInsertMethod
  * it's faster to make useless comparisons to trailing bytes than it is to
  * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
  * when we have to do it the hard way.
+ *
+ * COPY FROM buffers:
+ *
+ * In COPY FROM processing, there are three levels of buffers:
+ *
+ * raw_buf       - contains raw data read from file/client
+ * converted_buf - contains the data in 'raw_buf', but converted to server encoding
+ * line_buf      - contains "current" line of data, without the end-of-line char
+ *
+ *
+ * In simple cases, no encoding conversion is needed, and converted_buf always
+ * points to raw_buf. If encoding_embeds_ascii is true, encoding conversion is
+ * performed on the raw buffer, before splitting it into lines. converted_buf
+ * contains the converted version in that case.
+ *
+ * Usually, line_buf pointer points in the middle of converted_buf, but when a line
+ * is split by a raw-buffer boundary, the incomplete line is reassembled
+ * in a separate buffer (split_line_buf), and line_buf points to that.
  */
 typedef struct CopyStateData
 {
@@ -205,16 +235,34 @@ typedef struct CopyStateData
 	char	  **raw_fields;
 
 	/*
-	 * Similarly, line_buf holds the whole input line being processed. The
+	 * These variables are used to track state of parsing raw data into
+	 * lines in COPY FROM.
+	 */
+	bool		last_was_cr;
+	ParseLinesState parse_lines_state;
+
+	int			last_line_no; /* last line in 'endlines', -1 if EOF not reached yet */
+
+	int			nextline;
+	int		   *endlines; /* line ending positions within raw_buf */
+	int			numlines;
+
+	/* split_line_buf holds partial line carried over from previous buf */
+	StringInfoData split_line_buf;
+
+	/*
+	 * Similarly, line_buf holds the current input line being processed. The
 	 * input cycle is first to read the whole line into line_buf, convert it
 	 * to server encoding there, and then extract the individual attribute
 	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
 	 * can display it in error messages if appropriate.  (In binary mode,
 	 * line_buf is not used.)
 	 */
-	StringInfoData line_buf;
+	char	   *line_buf;
+	int			line_len;
 	bool		line_buf_converted; /* converted to server encoding? */
 	bool		line_buf_valid; /* contains the row being processed? */
+	bool		line_buf_alloced;
 
 	/*
 	 * Finally, raw_buf holds raw data read from the data source (file or
@@ -230,6 +278,9 @@ typedef struct CopyStateData
 	int			raw_buf_len;	/* total # of bytes stored */
 	/* Shorthand for number of unconsumed bytes available in raw_buf */
 #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
+
+	char	   *converted_buf;
+	int			converted_buf_len;
 } CopyStateData;
 
 /* DestReceiver for COPY (query) TO */
@@ -288,72 +339,6 @@ typedef struct CopyMultiInsertInfo
 	int			ti_options;		/* table insert options */
 } CopyMultiInsertInfo;
 
-
-/*
- * These macros centralize code used to process line_buf and raw_buf buffers.
- * They are macros because they often do continue/break control and to avoid
- * function call overhead in tight COPY loops.
- *
- * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
- * prevent the continue/break processing from working.  We end the "if (1)"
- * with "else ((void) 0)" to ensure the "if" does not unintentionally match
- * any "else" in the calling code, and to avoid any compiler warnings about
- * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
- */
-
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
-/* This consumes the remainder of the buffer and breaks */
-#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
-	{ \
-		if (extralen) \
-			raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
-		/* backslash just before EOF, treat as data char */ \
-		result = true; \
-		break; \
-	} \
-} else ((void) 0)
-
-/*
- * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
- */
-#define REFILL_LINEBUF \
-if (1) \
-{ \
-	if (raw_buf_ptr > cstate->raw_buf_index) \
-	{ \
-		appendBinaryStringInfo(&cstate->line_buf, \
-							 cstate->raw_buf + cstate->raw_buf_index, \
-							   raw_buf_ptr - cstate->raw_buf_index); \
-		cstate->raw_buf_index = raw_buf_ptr; \
-	} \
-} else ((void) 0)
-
-/* Undo any read-ahead and jump out of the block. */
-#define NO_END_OF_COPY_GOTO \
-if (1) \
-{ \
-	raw_buf_ptr = prev_raw_ptr + 1; \
-	goto not_end_of_copy; \
-} else ((void) 0)
-
 static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
 
 
@@ -371,7 +356,8 @@ static uint64 DoCopyTo(CopyState cstate);
 static uint64 CopyTo(CopyState cstate);
 static void CopyOneRowTo(CopyState cstate, TupleTableSlot *slot);
 static bool CopyReadLine(CopyState cstate);
-static bool CopyReadLineText(CopyState cstate);
+static void ParseLinesText(CopyState cstate);
+static void ParseLinesCSV(CopyState cstate);
 static int	CopyReadAttributesText(CopyState cstate);
 static int	CopyReadAttributesCSV(CopyState cstate);
 static Datum CopyReadBinaryAttribute(CopyState cstate, FmgrInfo *flinfo,
@@ -382,7 +368,7 @@ static void CopyAttributeOutCSV(CopyState cstate, char *string,
 								bool use_quote, bool single_attr);
 static List *CopyGetAttnums(TupleDesc tupDesc, Relation rel,
 							List *attnamelist);
-static char *limit_printout_length(const char *str);
+static char *limit_printout_length(const char *str, int slen);
 
 /* Low-level communications functions */
 static void SendCopyBegin(CopyState cstate);
@@ -399,6 +385,7 @@ static bool CopyGetInt32(CopyState cstate, int32 *val);
 static void CopySendInt16(CopyState cstate, int16 val);
 static bool CopyGetInt16(CopyState cstate, int16 *val);
 static bool CopyLoadRawBuf(CopyState cstate);
+static bool CopyLoadAndConvertBuf(CopyState cstate);
 static int	CopyReadBinaryData(CopyState cstate, char *dest, int nbytes);
 
 
@@ -2311,7 +2298,7 @@ CopyFromErrorCallback(void *arg)
 			/* error is relevant to a particular column */
 			char	   *attval;
 
-			attval = limit_printout_length(cstate->cur_attval);
+			attval = limit_printout_length(cstate->cur_attval, strlen(cstate->cur_attval));
 			errcontext("COPY %s, line %s, column %s: \"%s\"",
 					   cstate->cur_relname, curlineno_str,
 					   cstate->cur_attname, attval);
@@ -2341,7 +2328,7 @@ CopyFromErrorCallback(void *arg)
 			{
 				char	   *lineval;
 
-				lineval = limit_printout_length(cstate->line_buf.data);
+				lineval = limit_printout_length(cstate->line_buf, cstate->line_len);
 				errcontext("COPY %s, line %s: \"%s\"",
 						   cstate->cur_relname, curlineno_str, lineval);
 				pfree(lineval);
@@ -2361,11 +2348,10 @@ CopyFromErrorCallback(void *arg)
  * Returns a pstrdup'd copy of the input.
  */
 static char *
-limit_printout_length(const char *str)
+limit_printout_length(const char *str, int slen)
 {
 #define MAX_COPY_DATA_DISPLAY 100
 
-	int			slen = strlen(str);
 	int			len;
 	char	   *res;
 
@@ -2819,7 +2805,6 @@ CopyFrom(CopyState cstate)
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("cannot perform COPY FREEZE because the table was not created or truncated in the current subtransaction")));
-
 		ti_options |= TABLE_INSERT_FROZEN;
 	}
 
@@ -3224,7 +3209,7 @@ CopyFrom(CopyState cstate)
 					/* Add this tuple to the tuple buffer */
 					CopyMultiInsertInfoStore(&multiInsertInfo,
 											 resultRelInfo, myslot,
-											 cstate->line_buf.len,
+											 100, // FIXME cstate->line_buf.len,
 											 cstate->cur_lineno);
 
 					/*
@@ -3387,16 +3372,30 @@ BeginCopyFrom(ParseState *pstate,
 
 	/*
 	 * Set up variables to avoid per-attribute overhead.  attribute_buf and
-	 * raw_buf are used in both text and binary modes, but we use line_buf
-	 * only in text mode.
+	 * raw_buf are used in both text and binary modes, but text mode has
+	 * some extra state.
 	 */
 	initStringInfo(&cstate->attribute_buf);
 	cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
 	cstate->raw_buf_index = cstate->raw_buf_len = 0;
 	if (!cstate->binary)
 	{
-		initStringInfo(&cstate->line_buf);
+		cstate->last_was_cr = false;
+		cstate->parse_lines_state = PLSTATE_NORMAL;
+		cstate->last_line_no = -1;
+		cstate->nextline = 0;
+		cstate->endlines = palloc((RAW_BUF_SIZE + 1) * sizeof(int));
+		cstate->numlines = 0;
+
+		initStringInfo(&cstate->split_line_buf);
+
+		cstate->line_buf = NULL;
+		cstate->line_len = 0;
 		cstate->line_buf_converted = false;
+		cstate->line_buf_valid = false;
+		cstate->line_buf_alloced = false;
+
+		cstate->converted_buf = NULL;
 	}
 
 	/* Assign range table, we'll need it in CopyFrom. */
@@ -3634,7 +3633,7 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields)
 	 * characters, we act as though it was newline followed by EOF, ie,
 	 * process the line and then exit loop on next iteration.
 	 */
-	if (done && cstate->line_buf.len == 0)
+	if (done && cstate->line_len == 0)
 		return false;
 
 	/* Parse the line into de-escaped field values */
@@ -3863,451 +3862,550 @@ EndCopyFrom(CopyState cstate)
 static bool
 CopyReadLine(CopyState cstate)
 {
-	bool		result;
-
-	resetStringInfo(&cstate->line_buf);
-	cstate->line_buf_valid = true;
+	resetStringInfo(&cstate->split_line_buf);
 
 	/* Mark that encoding conversion hasn't occurred yet */
 	cstate->line_buf_converted = false;
+	cstate->line_buf_valid = false;
+	if (cstate->line_buf_alloced)
+		pfree(cstate->line_buf);	/* converted copy made by the previous call */
+	cstate->line_buf_alloced = false;	/* so it can never be freed twice */
 
-	/* Parse data and transfer into line_buf */
-	result = CopyReadLineText(cstate);
+	cstate->line_buf = NULL;	/* no current line; an EOF return reports len 0 */
+	cstate->line_len = 0;
 
-	if (result)
+	if (cstate->last_line_no != -1 && cstate->nextline > cstate->last_line_no)
+		return true;			/* every line up to EOF was already returned */
+
+	/*
+	 * If we processed all lines from previous batch, load more
+	 */
+	if (cstate->nextline == cstate->numlines)
 	{
-		/*
-		 * Reached EOF.  In protocol version 3, we should ignore anything
-		 * after \. up to the protocol end of copy data.  (XXX maybe better
-		 * not to treat \. as special?)
-		 */
-		if (cstate->copy_dest == COPY_NEW_FE)
+		for (;;)
 		{
-			do
+			int			endpos;
+			bool		done;
+
+			cstate->nextline = 0;
+
+			/*
+			 * Carry the unterminated tail of the previous buffer over to
+			 * split_line_buf; it becomes the start of the next line.
+			 *
+			 * NOTE(review): 'endlines' holds offsets into converted_buf, but the
+			 * tail is copied from raw_buf.  The two are the same buffer unless
+			 * encoding_embeds_ascii forced a conversion -- confirm that case.
+			 */
+			if (cstate->numlines == 0)
+				endpos = 0;		/* previous chunk had no line-ends at all */
+			else
+				endpos = cstate->endlines[cstate->numlines - 1];
+			appendBinaryStringInfo(&cstate->split_line_buf, cstate->raw_buf + endpos,
+								   cstate->raw_buf_len - endpos);
+
+			/* Get next raw (and possibly converted) buf */
+			done = !CopyLoadAndConvertBuf(cstate);
+
+			/* Detect line boundaries within the buffer */
+			if (cstate->csv_mode)
+				ParseLinesCSV(cstate);
+			else
+				ParseLinesText(cstate);
+
+			/*
+			 * If we reached the EOF, remember it, and add a sentinel end-of-line to
+			 * 'endlines' so that the logic below doesn't need to special case the
+			 * last line.
+			 */
+			if (done)
 			{
-				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+				cstate->last_line_no = cstate->numlines;
+				cstate->endlines[cstate->numlines] = cstate->converted_buf_len;
+				cstate->numlines++;
+				break;
+			}
+			else
+				cstate->last_line_no = -1;
+
+			if (cstate->numlines > 0)
+				break;
 		}
 	}
+
+	Assert(cstate->nextline < cstate->numlines);
+
+	/*
+	 * The first line in this buffer could be a continuation of a split line that
+	 * started on previous buffer. Treat it specially.
+	 */
+	if (cstate->nextline == 0)
+	{
+		if (cstate->split_line_buf.len > 0)
+		{
+			appendBinaryStringInfo(&cstate->split_line_buf, cstate->converted_buf,
+								   cstate->endlines[0]);
+			cstate->line_buf = cstate->split_line_buf.data;
+			cstate->line_len = cstate->split_line_buf.len;
+		}
+		else
+		{
+			cstate->line_buf = cstate->converted_buf;
+			cstate->line_len = cstate->endlines[0];
+		}
+	}
+	else
+	{
+		int startpos;
+		int endpos;
+
+		startpos = cstate->endlines[cstate->nextline - 1];
+		endpos = cstate->endlines[cstate->nextline];
+
+		cstate->line_buf = cstate->converted_buf + startpos;
+		cstate->line_len = endpos - startpos;
+	}
+
+	if (cstate->nextline == cstate->last_line_no)
+	{
+		/*
+		 * EOF at start of line means we're done.  If we see EOF after some
+		 * characters, we act as though it was newline followed by EOF, ie,
+		 * process the line and then exit loop on next iteration.
+		 */
+		if (cstate->line_len == 0)
+			return true;
+	}
 	else
 	{
 		/*
 		 * If we didn't hit EOF, then we must have transferred the EOL marker
 		 * to line_buf along with the data.  Get rid of it.
 		 */
-		switch (cstate->eol_type)
+		if (cstate->nextline != cstate->last_line_no)
 		{
-			case EOL_NL:
-				Assert(cstate->line_buf.len >= 1);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
-				cstate->line_buf.len--;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_CR:
-				Assert(cstate->line_buf.len >= 1);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
-				cstate->line_buf.len--;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_CRNL:
-				Assert(cstate->line_buf.len >= 2);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
-				cstate->line_buf.len -= 2;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_UNKNOWN:
-				/* shouldn't get here */
-				Assert(false);
-				break;
+			switch (cstate->eol_type)
+			{
+				case EOL_NL:
+					Assert(cstate->line_len >= 1);
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\n');
+					cstate->line_len--;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_CR:
+					Assert(cstate->line_len >= 1);
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\r');
+					cstate->line_len--;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_CRNL:
+					Assert(cstate->line_len >= 2);
+					Assert(cstate->line_buf[cstate->line_len - 2] == '\r');
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\n');
+					cstate->line_len -= 2;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_UNKNOWN:
+					/* shouldn't get here */
+					Assert(false);
+					break;
+			}
 		}
 	}
+	cstate->nextline++;
 
-	/* Done reading the line.  Convert it to server encoding. */
-	if (cstate->need_transcoding)
+	cstate->line_buf_valid = true;
+	cstate->line_buf_alloced = false;
+
+	/*
+	 * Done reading the line.  Convert it to server encoding. If the encoding was
+	 * one that embeds ASCII, we did it for the whole raw buffer already
+	 */
+	if (cstate->need_transcoding && !cstate->encoding_embeds_ascii)
 	{
 		char	   *cvt;
 
-		cvt = pg_any_to_server(cstate->line_buf.data,
-							   cstate->line_buf.len,
+		cvt = pg_any_to_server(cstate->line_buf, cstate->line_len,
 							   cstate->file_encoding);
-		if (cvt != cstate->line_buf.data)
+		if (cvt != cstate->line_buf)
 		{
 			/* transfer converted data back to line_buf */
-			resetStringInfo(&cstate->line_buf);
-			appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt));
-			pfree(cvt);
+			cstate->line_buf = cvt;
+			cstate->line_len = strlen(cvt);
+			cstate->line_buf_alloced = true;	/* freed at the next call */
 		}
 	}
 
 	/* Now it's safe to use the buffer in error messages */
 	cstate->line_buf_converted = true;
 
-	return result;
+	return false;				/* line_buf/line_len describe a line to process */
 }
 
-/*
- * CopyReadLineText - inner loop of CopyReadLine for text mode
- */
 static bool
-CopyReadLineText(CopyState cstate)
+CopyLoadAndConvertBuf(CopyState cstate)
 {
-	char	   *copy_raw_buf;
-	int			raw_buf_ptr;
-	int			copy_buf_len;
-	bool		need_data = false;
-	bool		hit_eof = false;
-	bool		result = false;
-	char		mblen_str[2];
-
-	/* CSV variables */
-	bool		first_char_in_line = true;
-	bool		in_quote = false,
-				last_was_esc = false;
-	char		quotec = '\0';
-	char		escapec = '\0';
+	bool		moredata;		/* result of the last CopyLoadRawBuf() call */
 
-	if (cstate->csv_mode)
+	/* Load the next chunk of raw input into raw_buf */
+	moredata = CopyLoadRawBuf(cstate);
+
+	/* Set up converted_buf, converting the chunk as a whole if necessary */
+	if (cstate->encoding_embeds_ascii)
 	{
-		quotec = cstate->quote[0];
-		escapec = cstate->escape[0];
-		/* ignore special escape processing if it's the same as quotec */
-		if (quotec == escapec)
-			escapec = '\0';
+		Assert(cstate->need_transcoding);
+
+		if (cstate->converted_buf && cstate->converted_buf != cstate->raw_buf)
+			pfree(cstate->converted_buf);
+
+		while (moredata && cstate->raw_buf_len < MAX_CONVERSION_GROWTH)
+			moredata = CopyLoadRawBuf(cstate);
+
+		if (!moredata)
+		{
+			cstate->raw_buf_index = cstate->raw_buf_len;
+		}
+		else
+		{
+			/* Find how many leading bytes consist of whole characters */
+			char	   *p;
+			char	   *pend;
+
+			p = cstate->raw_buf;
+			pend = cstate->raw_buf + cstate->raw_buf_len;
+			while (p < pend - MAX_CONVERSION_GROWTH)
+			{
+				if (IS_HIGHBIT_SET(*p))
+				{
+					int			mblen;
+
+					mblen = pg_encoding_mblen(cstate->file_encoding, p);
+					p += mblen;
+				}
+				else
+					p++;
+			}
+			cstate->raw_buf_index = p - cstate->raw_buf;	/* bytes to convert now */
+		}
+		cstate->converted_buf = pg_any_to_server(cstate->raw_buf,
+												 cstate->raw_buf_index,
+												 cstate->file_encoding);
+		if (cstate->converted_buf != cstate->raw_buf)
+			cstate->converted_buf_len = strlen(cstate->converted_buf);
+		else
+			cstate->converted_buf_len = cstate->raw_buf_index;
+	}
+	else
+	{
+		cstate->converted_buf = cstate->raw_buf;
+		cstate->converted_buf_len = cstate->raw_buf_len;
+		cstate->raw_buf_index = cstate->raw_buf_len;
 	}
 
-	mblen_str[1] = '\0';
+	return moredata;
+}
+
+/*
+ * Find all line endings (NL, CR or CR+NL) in cstate->converted_buf.
+ *
+ * The offset of the character *following* each line ending is stored in
+ * cstate->endlines, and the number of entries found in cstate->numlines.
+ *
+ * An offset can also be 0, meaning that there was a line ending immediately
+ * before the current buffer. That case can currently only arise when
+ * processing the first line in EOL_UNKNOWN mode, and we see a CR at the
+ * end of a buffer. In that case, we won't know until we see the first
+ * character of the next buffer, that the CR at the end of the previous
+ * buffer was really the end-of-line.
+ */
+static void
+ParseLinesText(CopyState cstate)
+{
+	/* pre-requisites: there is data in converted_buf */
+	char	   *startp;
+	char	   *p;
+	char	   *endp;
+	int		   *endlines;
+	int			nlines;
 
 	/*
-	 * The objective of this loop is to transfer the entire next input line
-	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
-	 * \n) and the end-of-copy marker (\.).
-	 *
-	 * In CSV mode, \r and \n inside a quoted field are just part of the data
-	 * value and are put in line_buf.  We keep just enough state to know if we
-	 * are currently in a quoted field or not.
-	 *
-	 * These four characters, and the CSV escape and quote characters, are
-	 * assumed the same in frontend and backend encodings.
+	 * TODO: support multibyte encodings. Plan:
 	 *
-	 * For speed, we try to move data from raw_buf to line_buf in chunks
-	 * rather than one character at a time.  raw_buf_ptr points to the next
-	 * character to examine; any characters from raw_buf_index to raw_buf_ptr
-	 * have been determined to be part of the line, but not yet transferred to
-	 * line_buf.
+	 * If encoding_embeds_ascii, the caller converts the raw buffer
+	 * before calling this function, scanning through the buffer with
+	 * pg_mblen() to find the multibyte character boundary. Stash any
+	 * remaining bytes for next call.
 	 *
-	 * For a little extra speed within the loop, we copy raw_buf and
-	 * raw_buf_len into local variables.
+	 * Otherwise, the conversion can be done separately on each line, after
+	 * calling this function.
 	 */
-	copy_raw_buf = cstate->raw_buf;
-	raw_buf_ptr = cstate->raw_buf_index;
-	copy_buf_len = cstate->raw_buf_len;
 
-	for (;;)
-	{
-		int			prev_raw_ptr;
-		char		c;
+	p = cstate->converted_buf;
+	startp = cstate->converted_buf;
+	endp = cstate->converted_buf + cstate->converted_buf_len;
 
-		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_dest != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
-		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
-		{
-			REFILL_LINEBUF;
+	endlines = cstate->endlines;
+	nlines = 0;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+	if (cstate->eol_type == EOL_UNKNOWN)
+	{
+		while (p < endp)
+		{
+			char		c = *(p++);
 
-			/*
-			 * If we are completely out of data, break out of the loop,
-			 * reporting EOF.
-			 */
-			if (copy_buf_len <= 0)
+			if (c == '\n')
+			{
+				/* this \n consumes any pending \r, so clear the flag for below */
+				cstate->eol_type = cstate->last_was_cr ? EOL_CRNL : EOL_NL;
+				cstate->last_was_cr = false;
+				endlines[nlines++] = p - startp;
+				break;
+			}
+			else if (cstate->last_was_cr)
 			{
-				result = true;
+				/*
+				 * The previous character was \r, and this character is the first
+				 * character of the next line. The line ended just *before* this
+				 * character, so un-read it and let the EOL_CR loop examine it.
+				 */
+				p--;
+				endlines[nlines++] = p - startp;
+				cstate->eol_type = EOL_CR;
+				cstate->last_was_cr = false; /* not used in EOL_CR mode */
 				break;
 			}
-			need_data = false;
+			else if (c == '\r')
+			{
+				cstate->last_was_cr = true;
+			}
 		}
+		/* continue processing according to the new 'eol_type' */
+	}
 
-		/* OK to fetch a character */
-		prev_raw_ptr = raw_buf_ptr;
-		c = copy_raw_buf[raw_buf_ptr++];
-
-		if (cstate->csv_mode)
+	if (cstate->eol_type == EOL_NL)
+	{
+		while (p < endp)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
+			char		c = *(p++);
+
+			/* Process \n */
+			if (c == '\n')
 			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+				endlines[nlines++] = p - startp;
 			}
 
-			/*
-			 * Dealing with quotes and escapes here is mildly tricky. If the
-			 * quote char is also the escape char, there's no problem - we
-			 * just use the char as a toggle. If they are different, we need
-			 * to ensure that we only take account of an escape inside a
-			 * quoted field and immediately preceding a quote char, and not
-			 * the second in an escape-escape sequence.
-			 */
-			if (in_quote && c == escapec)
-				last_was_esc = !last_was_esc;
-			if (c == quotec && !last_was_esc)
-				in_quote = !in_quote;
-			if (c != escapec)
-				last_was_esc = false;
-
-			/*
-			 * Updating the line count for embedded CR and/or LF chars is
-			 * necessarily a little fragile - this test is probably about the
-			 * best we can do.  (XXX it's arguable whether we should do this
-			 * at all --- is cur_lineno a physical or logical count?)
-			 */
-			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+			/* Process \r */
+			if (c == '\r')
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("literal carriage return found in data"),
+						 errhint("Use \"\\r\" to represent carriage return.")));
 		}
-
-		/* Process \r */
-		if (c == '\r' && (!cstate->csv_mode || !in_quote))
+	}
+	else if (cstate->eol_type == EOL_CR)
+	{
+		while (p < endp)
 		{
-			/* Check for \r\n on first line, _and_ handle \r\n. */
-			if (cstate->eol_type == EOL_UNKNOWN ||
-				cstate->eol_type == EOL_CRNL)
-			{
-				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
-				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+			char		c = *(p++);
 
-				/* get next char */
-				c = copy_raw_buf[raw_buf_ptr];
+			/* Process \r */
+			if (c == '\r')
+				endlines[nlines++] = p - startp;
 
-				if (c == '\n')
+			/* Process \n: a bare newline is not allowed in EOL_CR mode */
+			if (c == '\n')
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("literal newline found in data"),
+						 errhint("Use \"\\n\" to represent newline.")));
+		}
+	}
+	else if (cstate->eol_type == EOL_CRNL)
+	{
+		while (p < endp)
+		{
+			char		c = *(p++);
+
+			if (c == '\n')
+			{
+				if (cstate->last_was_cr)
 				{
-					raw_buf_ptr++;	/* eat newline */
-					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
+					endlines[nlines++] = p - startp;
+					cstate->last_was_cr = false;
 				}
 				else
-				{
-					/* found \r, but no \n */
-					if (cstate->eol_type == EOL_CRNL)
-						ereport(ERROR,
-								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-								 !cstate->csv_mode ?
-								 errmsg("literal carriage return found in data") :
-								 errmsg("unquoted carriage return found in data"),
-								 !cstate->csv_mode ?
-								 errhint("Use \"\\r\" to represent carriage return.") :
-								 errhint("Use quoted CSV field to represent carriage return.")));
-
-					/*
-					 * if we got here, it is the first line and we didn't find
-					 * \n, so don't consume the peeked character
-					 */
-					cstate->eol_type = EOL_CR;
-				}
+					ereport(ERROR,
+							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+							 errmsg("literal newline found in data"),
+							 errhint("Use \"\\n\" to represent newline.")));
 			}
-			else if (cstate->eol_type == EOL_NL)
+			else if (cstate->last_was_cr)
+			{
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-						 !cstate->csv_mode ?
-						 errmsg("literal carriage return found in data") :
-						 errmsg("unquoted carriage return found in data"),
-						 !cstate->csv_mode ?
-						 errhint("Use \"\\r\" to represent carriage return.") :
-						 errhint("Use quoted CSV field to represent carriage return.")));
-			/* If reach here, we have found the line terminator */
-			break;
+						 errmsg("literal carriage return found in data"),
+						 errhint("Use \"\\r\" to represent carriage return.")));
+			}
+			else if (c == '\r')
+			{
+				cstate->last_was_cr = true;
+			}
 		}
+	}
 
-		/* Process \n */
-		if (c == '\n' && (!cstate->csv_mode || !in_quote))
-		{
-			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
-				ereport(ERROR,
-						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-						 !cstate->csv_mode ?
-						 errmsg("literal newline found in data") :
-						 errmsg("unquoted newline found in data"),
-						 !cstate->csv_mode ?
-						 errhint("Use \"\\n\" to represent newline.") :
-						 errhint("Use quoted CSV field to represent newline.")));
-			cstate->eol_type = EOL_NL;	/* in case not set yet */
-			/* If reach here, we have found the line terminator */
-			break;
-		}
+	cstate->numlines = nlines;
+}
 
-		/*
-		 * In CSV mode, we only recognize \. alone on a line.  This is because
-		 * \. is a valid CSV data value.
-		 */
-		if (c == '\\' && (!cstate->csv_mode || first_char_in_line))
-		{
-			char		c2;
+/*
+ * Like ParseLinesText, but in CSV mode: records in cstate->endlines the offset just past each unquoted line ending in converted_buf.
+ */
+static void
+ParseLinesCSV(CopyState cstate)
+{
+	/* pre-requisites: there is data in converted_buf */
+	char	   *startp;
+	char	   *p;
+	char	   *endp;
+	int		   *endlines;
+	int			nlines;
+	int			state = cstate->parse_lines_state;
+	char		quotec = '\0';
+	char		escapec = '\0';
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			IF_NEED_REFILL_AND_EOF_BREAK(0);
+	quotec = cstate->quote[0];
+	escapec = cstate->escape[0];
+	/* a doubled quote then simply leaves and re-enters PLSTATE_IN_QUOTE */
+	if (quotec == escapec)
+		escapec = '\0';
 
-			/* -----
-			 * get next character
-			 * Note: we do not change c so if it isn't \., we can fall
-			 * through and continue processing for file encoding.
-			 * -----
-			 */
-			c2 = copy_raw_buf[raw_buf_ptr];
+	p = cstate->converted_buf;
+	startp = cstate->converted_buf;
+	endp = cstate->converted_buf + cstate->converted_buf_len;
 
-			if (c2 == '.')
-			{
-				raw_buf_ptr++;	/* consume the '.' */
+	endlines = cstate->endlines;
+	nlines = 0;
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
-				if (cstate->eol_type == EOL_CRNL)
-				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
-					c2 = copy_raw_buf[raw_buf_ptr++];
+	while (p < endp)
+	{
+		char		c = *(p++);
+		bool		last_was_cr;
+
+		last_was_cr = cstate->last_was_cr;
+		cstate->last_was_cr = false;
 
-					if (c2 == '\n')
+		switch (state)
+		{
+			case PLSTATE_NORMAL:
+				if (c == '\n')
+				{
+					if (cstate->eol_type == EOL_NL)
+						endlines[nlines++] = p - startp;
+					else if (cstate->eol_type == EOL_CR)
+						ereport(ERROR,
+								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+								 errmsg("unquoted newline found in data"),
+								 errhint("Use quoted CSV field to represent newline.")));
+					else if (cstate->eol_type == EOL_CRNL)
 					{
-						if (!cstate->csv_mode)
+						if (last_was_cr)
+							endlines[nlines++] = p - startp;
+						else
 							ereport(ERROR,
 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-									 errmsg("end-of-copy marker does not match previous newline style")));
+									 errmsg("unquoted newline found in data"),
+									 errhint("Use quoted CSV field to represent newline.")));
+					}
+					else if (cstate->eol_type == EOL_UNKNOWN)
+					{
+						if (last_was_cr)
+							cstate->eol_type = EOL_CRNL;
 						else
-							NO_END_OF_COPY_GOTO;
+							cstate->eol_type = EOL_NL;
+						endlines[nlines++] = p - startp;
 					}
-					else if (c2 != '\r')
+				}
+				else if (c == '\r')
+				{
+					if (cstate->eol_type == EOL_NL)
+						ereport(ERROR,
+								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+								 errmsg("unquoted carriage return found in data"),
+								 errhint("Use quoted CSV field to represent carriage return.")));
+					else if (cstate->eol_type == EOL_CR)
+						endlines[nlines++] = p - startp;
+					else if (cstate->eol_type == EOL_CRNL)
 					{
-						if (!cstate->csv_mode)
+						if (last_was_cr)
 							ereport(ERROR,
 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-									 errmsg("end-of-copy marker corrupt")));
+									 errmsg("unquoted carriage return found in data"),
+									 errhint("Use quoted CSV field to represent carriage return.")));
+						cstate->last_was_cr = true;
+					}
+					else if (cstate->eol_type == EOL_UNKNOWN)
+					{
+						if (last_was_cr)
+						{
+							/* oops, the previous char was actually a line boundary already */
+							cstate->eol_type = EOL_CR;
+							endlines[nlines++] = (p - 1) - startp;
+							endlines[nlines++] = p - startp;
+						}
 						else
-							NO_END_OF_COPY_GOTO;
+							cstate->last_was_cr = true;
 					}
 				}
-
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
-				c2 = copy_raw_buf[raw_buf_ptr++];
-
-				if (c2 != '\r' && c2 != '\n')
+				/* An escape char is only special inside quotes, so none is checked here;
+				 * a quote right after a CR is left to the pending-CR branch below. */
+				else if (c == quotec && !last_was_cr)
+					state = PLSTATE_IN_QUOTE;
+				else if (last_was_cr)
 				{
-					if (!cstate->csv_mode)
+					if (cstate->eol_type == EOL_CRNL)
 						ereport(ERROR,
 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-								 errmsg("end-of-copy marker corrupt")));
+								 errmsg("unquoted carriage return found in data"),
+								 errhint("Use quoted CSV field to represent carriage return.")));
 					else
-						NO_END_OF_COPY_GOTO;
+					{
+						Assert(cstate->eol_type == EOL_UNKNOWN);
+						cstate->eol_type = EOL_CR;
+						endlines[nlines++] = (--p) - startp;	/* line ended at the CR; back up so c is re-read */
+					}
 				}
+				break;
 
-				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
-					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
-					(cstate->eol_type == EOL_CR && c2 != '\r'))
+			case PLSTATE_ESCAPE:
+				if (quotec == escapec && c != quotec)
 				{
-					ereport(ERROR,
-							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-							 errmsg("end-of-copy marker does not match previous newline style")));
+					/* the escape was actually a quote */
+					state = PLSTATE_IN_QUOTE;
 				}
-
-				/*
-				 * Transfer only the data before the \. into line_buf, then
-				 * discard the data and the \. sequence.
-				 */
-				if (prev_raw_ptr > cstate->raw_buf_index)
-					appendBinaryStringInfo(&cstate->line_buf,
-										   cstate->raw_buf + cstate->raw_buf_index,
-										   prev_raw_ptr - cstate->raw_buf_index);
-				cstate->raw_buf_index = raw_buf_ptr;
-				result = true;	/* report EOF */
+				else
+					state = PLSTATE_NORMAL;
 				break;
-			}
-			else if (!cstate->csv_mode)
 
-				/*
-				 * If we are here, it means we found a backslash followed by
-				 * something other than a period.  In non-CSV mode, anything
-				 * after a backslash is special, so we skip over that second
-				 * character too.  If we didn't do that \\. would be
-				 * considered an eof-of copy, while in non-CSV mode it is a
-				 * literal backslash followed by a period.  In CSV mode,
-				 * backslashes are not special, so we want to process the
-				 * character after the backslash just like a normal character,
-				 * so we don't increment in those cases.
-				 */
-				raw_buf_ptr++;
-		}
-
-		/*
-		 * This label is for CSV cases where \. appears at the start of a
-		 * line, but there is more text after it, meaning it was a data value.
-		 * We are more strict for \. in CSV mode because \. could be a data
-		 * value, while in non-CSV mode, \. cannot be a data value.
-		 */
-not_end_of_copy:
-
-		/*
-		 * Process all bytes of a multi-byte character as a group.
-		 *
-		 * We only support multi-byte sequences where the first byte has the
-		 * high-bit set, so as an optimization we can avoid this block
-		 * entirely if it is not set.
-		 */
-		if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
-		{
-			int			mblen;
-
-			/*
-			 * It is enough to look at the first byte in all our encodings, to
-			 * get the length.  (GB18030 is a bit special, but still works for
-			 * our purposes; see comment in pg_gb18030_mblen())
-			 */
-			mblen_str[0] = c;
-			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
+			case PLSTATE_IN_QUOTE:
+				if (c == escapec)
+					state = PLSTATE_ESCAPE_IN_QUOTE;
+				else if (c == quotec)
+					state = PLSTATE_NORMAL;
+				break;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
-			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
-			raw_buf_ptr += mblen - 1;
+			case PLSTATE_ESCAPE_IN_QUOTE:
+				if (quotec == escapec && c != quotec)
+				{
+					state = PLSTATE_NORMAL;	/* the escape was actually the end quote */
+					p--;		/* un-consume this byte so it is processed again, as normal */
+					continue;
+				}
+				else
+					state = PLSTATE_IN_QUOTE;
+				break;
 		}
-		first_char_in_line = false;
-	}							/* end of outer loop */
-
-	/*
-	 * Transfer any still-uncopied data to line_buf.
-	 */
-	REFILL_LINEBUF;
-
-	return result;
+	}
+	cstate->numlines = nlines;
+	cstate->parse_lines_state = state;
 }
 
 /*
@@ -4344,6 +4442,8 @@ GetDecimalFromHex(char hex)
 static int
 CopyReadAttributesText(CopyState cstate)
 {
+	char	   *line_buf = cstate->line_buf;
+	int			len = cstate->line_len;
 	char		delimc = cstate->delim[0];
 	int			fieldno;
 	char	   *output_ptr;
@@ -4356,7 +4456,7 @@ CopyReadAttributesText(CopyState cstate)
 	 */
 	if (cstate->max_fields <= 0)
 	{
-		if (cstate->line_buf.len != 0)
+		if (len != 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 					 errmsg("extra data after last expected column")));
@@ -4372,13 +4472,13 @@ CopyReadAttributesText(CopyState cstate)
 	 * it this way because enlarging attribute_buf mid-stream would invalidate
 	 * pointers already stored into cstate->raw_fields[].
 	 */
-	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
-		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
+	if (cstate->attribute_buf.maxlen <= len)
+		enlargeStringInfo(&cstate->attribute_buf, len);
 	output_ptr = cstate->attribute_buf.data;
 
 	/* set pointer variables for loop */
-	cur_ptr = cstate->line_buf.data;
-	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
+	cur_ptr = line_buf;
+	line_end_ptr = line_buf + len;
 
 	/* Outer loop iterates over fields */
 	fieldno = 0;
@@ -4586,7 +4686,7 @@ CopyReadAttributesCSV(CopyState cstate)
 	 */
 	if (cstate->max_fields <= 0)
 	{
-		if (cstate->line_buf.len != 0)
+		if (cstate->line_len != 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 					 errmsg("extra data after last expected column")));
@@ -4602,13 +4702,13 @@ CopyReadAttributesCSV(CopyState cstate)
 	 * it this way because enlarging attribute_buf mid-stream would invalidate
 	 * pointers already stored into cstate->raw_fields[].
 	 */
-	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
-		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
+	if (cstate->attribute_buf.maxlen <= cstate->line_len)
+		enlargeStringInfo(&cstate->attribute_buf, cstate->line_len);
 	output_ptr = cstate->attribute_buf.data;
 
 	/* set pointer variables for loop */
-	cur_ptr = cstate->line_buf.data;
-	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
+	cur_ptr = cstate->line_buf;
+	line_end_ptr = cstate->line_buf + cstate->line_len;
 
 	/* Outer loop iterates over fields */
 	fieldno = 0;
-- 
2.20.1