From 5ae3be7d262e4251bf21ac0c73b3e0ebc2ba615d Mon Sep 17 00:00:00 2001 From: Shinya Kato Date: Mon, 28 Jul 2025 22:08:20 +0900 Subject: [PATCH v1] Speed up COPY FROM text/CSV parsing using SIMD The inner loop of CopyReadLineText scans for newlines and other special characters by processing the input byte-by-byte. For large inputs, this can be a performance bottleneck. This commit introduces a SIMD-accelerated path. When not parsing inside a quoted field, we can use vector instructions to scan the input buffer for any character of interest in 16-byte chunks. This significantly improves performance, especially for data with long, unquoted fields. --- src/backend/commands/copyfromparse.c | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index b1ae97b833d..5aba0fa6cb7 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -71,7 +71,9 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_bitutils.h" #include "port/pg_bswap.h" +#include "port/simd.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) char quotec = '\0'; char escapec = '\0'; +#ifndef USE_NO_SIMD + Vector8 nl = vector8_broadcast('\n'); + Vector8 cr = vector8_broadcast('\r'); + Vector8 bs = vector8_broadcast('\\'); + Vector8 quote; + Vector8 escape; +#endif + if (is_csv) { quotec = cstate->opts.quote[0]; @@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* ignore special escape processing if it's the same as quotec */ if (quotec == escapec) escapec = '\0'; + +#ifndef USE_NO_SIMD + quote = vector8_broadcast(quotec); + if (quotec != escapec) + escape = vector8_broadcast(escapec); +#endif } /* @@ -1328,6 +1344,62 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) need_data = false; } +#ifndef USE_NO_SIMD + /* + * SIMD 
instructions are used here to efficiently scan the input buffer + * for special characters (e.g., newline, carriage return, quotes, or + * escape characters). This approach significantly improves performance + * compared to byte-by-byte iteration, especially for large input + * buffers. + * + * However, SIMD optimization cannot be applied in the following cases: + * - Inside quoted fields, where escape sequences and closing quotes + * require sequential processing to handle correctly. + * - When the remaining buffer size is smaller than the size of a SIMD + * vector register, as SIMD operations require processing data in + * fixed-size chunks. + */ + if (!in_quote && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match; + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]); + + /* Create a mask of all special characters we need to stop at */ + match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr)); + + if (is_csv) + { + match = vector8_or(match, vector8_eq(chunk, quote)); + if (escapec != '\0') + match = vector8_or(match, vector8_eq(chunk, escape)); + } + else + match = vector8_or(match, vector8_eq(chunk, bs)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point and let + * the scalar code handle it. + */ + int advance = pg_rightmost_one_pos32(mask); + input_buf_ptr += advance; + } + else + { + /* No special characters found, so skip the entire chunk */ + input_buf_ptr += sizeof(Vector8); + continue; + } + } +#endif + /* OK to fetch a character */ prev_raw_ptr = input_buf_ptr; c = copy_input_buf[input_buf_ptr++]; -- 2.47.1