From 07657c620665f8e429c261d75af05efa982fae6a Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Mon, 8 Aug 2022 12:32:27 +0000 Subject: [PATCH v3] Use pg_pwritev_with_retry() instead of write() in walmethods.c Use pg_pwritev_with_retry() while prepadding a WAL segment instead of write() in walmethods.c dir_open_for_write() to avoid partial writes. As the pg_pwritev_with_retry() function uses pwritev, we can avoid explicit lseek() to seek to the beginning of the WAL segment. These changes are inline with how core postgres initializes the WAL segment in XLogFileInitInternal(). --- src/backend/access/transam/xlog.c | 35 ++++------------- src/bin/pg_basebackup/walmethods.c | 19 ++------- src/common/file_utils.c | 62 ++++++++++++++++++++++++++++++ src/include/c.h | 15 ++++++++ src/include/common/file_utils.h | 2 + 5 files changed, 91 insertions(+), 42 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 34f0150d1e..072462bd9b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2916,7 +2916,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, bool *added, char *path) { char tmppath[MAXPGPATH]; - PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; XLogSegNo max_segno; int fd; @@ -2960,14 +2959,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - memset(zbuffer.data, 0, XLOG_BLCKSZ); - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; if (wal_init_zero) { - struct iovec iov[PG_IOV_MAX]; - int blocks; + ssize_t rc; /* * Zero-fill the file. With this setting, we do this the hard way to @@ -2978,32 +2974,17 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ + rc = pg_pwritev_zeros(fd, wal_segment_size); - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = XLOG_BLCKSZ; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / XLOG_BLCKSZ; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } + if (rc < 0) + save_errno = errno; } else { + PGAlignedXLogBlock zbuffer; + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + /* * Otherwise, seeking to the end and writing a solitary byte is * enough. diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index e90aa0ba37..5c8f69162a 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -209,23 +209,12 @@ dir_open_for_write(const char *pathname, const char *temp_suffix, size_t pad_to_ /* Do pre-padding on non-compressed files */ if (pad_to_size && dir_data->compression_algorithm == PG_COMPRESSION_NONE) { - PGAlignedXLogBlock zerobuf; - int bytes; + ssize_t rc; - memset(zerobuf.data, 0, XLOG_BLCKSZ); - for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ) - { - errno = 0; - if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ) - { - /* If write didn't set errno, assume problem is no disk space */ - dir_data->lasterrno = errno ? errno : ENOSPC; - close(fd); - return NULL; - } - } + errno = 0; + rc = pg_pwritev_zeros(fd, pad_to_size); - if (lseek(fd, 0, SEEK_SET) != 0) + if (rc < 0) { dir_data->lasterrno = errno; close(fd); diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 4af216e56c..1c193d14d4 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -525,3 +525,65 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) return sum; } + +/* + * Function to zero-fill a file with given size. On failure, a negative value + * is returned and errno is set appropriately so that the caller can use it + * accordingly. + */ +ssize_t +pg_pwritev_zeros(int fd, size_t size) +{ + PGAlignedPWriteVBlock zbuffer; + struct iovec iov[PG_IOV_MAX]; + int blocks; + size_t block_sz; + size_t remaining_size = 0; + int i; + ssize_t rc = 0; + + block_sz = sizeof(zbuffer.data); + + memset(zbuffer.data, 0, block_sz); + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = block_sz; + } + + /* Loop, writing as many blocks as we can for each system call. */ + blocks = size / block_sz; + remaining_size = size % block_sz; + for (i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * block_sz; + + offset = i * block_sz; + + rc = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (rc < 0) + return rc; + + i += iovcnt; + } + + /* Now, write the remaining size, if any, of the file with zeros. */ + if (remaining_size > 0) + { + /* We'll never write more than one block here */ + int iovcnt = 1; + /* Jump on to the end of previously written blocks */ + off_t offset = i * block_sz; + + iov[0].iov_base = zbuffer.data; + iov[0].iov_len = remaining_size; + + rc = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + } + + return rc; +} diff --git a/src/include/c.h b/src/include/c.h index de9ec04d49..2bb4d67d11 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1151,6 +1151,21 @@ typedef union PGAlignedXLogBlock int64 force_align_i64; } PGAlignedXLogBlock; +/* + * PWRITEV_BLCKSZ is same as XLOG_BLCKSZ for now, however it may change if + * writing more bytes per pg_pwritev_with_retry() call is proven to be more + * performant. + */ +#define PWRITEV_BLCKSZ XLOG_BLCKSZ + +/* Same as above structures, but for a PWRITEV_BLCKSZ-sized buffer */ +typedef union PGAlignedPWriteVBlock +{ + char data[PWRITEV_BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGAlignedPWriteVBlock; + /* msb for char */ #define HIGHBIT (0x80) #define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 2c5dbcb0b1..f8d0d475fa 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -44,4 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd, int iovcnt, off_t offset); +extern ssize_t pg_pwritev_zeros(int fd, size_t size); + #endif /* FILE_UTILS_H */ -- 2.34.1