From 43dab7c9c40d3aa385c7e115c137601308b9b00f Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 12 May 2016 14:50:55 +0900 Subject: [PATCH] Issue fsync more carefully in pg_receivexlog and pg_basebackup [-X] stream. Several places weren't careful about fsyncing in the way. See 1d4a0ab1 and 606e0f98 for details about required fsyns. This introduces a near-copy of initdb's fsync_fname_ext(), and of the backend's durable_rename(), fsync_parent_path(). At least the frontend duplication should be avoided; but that'd end up being hard to backpatch. --- src/bin/pg_basebackup/pg_basebackup.c | 32 +++++++++ src/bin/pg_basebackup/receivelog.c | 55 +++++++++------ src/bin/pg_basebackup/streamutil.c | 126 ++++++++++++++++++++++++++++++++++ src/bin/pg_basebackup/streamutil.h | 4 ++ 4 files changed, 195 insertions(+), 22 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 2927b60..891622d 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1114,6 +1114,11 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) if (copybuf != NULL) PQfreemem(copybuf); + + /* + * Nothing is synced here for performance reasons, everything is done + * once for all tablespaces at the end. + */ } @@ -1390,6 +1395,18 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum) if (basetablespace && writerecoveryconf) WriteRecoveryConf(); + + /* + * Sync data directory to ensure that data is safely on disk, this is + * done once for performance reasons. Each tablespace need to be processed + * once as well. + */ + if (fsync_fname_ext(current_path, true) != 0) + { + fprintf(stderr, "%s: sync of target directory %s failed\n", + progname, current_path); + disconnect_and_exit(1); + } } /* @@ -1931,6 +1948,21 @@ BaseBackup(void) PQclear(res); PQfinish(conn); + /* + * Make data persistent on disk for each tablespace for tar format, + * though nothing can be done when output is written to stdout. In + * plain format each tablespace is synced individually. + */ + if (format == 't' && strcmp(basedir, "-") != 0) + { + if (fsync_fname_ext(basedir, true) != 0) + { + fprintf(stderr, "%s: sync of target directory %s failed\n", + progname, basedir); + exit(1); + } + } + if (verbose) fprintf(stderr, "%s: base backup completed\n", progname); } diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 595213f..b86c9e3 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -68,17 +68,13 @@ mark_file_as_archived(const char *basedir, const char *fname) return false; } - if (fsync(fd) != 0) - { - fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), - progname, tmppath, strerror(errno)); - - close(fd); + close(fd); + if (fsync_fname_ext(tmppath, false) != 0) return false; - } - close(fd); + if (fsync_parent_path(tmppath) != 0) + return false; return true; } @@ -116,6 +112,10 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) /* * Verify that the file is either empty (just created), or a complete * XLogSegSize segment. Anything in between indicates a corrupt file. + * + * XXX: This means that we might not restart if a crash occurs before the + * fsync below. We probably should create the file in a temporary path + * like the backend does... */ if (fstat(f, &statbuf) != 0) { @@ -129,6 +129,16 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) { /* File is open and ready to use */ walfile = f; + + /* + * fsync, in case of a previous crash between padding and fsyncing the + * file. + */ + if (fsync_fname_ext(fn, false) != 0) + return false; + if (fsync_parent_path(fn) != 0) + return false; + return true; } if (statbuf.st_size != 0) @@ -157,6 +167,17 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) } free(zerobuf); + /* + * fsync WAL file and containing directory, to ensure the file is + * persistently created and zeroed. That's particularly important when + * using synchronous mode, where the file is modified and fsynced + * in-place, without a directory fsync. + */ + if (fsync_fname_ext(fn, false) != 0) + return false; + if (fsync_parent_path(fn) != 0) + return false; + if (lseek(f, SEEK_SET, 0) != 0) { fprintf(stderr, @@ -217,10 +238,9 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos) snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix); snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name); - if (rename(oldfn, newfn) != 0) + if (durable_rename(oldfn, newfn) != 0) { - fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"), - progname, current_walfile_name, strerror(errno)); + /* durable_rename produced a log entry */ return false; } } @@ -338,14 +358,6 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content) return false; } - if (fsync(fd) != 0) - { - close(fd); - fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), - progname, tmppath, strerror(errno)); - return false; - } - if (close(fd) != 0) { fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), @@ -356,10 +368,9 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content) /* * Now move the completed history file into place with its final name. */ - if (rename(tmppath, path) < 0) + if (durable_rename(tmppath, path) < 0) { - fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"), - progname, tmppath, path, strerror(errno)); + /* durable_rename produced a log entry */ return false; } diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index 4d1ff90..ece5abd 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -525,3 +525,129 @@ fe_recvint64(char *buf) return result; } + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * Returns 0 if the operation succeeded, -1 otherwise. + * + * XXX: This is a near-duplicate of initdb.c's fsync_fname_ext(); they should + * be unified into a common place. + */ +int +fsync_fname_ext(const char *fname, bool isdir) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + /* + * Open the file, silently ignoring errors about unreadable files (or + * unsupported operations, e.g. opening a directory under Windows), and + * logging others. + */ + fd = open(fname, flags); + if (fd < 0) + { + if (isdir && (errno == EISDIR || errno == EACCES)) + return 0; + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, fname, strerror(errno)); + return -1; + } + + returncode = fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be reported. + */ + if (returncode != 0 && !(isdir && errno == EBADF)) + { + fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), + progname, fname, strerror(errno)); + close(fd); + return -1; + } + + close(fd); + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +int +fsync_parent_path(const char *fname) +{ + char parentpath[MAXPGPATH]; + + strlcpy(parentpath, fname, MAXPGPATH); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument is + * just a file name (see comments in path.c), so handle that as being the + * current directory. + */ + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + + if (fsync_fname_ext(parentpath, true) != 0) + return -1; + + return 0; +} + +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * Wrapper around rename, similar to the backend version. Note that this + * version does not fsync the target file before the rename, as it's unlikely + * to be helpful for current and prospective users. + */ +int +durable_rename(const char *oldfile, const char *newfile) +{ + /* + * First fsync the old path, to ensure that it is properly persistent on + * disk. + */ + if (fsync_fname_ext(oldfile, false) != 0) + return -1; + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) != 0) + { + fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"), + progname, oldfile, newfile, strerror(errno)); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false) != 0) + return -1; + + if (fsync_parent_path(newfile) != 0) + return -1; + + return 0; +} diff --git a/src/bin/pg_basebackup/streamutil.h b/src/bin/pg_basebackup/streamutil.h index d2d5a6d..71ae9ca 100644 --- a/src/bin/pg_basebackup/streamutil.h +++ b/src/bin/pg_basebackup/streamutil.h @@ -48,4 +48,8 @@ extern bool feTimestampDifferenceExceeds(int64 start_time, int64 stop_time, extern void fe_sendint64(int64 i, char *buf); extern int64 fe_recvint64(char *buf); +extern int fsync_parent_path(const char *fname); +extern int fsync_fname_ext(const char *fname, bool isdir); +extern int durable_rename(const char *oldfile, const char *newfile); + #endif /* STREAMUTIL_H */ -- 2.8.2