diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index f6da673..67e0f73 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -418,19 +418,20 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TLHistoryFilePath(path, newTLI); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing file. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure that + * we don't overwrite an existing file. However, there shouldn't be one, + * so rename_safe() is an acceptable substitute except for the truly + * paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not link file \"%s\" to \"%s\": %m", tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -508,19 +509,20 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) TLHistoryFilePath(path, tli); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure that + * we don't overwrite an existing logfile. However, there shouldn't be + * one, so rename_safe() is an acceptable substitute except for the + * truly paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not link file \"%s\" to \"%s\": %m", tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a2846c4..fc85c81 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3251,12 +3251,13 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure + * that we don't overwrite an existing logfile. However, there + * shouldn't be one, so rename_safe() is an acceptable substitute + * except for the truly paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) { if (use_lock) LWLockRelease(ControlFileLock); @@ -3268,7 +3269,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) { if (use_lock) LWLockRelease(ControlFileLock); @@ -3792,7 +3793,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) * flag, rename will fail. We'll try again at the next checkpoint. */ snprintf(newpath, MAXPGPATH, "%s.deleted", path); - if (rename(path, newpath) != 0) + if (rename_safe(path, newpath) != 0) { ereport(LOG, (errcode_for_file_access(), @@ -3800,10 +3801,12 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) path))); return; } + rc = unlink(newpath); #else rc = unlink(path); #endif + if (rc != 0) { ereport(LOG, @@ -5291,7 +5294,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * re-enter archive recovery mode in a subsequent crash. */ unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) + if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6138,7 +6141,7 @@ StartupXLOG(void) if (stat(TABLESPACE_MAP, &st) == 0) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP, BACKUP_LABEL_FILE), @@ -6501,7 +6504,7 @@ StartupXLOG(void) if (haveBackupLabel) { unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6518,7 +6521,7 @@ StartupXLOG(void) if (haveTblspcMap) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -7299,7 +7302,7 @@ StartupXLOG(void) */ XLogArchiveCleanup(partialfname); - if (rename(origpath, partialpath) != 0) + if (rename_safe(origpath, partialpath) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -10863,7 +10866,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) { ereport(WARNING, (errcode_for_file_access(), @@ -10886,7 +10889,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) { ereport(LOG, (errmsg("online backup mode canceled"), diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 277c14a..13ab7fa 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) */ snprintf(oldpath, MAXPGPATH, "%s.deleted%u", xlogfpath, deletedcounter++); - if (rename(xlogfpath, oldpath) != 0) + if (rename_safe(xlogfpath, oldpath) != 0) { ereport(ERROR, (errcode_for_file_access(), @@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) reload = true; } - if (rename(path, xlogfpath) < 0) + if (rename_safe(path, xlogfpath) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -580,12 +580,11 @@ XLogArchiveForceDone(const char *xlog) StatusFilePath(archiveReady, xlog, ".ready"); if (stat(archiveReady, &stat_buf) == 0) { - if (rename(archiveReady, archiveDone) < 0) + if (rename_safe(archiveReady, archiveDone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", archiveReady, archiveDone))); - return; } diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 397f802..db54889 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog) StatusFilePath(rlogready, xlog, ".ready"); StatusFilePath(rlogdone, xlog, ".done"); - if (rename(rlogready, rlogdone) < 0) + if (rename_safe(rlogready, rlogdone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 5af47ec..e1497a9 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void) CloseTransientFile(tmpfd); /* rename to permanent file, fsync file and directory */ - if (rename(tmppath, path) != 0) + if (rename_safe((char *) tmppath, (char *) path) != 0) { ereport(PANIC, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); } - - fsync_fname((char *) path, false); - fsync_fname("pg_logical", true); } /* diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index ed823ec..1f17f42 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1488,8 +1488,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) * That ought to be cheap because in most scenarios it should already * be safely on disk. */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); builder->last_serialized_snapshot = lsn; goto out; @@ -1593,7 +1593,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) } CloseTransientFile(fd); - fsync_fname("pg_logical/snapshots", true); + fsync_fname("pg_logical/snapshots", true, false); /* * We may overwrite the work from some other backend, but that's ok, our @@ -1608,8 +1608,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) } /* make sure we persist */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); /* * Now there's no way we can loose the dumped state anymore, remember this @@ -1660,8 +1660,8 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) * either... * ---- */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); /* read statically sized portion of snapshot */ diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 251b549..65d01e8 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -453,8 +453,8 @@ ReplicationSlotDropAcquired(void) * restart. */ START_CRIT_SECTION(); - fsync_fname(tmppath, true); - fsync_fname("pg_replslot", true); + fsync_fname(tmppath, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); } else @@ -912,7 +912,7 @@ StartupReplicationSlots(void) errmsg("could not remove directory \"%s\"", path))); continue; } - fsync_fname("pg_replslot", true); + fsync_fname("pg_replslot", true, false); continue; } @@ -968,7 +968,7 @@ CreateSlotOnDisk(ReplicationSlot *slot) (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", tmppath))); - fsync_fname(tmppath, true); + fsync_fname(tmppath, true, false); /* Write the actual state file. */ slot->dirty = true; /* signal that we really need to write */ @@ -988,8 +988,8 @@ CreateSlotOnDisk(ReplicationSlot *slot) */ START_CRIT_SECTION(); - fsync_fname(path, true); - fsync_fname("pg_replslot", true); + fsync_fname(path, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); } @@ -1094,9 +1094,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) /* Check CreateSlot() for the reasoning of using a crit. section. */ START_CRIT_SECTION(); - fsync_fname(path, false); - fsync_fname((char *) dir, true); - fsync_fname("pg_replslot", true); + fsync_fname(path, false, false); + fsync_fname((char *) dir, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); @@ -1165,7 +1165,7 @@ RestoreSlotFromDisk(const char *name) /* Also sync the parent directory */ START_CRIT_SECTION(); - fsync_fname(path, true); + fsync_fname(path, true, false); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ @@ -1248,7 +1248,7 @@ RestoreSlotFromDisk(const char *name) (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } - fsync_fname("pg_replslot", true); + fsync_fname("pg_replslot", true, false); return; } diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index 522f420..20f49ab 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -115,7 +115,7 @@ copydir(char *fromdir, char *todir, bool recurse) errmsg("could not stat file \"%s\": %m", tofile))); if (S_ISREG(fst.st_mode)) - fsync_fname(tofile, false); + fsync_fname(tofile, false, false); } FreeDir(xldir); @@ -125,7 +125,7 @@ copydir(char *fromdir, char *todir, bool recurse) * synced. Recent versions of ext4 have made the window much wider but * it's been true for ext3 and other filesystems in the past. */ - fsync_fname(todir, true); + fsync_fname(todir, true, false); } /* diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1b30100..bd268bd 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -307,6 +307,7 @@ static void walkdir(const char *path, static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif static void fsync_fname_ext(const char *fname, bool isdir, int elevel); +static void fsync_parent_path(const char *fname); /* @@ -410,10 +411,11 @@ pg_flush_data(int fd, off_t offset, off_t amount) * fsync_fname -- fsync a file or directory, handling errors properly * * Try to fsync a file or directory. When doing the latter, ignore errors that - * indicate the OS just doesn't allow/require fsyncing directories. + * indicate the OS just doesn't allow/require fsyncing directories. Optionally + * one can skip error regarding non-existing entries attempted to be fsync'ed. */ void -fsync_fname(char *fname, bool isdir) +fsync_fname(char *fname, bool isdir, bool missing_ok) { int fd; int returncode; @@ -440,9 +442,14 @@ fsync_fname(char *fname, bool isdir) return; else if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); + { + if (missing_ok) + return; + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + } returncode = pg_fsync(fd); @@ -461,6 +468,62 @@ fsync_fname(char *fname, bool isdir) CloseTransientFile(fd); } +/* + * rename_safe -- rename of a file, making it on-disk persistent + * + * This routine ensures that a rename file persists in case of a crash by using + * fsync on the old and new files before and after performing the rename so as + * this categorizes as an all-or-nothing operation. + */ +int +rename_safe(char *oldfile, char *newfile) +{ + /* + * First fsync the old and new entries to ensure that they are properly + * persistent on disk. + */ + fsync_fname(oldfile, false, false); + fsync_fname(newfile, false, true); + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) != 0) + return -1; + + /* + * Make change persistent in case of an OS crash, both the new entry and + * its parent directory need to be flushed. + */ + fsync_fname(newfile, false, false); + + /* Same for parent directory */ + fsync_parent_path(newfile); + return 0; +} + +/* + * link_safe -- make a file hard link, making it on-disk persistent + * + * This routine ensures that a hard link created on a file persists on system + * in case of a crash by using fsync where on the link generated as well as on + * its parent directory. + */ +int +link_safe(char *oldfile, char *newfile) +{ + if (link(oldfile, newfile) < 0) + return -1; + + /* + * Make the link persistent in case of an OS crash, the new entry + * generated as well as its parent directory need to be flushed. + */ + fsync_fname(newfile, false, false); + + /* Same for parent directory */ + fsync_parent_path(newfile); + return 0; +} + /* * InitFileAccess --- initialize this module during backend startup @@ -2719,3 +2782,29 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) (void) CloseTransientFile(fd); } + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static void +fsync_parent_path(const char *fname) +{ + char parentpath[MAXPGPATH]; + + /* Same for parent directory */ + snprintf(parentpath, MAXPGPATH, "%s", fname); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument + * is just a file name (see comments in path.c), so handle that as being + * the current directory. + */ + if (strlen(parentpath) == 0) + fsync_fname(".", true, false); + else + fsync_fname(parentpath, true, false); +} diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 4ad85380..a3a0a77 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -380,12 +380,12 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); - fsync_fname(mainpath, false); + fsync_fname(mainpath, false, false); } FreeDir(dbspace_dir); - fsync_fname((char *) dbspacedirname, true); + fsync_fname((char *) dbspacedirname, true, false); } } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 4a3fccb..8ad5207 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -113,7 +113,9 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); -extern void fsync_fname(char *fname, bool isdir); +extern void fsync_fname(char *fname, bool isdir, bool missing_ok); +extern int rename_safe(char *oldfile, char *newfile); +extern int link_safe(char *oldfile, char *newfile); extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */