*** a/src/backend/bootstrap/bootstrap.c --- b/src/backend/bootstrap/bootstrap.c *************** *** 315,320 **** AuxiliaryProcessMain(int argc, char *argv[]) --- 315,323 ---- case BgWriterProcess: statmsg = "writer process"; break; + case CheckpointerProcess: + statmsg = "checkpointer process"; + break; case WalWriterProcess: statmsg = "wal writer process"; break; *************** *** 419,424 **** AuxiliaryProcessMain(int argc, char *argv[]) --- 422,432 ---- BackgroundWriterMain(); proc_exit(1); /* should never return */ + case CheckpointerProcess: + /* don't set signals, checkpointer has its own agenda */ + CheckpointerMain(); + proc_exit(1); /* should never return */ + case WalWriterProcess: /* don't set signals, walwriter has its own agenda */ InitXLOGAccess(); *** a/src/backend/postmaster/Makefile --- b/src/backend/postmaster/Makefile *************** *** 13,18 **** top_builddir = ../../.. include $(top_builddir)/src/Makefile.global OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \ ! syslogger.o walwriter.o include $(top_srcdir)/src/backend/common.mk --- 13,18 ---- include $(top_builddir)/src/Makefile.global OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \ ! syslogger.o walwriter.o checkpointer.o include $(top_srcdir)/src/backend/common.mk *** a/src/backend/postmaster/bgwriter.c --- b/src/backend/postmaster/bgwriter.c *************** *** 10,29 **** * still empowered to issue writes if the bgwriter fails to maintain enough * clean shared buffers. * ! * The bgwriter is also charged with handling all checkpoints. It will ! * automatically dispatch a checkpoint after a certain amount of time has ! * elapsed since the last one, and it can be signaled to perform requested ! * checkpoints as well. (The GUC parameter that mandates a checkpoint every ! * so many WAL segments is implemented by having backends signal the bgwriter ! * when they fill WAL segments; the bgwriter itself doesn't watch for the ! 
* condition.) * * The bgwriter is started by the postmaster as soon as the startup subprocess * finishes, or as soon as recovery begins if we are doing archive recovery. * It remains alive until the postmaster commands it to terminate. ! * Normal termination is by SIGUSR2, which instructs the bgwriter to execute ! * a shutdown checkpoint and then exit(0). (All backends must be stopped ! * before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; like any * backend, the bgwriter will simply abort and exit on SIGQUIT. * * If the bgwriter exits unexpectedly, the postmaster treats that the same --- 10,22 ---- * still empowered to issue writes if the bgwriter fails to maintain enough * clean shared buffers. * ! * As of Postgres 9.2 the bgwriter no longer handles checkpoints. * * The bgwriter is started by the postmaster as soon as the startup subprocess * finishes, or as soon as recovery begins if we are doing archive recovery. * It remains alive until the postmaster commands it to terminate. ! * Normal termination is by SIGUSR2, which instructs the bgwriter to exit(0). ! * Emergency termination is by SIGQUIT; like any * backend, the bgwriter will simply abort and exit on SIGQUIT. * * If the bgwriter exits unexpectedly, the postmaster treats that the same *************** *** 54,60 **** #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" - #include "replication/syncrep.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/lwlock.h" --- 47,52 ---- *************** *** 67,162 **** #include "utils/resowner.h" - /*---------- - * Shared memory area for communication between bgwriter and backends - * - * The ckpt counters allow backends to watch for completion of a checkpoint - * request they send. Here's how it works: - * * At start of a checkpoint, bgwriter reads (and clears) the request flags - * and increments ckpt_started, while holding ckpt_lck. 
- * * On completion of a checkpoint, bgwriter sets ckpt_done to - * equal ckpt_started. - * * On failure of a checkpoint, bgwriter increments ckpt_failed - * and sets ckpt_done to equal ckpt_started. - * - * The algorithm for backends is: - * 1. Record current values of ckpt_failed and ckpt_started, and - * set request flags, while holding ckpt_lck. - * 2. Send signal to request checkpoint. - * 3. Sleep until ckpt_started changes. Now you know a checkpoint has - * begun since you started this algorithm (although *not* that it was - * specifically initiated by your signal), and that it is using your flags. - * 4. Record new value of ckpt_started. - * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo - * arithmetic here in case counters wrap around.) Now you know a - * checkpoint has started and completed, but not whether it was - * successful. - * 6. If ckpt_failed is different from the originally saved value, - * assume request failed; otherwise it was definitely successful. - * - * ckpt_flags holds the OR of the checkpoint request flags sent by all - * requesting backends since the last checkpoint start. The flags are - * chosen so that OR'ing is the correct way to combine multiple requests. - * - * num_backend_writes is used to count the number of buffer writes performed - * by non-bgwriter processes. This counter should be wide enough that it - * can't overflow during a single bgwriter cycle. num_backend_fsync - * counts the subset of those writes that also had to do their own fsync, - * because the background writer failed to absorb their request. - * - * The requests array holds fsync requests sent by backends and not yet - * absorbed by the bgwriter. - * - * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and - * the requests fields are protected by BgWriterCommLock. 
- *---------- - */ - typedef struct - { - RelFileNodeBackend rnode; - ForkNumber forknum; - BlockNumber segno; /* see md.c for special values */ - /* might add a real request-type field later; not needed yet */ - } BgWriterRequest; - - typedef struct - { - pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */ - - slock_t ckpt_lck; /* protects all the ckpt_* fields */ - - int ckpt_started; /* advances when checkpoint starts */ - int ckpt_done; /* advances when checkpoint done */ - int ckpt_failed; /* advances when checkpoint fails */ - - int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ - - uint32 num_backend_writes; /* counts non-bgwriter buffer writes */ - uint32 num_backend_fsync; /* counts non-bgwriter fsync calls */ - - int num_requests; /* current # of requests */ - int max_requests; /* allocated array size */ - BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */ - } BgWriterShmemStruct; - - static BgWriterShmemStruct *BgWriterShmem; - - /* interval for calling AbsorbFsyncRequests in CheckpointWriteDelay */ - #define WRITES_PER_ABSORB 1000 - /* * GUC parameters */ int BgWriterDelay = 200; - int CheckPointTimeout = 300; - int CheckPointWarning = 30; - double CheckPointCompletionTarget = 0.5; /* * Flags set by interrupt handlers for later service in the main loop. 
*/ static volatile sig_atomic_t got_SIGHUP = false; - static volatile sig_atomic_t checkpoint_requested = false; static volatile sig_atomic_t shutdown_requested = false; /* --- 59,73 ---- *************** *** 164,192 **** static volatile sig_atomic_t shutdown_requested = false; */ static bool am_bg_writer = false; - static bool ckpt_active = false; - - /* these values are valid when ckpt_active is true: */ - static pg_time_t ckpt_start_time; - static XLogRecPtr ckpt_start_recptr; - static double ckpt_cached_elapsed; - - static pg_time_t last_checkpoint_time; - static pg_time_t last_xlog_switch_time; - /* Prototypes for private functions */ - static void CheckArchiveTimeout(void); static void BgWriterNap(void); - static bool IsCheckpointOnSchedule(double progress); - static bool ImmediateCheckpointRequested(void); - static bool CompactBgwriterRequestQueue(void); /* Signal handlers */ static void bg_quickdie(SIGNAL_ARGS); static void BgSigHupHandler(SIGNAL_ARGS); - static void ReqCheckpointHandler(SIGNAL_ARGS); static void ReqShutdownHandler(SIGNAL_ARGS); --- 75,88 ---- *************** *** 202,208 **** BackgroundWriterMain(void) sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; - BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; /* --- 98,103 ---- *************** *** 228,235 **** BackgroundWriterMain(void) * process to participate in ProcSignal signalling. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ ! pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ ! pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); --- 123,130 ---- * process to participate in ProcSignal signalling. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ ! pqsignal(SIGINT, SIG_IGN); /* as of 9.2 no longer requests checkpoint */ ! 
pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); *************** *** 249,259 **** BackgroundWriterMain(void) sigdelset(&BlockSig, SIGQUIT); /* - * Initialize so that first time-driven event happens at the correct time. - */ - last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); - - /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ --- 144,149 ---- *************** *** 305,324 **** BackgroundWriterMain(void) AtEOXact_Files(); AtEOXact_HashTables(false); - /* Warn any waiting backends that the checkpoint failed. */ - if (ckpt_active) - { - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - SpinLockAcquire(&bgs->ckpt_lck); - bgs->ckpt_failed++; - bgs->ckpt_done = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); - - ckpt_active = false; - } - /* * Now return to normal top-level context and clear ErrorContext for * next time. --- 195,200 ---- *************** *** 361,379 **** BackgroundWriterMain(void) if (RecoveryInProgress()) ThisTimeLineID = GetRecoveryTargetTLI(); - /* Do this once before starting the loop, then just at SIGHUP time. */ - SyncRepUpdateSyncStandbysDefined(); - /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. --- 237,247 ---- *************** *** 381,403 **** BackgroundWriterMain(void) if (!PostmasterIsAlive()) exit(1); - /* - * Process any requests or signals received recently. 
- */ - AbsorbFsyncRequests(); - if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update global shmem state for sync rep */ - SyncRepUpdateSyncStandbysDefined(); - } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { --- 249,259 ---- *************** *** 406,547 **** BackgroundWriterMain(void) * control back to the sigsetjmp block above */ ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) ! { ! if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! flags |= CHECKPOINT_CAUSE_TIME; ! } ! ! /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. ! */ ! if (do_checkpoint) ! { ! bool ckpt_performed = false; ! bool do_restartpoint; ! ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; ! ! /* ! * Check if we should perform a checkpoint or a restartpoint. As a ! * side-effect, RecoveryInProgress() initializes TimeLineID if ! * it's not set yet. ! */ ! do_restartpoint = RecoveryInProgress(); ! ! /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! ! /* ! 
* The end-of-recovery checkpoint is a real checkpoint that's ! * performed while we're still in recovery. ! */ ! if (flags & CHECKPOINT_END_OF_RECOVERY) ! do_restartpoint = false; ! ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if (!do_restartpoint && ! (flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", ! "checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs, ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! if (!do_restartpoint) ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! ! /* ! * Do the checkpoint. ! */ ! if (!do_restartpoint) ! { ! CreateCheckPoint(flags); ! ckpt_performed = true; ! } ! else ! ckpt_performed = CreateRestartPoint(flags); ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! ! /* ! * Indicate checkpoint completion to any waiting backends. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! if (ckpt_performed) ! { ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven ! * checkpoints happen at a predictable spacing. ! */ ! last_checkpoint_time = now; ! } ! else ! { ! /* ! * We were not able to perform the restartpoint (checkpoints ! * throw an ERROR in case of error). Most likely because we ! 
* have not received any new checkpoint WAL records since the ! * last restartpoint. Try again in 15 s. ! */ ! last_checkpoint_time = now - CheckPointTimeout + 15; ! } ! ! ckpt_active = false; ! } ! else ! BgBufferSync(); ! ! /* Check for archive_timeout and switch xlog files if necessary. */ ! CheckArchiveTimeout(); /* Nap for the configured time. */ BgWriterNap(); --- 262,275 ---- * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* ! * Do one cycle of dirty-buffer writing. */ ! BgBufferSync(); /* Nap for the configured time. */ BgWriterNap(); *************** *** 549,609 **** BackgroundWriterMain(void) } /* - * CheckArchiveTimeout -- check for archive_timeout and switch xlog files - * - * This will switch to a new WAL file and force an archive file write - * if any activity is recorded in the current WAL file, including just - * a single checkpoint record. - */ - static void - CheckArchiveTimeout(void) - { - pg_time_t now; - pg_time_t last_time; - - if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) - return; - - now = (pg_time_t) time(NULL); - - /* First we do a quick check using possibly-stale local state. */ - if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) - return; - - /* - * Update local state ... note that last_xlog_switch_time is the last time - * a switch was performed *or requested*. - */ - last_time = GetLastSegSwitchTime(); - - last_xlog_switch_time = Max(last_xlog_switch_time, last_time); - - /* Now we can do the real check */ - if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) - { - XLogRecPtr switchpoint; - - /* OK, it's time to switch */ - switchpoint = RequestXLogSwitch(); - - /* - * If the returned pointer points exactly to a segment boundary, - * assume nothing happened. 
- */ - if ((switchpoint.xrecoff % XLogSegSize) != 0) - ereport(DEBUG1, - (errmsg("transaction log switch forced (archive_timeout=%d)", - XLogArchiveTimeout))); - - /* - * Update state in any case, so we don't retry constantly when the - * system is idle. - */ - last_xlog_switch_time = now; - } - } - - /* * BgWriterNap -- Nap for the configured time or until a signal is received. */ static void --- 277,282 ---- *************** *** 624,808 **** BgWriterNap(void) * respond reasonably promptly when someone signals us, break down the * sleep into 1-second increments, and check for interrupts after each * nap. - * - * We absorb pending requests after each short sleep. */ ! if (bgwriter_lru_maxpages > 0 || ckpt_active) udelay = BgWriterDelay * 1000L; - else if (XLogArchiveTimeout > 0) - udelay = 1000000L; /* One second */ else udelay = 10000000L; /* Ten seconds */ while (udelay > 999999L) { ! if (got_SIGHUP || shutdown_requested || ! (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); - AbsorbFsyncRequests(); udelay -= 1000000L; } ! if (!(got_SIGHUP || shutdown_requested || ! (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))) pg_usleep(udelay); } - /* - * Returns true if an immediate checkpoint request is pending. (Note that - * this does not check the *current* checkpoint's IMMEDIATE flag, but whether - * there is one pending behind it.) - */ - static bool - ImmediateCheckpointRequested(void) - { - if (checkpoint_requested) - { - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - /* - * We don't need to acquire the ckpt_lck in this case because we're - * only looking at a single flag bit. - */ - if (bgs->ckpt_flags & CHECKPOINT_IMMEDIATE) - return true; - } - return false; - } - - /* - * CheckpointWriteDelay -- yield control to bgwriter during a checkpoint - * - * This function is called after each page write performed by BufferSync(). 
- * It is responsible for keeping the bgwriter's normal activities in - * progress during a long checkpoint, and for throttling BufferSync()'s - * write rate to hit checkpoint_completion_target. - * - * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. - * - * 'progress' is an estimate of how much of the work has been done, as a - * fraction between 0.0 meaning none, and 1.0 meaning all done. - */ - void - CheckpointWriteDelay(int flags, double progress) - { - static int absorb_counter = WRITES_PER_ABSORB; - - /* Do nothing if checkpoint is being executed by non-bgwriter process */ - if (!am_bg_writer) - return; - - /* - * Perform the usual bgwriter duties and take a nap, unless we're behind - * schedule, in which case we just try to catch up as quickly as possible. - */ - if (!(flags & CHECKPOINT_IMMEDIATE) && - !shutdown_requested && - !ImmediateCheckpointRequested() && - IsCheckpointOnSchedule(progress)) - { - if (got_SIGHUP) - { - got_SIGHUP = false; - ProcessConfigFile(PGC_SIGHUP); - /* update global shmem state for sync rep */ - SyncRepUpdateSyncStandbysDefined(); - } - - AbsorbFsyncRequests(); - absorb_counter = WRITES_PER_ABSORB; - - BgBufferSync(); - CheckArchiveTimeout(); - BgWriterNap(); - } - else if (--absorb_counter <= 0) - { - /* - * Absorb pending fsync requests after each WRITES_PER_ABSORB write - * operations even when we don't sleep, to prevent overflow of the - * fsync request queue. - */ - AbsorbFsyncRequests(); - absorb_counter = WRITES_PER_ABSORB; - } - } - - /* - * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint - * in time? - * - * Compares the current progress against the time/segments elapsed since last - * checkpoint, and returns true if the progress we've made this far is greater - * than the elapsed time/segments. 
- */ - static bool - IsCheckpointOnSchedule(double progress) - { - XLogRecPtr recptr; - struct timeval now; - double elapsed_xlogs, - elapsed_time; - - Assert(ckpt_active); - - /* Scale progress according to checkpoint_completion_target. */ - progress *= CheckPointCompletionTarget; - - /* - * Check against the cached value first. Only do the more expensive - * calculations once we reach the target previously calculated. Since - * neither time or WAL insert pointer moves backwards, a freshly - * calculated value can only be greater than or equal to the cached value. - */ - if (progress < ckpt_cached_elapsed) - return false; - - /* - * Check progress against WAL segments written and checkpoint_segments. - * - * We compare the current WAL insert location against the location - * computed before calling CreateCheckPoint. The code in XLogInsert that - * actually triggers a checkpoint when checkpoint_segments is exceeded - * compares against RedoRecptr, so this is not completely accurate. - * However, it's good enough for our purposes, we're only calculating an - * estimate anyway. - */ - if (!RecoveryInProgress()) - { - recptr = GetInsertRecPtr(); - elapsed_xlogs = - (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + - ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / - CheckPointSegments; - - if (progress < elapsed_xlogs) - { - ckpt_cached_elapsed = elapsed_xlogs; - return false; - } - } - - /* - * Check progress against time elapsed and checkpoint_timeout. - */ - gettimeofday(&now, NULL); - elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + - now.tv_usec / 1000000.0) / CheckPointTimeout; - - if (progress < elapsed_time) - { - ckpt_cached_elapsed = elapsed_time; - return false; - } - - /* It looks like we're on schedule. 
*/ - return true; - } - - /* -------------------------------- * signal handler routines * -------------------------------- --- 297,320 ---- * respond reasonably promptly when someone signals us, break down the * sleep into 1-second increments, and check for interrupts after each * nap. */ ! if (bgwriter_lru_maxpages > 0) udelay = BgWriterDelay * 1000L; else udelay = 10000000L; /* Ten seconds */ while (udelay > 999999L) { ! if (got_SIGHUP || shutdown_requested) break; pg_usleep(1000000L); udelay -= 1000000L; } ! if (!(got_SIGHUP || shutdown_requested)) pg_usleep(udelay); } /* -------------------------------- * signal handler routines * -------------------------------- *************** *** 847,1287 **** BgSigHupHandler(SIGNAL_ARGS) got_SIGHUP = true; } - /* SIGINT: set flag to run a normal checkpoint right away */ - static void - ReqCheckpointHandler(SIGNAL_ARGS) - { - checkpoint_requested = true; - } - /* SIGUSR2: set flag to run a shutdown checkpoint and exit */ static void ReqShutdownHandler(SIGNAL_ARGS) { shutdown_requested = true; } - - - /* -------------------------------- - * communication with backends - * -------------------------------- - */ - - /* - * BgWriterShmemSize - * Compute space needed for bgwriter-related shared memory - */ - Size - BgWriterShmemSize(void) - { - Size size; - - /* - * Currently, the size of the requests[] array is arbitrarily set equal to - * NBuffers. This may prove too large or small ... 
- */ - size = offsetof(BgWriterShmemStruct, requests); - size = add_size(size, mul_size(NBuffers, sizeof(BgWriterRequest))); - - return size; - } - - /* - * BgWriterShmemInit - * Allocate and initialize bgwriter-related shared memory - */ - void - BgWriterShmemInit(void) - { - bool found; - - BgWriterShmem = (BgWriterShmemStruct *) - ShmemInitStruct("Background Writer Data", - BgWriterShmemSize(), - &found); - - if (!found) - { - /* First time through, so initialize */ - MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct)); - SpinLockInit(&BgWriterShmem->ckpt_lck); - BgWriterShmem->max_requests = NBuffers; - } - } - - /* - * RequestCheckpoint - * Called in backend processes to request a checkpoint - * - * flags is a bitwise OR of the following: - * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. - * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, - * ignoring checkpoint_completion_target parameter. - * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured - * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or - * CHECKPOINT_END_OF_RECOVERY). - * CHECKPOINT_WAIT: wait for completion before returning (otherwise, - * just signal bgwriter to do it, and return). - * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. - * (This affects logging, and in particular enables CheckPointWarning.) - */ - void - RequestCheckpoint(int flags) - { - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - int ntries; - int old_failed, - old_started; - - /* - * If in a standalone backend, just do it ourselves. - */ - if (!IsPostmasterEnvironment) - { - /* - * There's no point in doing slow checkpoints in a standalone backend, - * because there's no other backends the checkpoint could disrupt. - */ - CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); - - /* - * After any checkpoint, close all smgr files. 
This is so we won't - * hang onto smgr references to deleted files indefinitely. - */ - smgrcloseall(); - - return; - } - - /* - * Atomically set the request flags, and take a snapshot of the counters. - * When we see ckpt_started > old_started, we know the flags we set here - * have been seen by bgwriter. - * - * Note that we OR the flags with any existing flags, to avoid overriding - * a "stronger" request by another backend. The flag senses must be - * chosen to make this work! - */ - SpinLockAcquire(&bgs->ckpt_lck); - - old_failed = bgs->ckpt_failed; - old_started = bgs->ckpt_started; - bgs->ckpt_flags |= flags; - - SpinLockRelease(&bgs->ckpt_lck); - - /* - * Send signal to request checkpoint. It's possible that the bgwriter - * hasn't started yet, or is in process of restarting, so we will retry a - * few times if needed. Also, if not told to wait for the checkpoint to - * occur, we consider failure to send the signal to be nonfatal and merely - * LOG it. - */ - for (ntries = 0;; ntries++) - { - if (BgWriterShmem->bgwriter_pid == 0) - { - if (ntries >= 20) /* max wait 2.0 sec */ - { - elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, - "could not request checkpoint because bgwriter not running"); - break; - } - } - else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) - { - if (ntries >= 20) /* max wait 2.0 sec */ - { - elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, - "could not signal for checkpoint: %m"); - break; - } - } - else - break; /* signal sent successfully */ - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); /* wait 0.1 sec, then retry */ - } - - /* - * If requested, wait for completion. We detect completion according to - * the algorithm given above. - */ - if (flags & CHECKPOINT_WAIT) - { - int new_started, - new_failed; - - /* Wait for a new checkpoint to start. 
*/ - for (;;) - { - SpinLockAcquire(&bgs->ckpt_lck); - new_started = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); - - if (new_started != old_started) - break; - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); - } - - /* - * We are waiting for ckpt_done >= new_started, in a modulo sense. - */ - for (;;) - { - int new_done; - - SpinLockAcquire(&bgs->ckpt_lck); - new_done = bgs->ckpt_done; - new_failed = bgs->ckpt_failed; - SpinLockRelease(&bgs->ckpt_lck); - - if (new_done - new_started >= 0) - break; - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); - } - - if (new_failed != old_failed) - ereport(ERROR, - (errmsg("checkpoint request failed"), - errhint("Consult recent messages in the server log for details."))); - } - } - - /* - * ForwardFsyncRequest - * Forward a file-fsync request from a backend to the bgwriter - * - * Whenever a backend is compelled to write directly to a relation - * (which should be seldom, if the bgwriter is getting its job done), - * the backend calls this routine to pass over knowledge that the relation - * is dirty and must be fsync'd before next checkpoint. We also use this - * opportunity to count such writes for statistical purposes. - * - * segno specifies which segment (not block!) of the relation needs to be - * fsync'd. (Since the valid range is much less than BlockNumber, we can - * use high values for special flags; that's all internal to md.c, which - * see for details.) - * - * To avoid holding the lock for longer than necessary, we normally write - * to the requests[] queue without checking for duplicates. The bgwriter - * will have to eliminate dups internally anyway. However, if we discover - * that the queue is full, we make a pass over the entire queue to compact - * it. This is somewhat expensive, but the alternative is for the backend - * to perform its own fsync, which is far more expensive in practice. 
It - * is theoretically possible a backend fsync might still be necessary, if - * the queue is full and contains no duplicate entries. In that case, we - * let the backend know by returning false. - */ - bool - ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) - { - BgWriterRequest *request; - - if (!IsUnderPostmaster) - return false; /* probably shouldn't even get here */ - - if (am_bg_writer) - elog(ERROR, "ForwardFsyncRequest must not be called in bgwriter"); - - LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); - - /* Count all backend writes regardless of if they fit in the queue */ - BgWriterShmem->num_backend_writes++; - - /* - * If the background writer isn't running or the request queue is full, - * the backend will have to perform its own fsync request. But before - * forcing that to happen, we can try to compact the background writer - * request queue. - */ - if (BgWriterShmem->bgwriter_pid == 0 || - (BgWriterShmem->num_requests >= BgWriterShmem->max_requests - && !CompactBgwriterRequestQueue())) - { - /* - * Count the subset of writes where backends have to do their own - * fsync - */ - BgWriterShmem->num_backend_fsync++; - LWLockRelease(BgWriterCommLock); - return false; - } - request = &BgWriterShmem->requests[BgWriterShmem->num_requests++]; - request->rnode = rnode; - request->forknum = forknum; - request->segno = segno; - LWLockRelease(BgWriterCommLock); - return true; - } - - /* - * CompactBgwriterRequestQueue - * Remove duplicates from the request queue to avoid backend fsyncs. - * - * Although a full fsync request queue is not common, it can lead to severe - * performance problems when it does happen. So far, this situation has - * only been observed to occur when the system is under heavy write load, - * and especially during the "sync" phase of a checkpoint. Without this - * logic, each backend begins doing an fsync for every block written, which - * gets very expensive and can slow down the whole system. 
- * - * Trying to do this every time the queue is full could lose if there - * aren't any removable entries. But should be vanishingly rare in - * practice: there's one queue entry per shared buffer. - */ - static bool - CompactBgwriterRequestQueue() - { - struct BgWriterSlotMapping - { - BgWriterRequest request; - int slot; - }; - - int n, - preserve_count; - int num_skipped = 0; - HASHCTL ctl; - HTAB *htab; - bool *skip_slot; - - /* must hold BgWriterCommLock in exclusive mode */ - Assert(LWLockHeldByMe(BgWriterCommLock)); - - /* Initialize temporary hash table */ - MemSet(&ctl, 0, sizeof(ctl)); - ctl.keysize = sizeof(BgWriterRequest); - ctl.entrysize = sizeof(struct BgWriterSlotMapping); - ctl.hash = tag_hash; - htab = hash_create("CompactBgwriterRequestQueue", - BgWriterShmem->num_requests, - &ctl, - HASH_ELEM | HASH_FUNCTION); - - /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests); - - /* - * The basic idea here is that a request can be skipped if it's followed - * by a later, identical request. It might seem more sensible to work - * backwards from the end of the queue and check whether a request is - * *preceded* by an earlier, identical request, in the hopes of doing less - * copying. But that might change the semantics, if there's an - * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so - * we do it this way. It would be possible to be even smarter if we made - * the code below understand the specific semantics of such requests (it - * could blow away preceding entries that would end up being canceled - * anyhow), but it's not clear that the extra complexity would buy us - * anything. 
- */ - for (n = 0; n < BgWriterShmem->num_requests; ++n) - { - BgWriterRequest *request; - struct BgWriterSlotMapping *slotmap; - bool found; - - request = &BgWriterShmem->requests[n]; - slotmap = hash_search(htab, request, HASH_ENTER, &found); - if (found) - { - skip_slot[slotmap->slot] = true; - ++num_skipped; - } - slotmap->slot = n; - } - - /* Done with the hash table. */ - hash_destroy(htab); - - /* If no duplicates, we're out of luck. */ - if (!num_skipped) - { - pfree(skip_slot); - return false; - } - - /* We found some duplicates; remove them. */ - for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n) - { - if (skip_slot[n]) - continue; - BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n]; - } - ereport(DEBUG1, - (errmsg("compacted fsync request queue from %d entries to %d entries", - BgWriterShmem->num_requests, preserve_count))); - BgWriterShmem->num_requests = preserve_count; - - /* Cleanup. */ - pfree(skip_slot); - return true; - } - - /* - * AbsorbFsyncRequests - * Retrieve queued fsync requests and pass them to local smgr. - * - * This is exported because it must be called during CreateCheckPoint; - * we have to be sure we have accepted all pending requests just before - * we start fsync'ing. Since CreateCheckPoint sometimes runs in - * non-bgwriter processes, do nothing if not bgwriter. - */ - void - AbsorbFsyncRequests(void) - { - BgWriterRequest *requests = NULL; - BgWriterRequest *request; - int n; - - if (!am_bg_writer) - return; - - /* - * We have to PANIC if we fail to absorb all the pending requests (eg, - * because our hashtable runs out of memory). This is because the system - * cannot run safely if we are unable to fsync what we have been told to - * fsync. Fortunately, the hashtable is so small that the problem is - * quite unlikely to arise in practice. - */ - START_CRIT_SECTION(); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array. 
- */ - LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); - - /* Transfer write count into pending pgstats message */ - BgWriterStats.m_buf_written_backend += BgWriterShmem->num_backend_writes; - BgWriterStats.m_buf_fsync_backend += BgWriterShmem->num_backend_fsync; - - BgWriterShmem->num_backend_writes = 0; - BgWriterShmem->num_backend_fsync = 0; - - n = BgWriterShmem->num_requests; - if (n > 0) - { - requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest)); - memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest)); - } - BgWriterShmem->num_requests = 0; - - LWLockRelease(BgWriterCommLock); - - for (request = requests; n > 0; request++, n--) - RememberFsyncRequest(request->rnode, request->forknum, request->segno); - - if (requests) - pfree(requests); - - END_CRIT_SECTION(); - } --- 359,367 ---- *** a/src/backend/postmaster/postmaster.c --- b/src/backend/postmaster/postmaster.c *************** *** 206,211 **** bool restart_after_crash = true; --- 206,212 ---- /* PIDs of special child processes; 0 when not running */ static pid_t StartupPID = 0, BgWriterPID = 0, + CheckpointerPID = 0, WalWriterPID = 0, WalReceiverPID = 0, AutoVacPID = 0, *************** *** 277,283 **** typedef enum PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_READONLY, /* waiting for read only backends to exit */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ ! PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */ PM_SHUTDOWN_2, /* waiting for archiver and walsenders to * finish */ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ --- 278,284 ---- PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_READONLY, /* waiting for read only backends to exit */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ ! 
PM_SHUTDOWN, /* waiting for checkpointer to do shutdown ckpt */ PM_SHUTDOWN_2, /* waiting for archiver and walsenders to * finish */ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ *************** *** 462,467 **** static void ShmemBackendArrayRemove(Backend *bn); --- 463,469 ---- #define StartupDataBase() StartChildProcess(StartupProcess) #define StartBackgroundWriter() StartChildProcess(BgWriterProcess) + #define StartCheckpointer() StartChildProcess(CheckpointerProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) *************** *** 1029,1036 **** PostmasterMain(int argc, char *argv[]) * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, ! * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and ! * postmaster/syslogger.c. */ pqinitmask(); PG_SETMASK(&BlockSig); --- 1031,1038 ---- * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, ! * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, ! * postmaster/syslogger.c and postmaster/checkpointer.c */ pqinitmask(); PG_SETMASK(&BlockSig); *************** *** 1367,1376 **** ServerLoop(void) * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. */ ! if (BgWriterPID == 0 && ! (pmState == PM_RUN || pmState == PM_RECOVERY || ! pmState == PM_HOT_STANDBY)) ! BgWriterPID = StartBackgroundWriter(); /* * Likewise, if we have lost the walwriter process, try to start a new --- 1369,1382 ---- * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. */ ! if (pmState == PM_RUN || pmState == PM_RECOVERY || ! 
pmState == PM_HOT_STANDBY) ! { ! if (BgWriterPID == 0) ! BgWriterPID = StartBackgroundWriter(); ! if (CheckpointerPID == 0) ! CheckpointerPID = StartCheckpointer(); ! } /* * Likewise, if we have lost the walwriter process, try to start a new *************** *** 2048,2053 **** SIGHUP_handler(SIGNAL_ARGS) --- 2054,2061 ---- signal_child(StartupPID, SIGHUP); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGHUP); if (WalWriterPID != 0) signal_child(WalWriterPID, SIGHUP); if (WalReceiverPID != 0) *************** *** 2162,2168 **** pmdie(SIGNAL_ARGS) signal_child(WalReceiverPID, SIGTERM); if (pmState == PM_RECOVERY) { ! /* only bgwriter is active in this state */ pmState = PM_WAIT_BACKENDS; } else if (pmState == PM_RUN || --- 2170,2176 ---- signal_child(WalReceiverPID, SIGTERM); if (pmState == PM_RECOVERY) { ! /* only bgwriter and checkpointer are active in this state */ pmState = PM_WAIT_BACKENDS; } else if (pmState == PM_RUN || *************** *** 2207,2212 **** pmdie(SIGNAL_ARGS) --- 2215,2222 ---- signal_child(StartupPID, SIGQUIT); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGQUIT); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGQUIT); if (WalWriterPID != 0) signal_child(WalWriterPID, SIGQUIT); if (WalReceiverPID != 0) *************** *** 2337,2348 **** reaper(SIGNAL_ARGS) } /* ! * Crank up the background writer, if we didn't do that already * when we entered consistent recovery state. It doesn't matter * if this fails, we'll just try again later.
*/ if (BgWriterPID == 0) BgWriterPID = StartBackgroundWriter(); + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); /* * Likewise, start other special children as needed. In a restart *************** *** 2370,2379 **** reaper(SIGNAL_ARGS) if (pid == BgWriterPID) { BgWriterPID = 0; if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) { /* ! * OK, we saw normal exit of the bgwriter after it's been told * to shut down. We expect that it wrote a shutdown * checkpoint. (If for some reason it didn't, recovery will * occur on next postmaster start.) --- 2382,2403 ---- if (pid == BgWriterPID) { BgWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("background writer process")); + continue; + } + + /* + * Was it the checkpointer? + */ + if (pid == CheckpointerPID) + { + CheckpointerPID = 0; if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) { /* ! * OK, we saw normal exit of the checkpointer after it's been told * to shut down. We expect that it wrote a shutdown * checkpoint. (If for some reason it didn't, recovery will * occur on next postmaster start.) *************** *** 2410,2420 **** reaper(SIGNAL_ARGS) else { /* ! * Any unexpected exit of the bgwriter (including FATAL exit) * is treated as a crash. */ HandleChildCrash(pid, exitstatus, ! _("background writer process")); } continue; --- 2434,2444 ---- else { /* ! * Any unexpected exit of the checkpointer (including FATAL exit) * is treated as a crash. */ HandleChildCrash(pid, exitstatus, ! _("checkpointer process")); } continue; *************** *** 2598,2605 **** CleanupBackend(int pid, } /* ! * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter, ! * or autovacuum. * * The objectives here are to clean up our local state about the child * process, and to signal all other remaining children to quickdie. --- 2622,2629 ---- } /* ! * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer, ! * walwriter or autovacuum. 
* * The objectives here are to clean up our local state about the child * process, and to signal all other remaining children to quickdie. *************** *** 2692,2697 **** HandleChildCrash(int pid, int exitstatus, const char *procname) --- 2716,2733 ---- signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the checkpointer too */ + if (pid == CheckpointerPID) + CheckpointerPID = 0; + else if (CheckpointerPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) CheckpointerPID))); + signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + /* Take care of the walwriter too */ if (pid == WalWriterPID) WalWriterPID = 0; *************** *** 2871,2879 **** PostmasterStateMachine(void) { /* * PM_WAIT_BACKENDS state ends when we have no regular backends ! * (including autovac workers) and no walwriter or autovac launcher. ! * If we are doing crash recovery then we expect the bgwriter to exit ! * too, otherwise not. The archiver, stats, and syslogger processes * are disregarded since they are not connected to shared memory; we * also disregard dead_end children here. Walsenders are also * disregarded, they will be terminated later after writing the --- 2907,2916 ---- { /* * PM_WAIT_BACKENDS state ends when we have no regular backends ! * (including autovac workers) and no walwriter, autovac launcher ! * or bgwriter. If we are doing crash recovery then we expect the ! * checkpointer to exit as well, otherwise not. ! * The archiver, stats, and syslogger processes * are disregarded since they are not connected to shared memory; we * also disregard dead_end children here. Walsenders are also * disregarded, they will be terminated later after writing the *************** *** 2882,2888 **** PostmasterStateMachine(void) if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 && StartupPID == 0 && WalReceiverPID == 0 && ! 
(BgWriterPID == 0 || !FatalError) && WalWriterPID == 0 && AutoVacPID == 0) { --- 2919,2926 ---- if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 && StartupPID == 0 && WalReceiverPID == 0 && ! BgWriterPID == 0 && ! (CheckpointerPID == 0 || !FatalError) && WalWriterPID == 0 && AutoVacPID == 0) { *************** *** 2904,2925 **** PostmasterStateMachine(void) /* * If we get here, we are proceeding with normal shutdown. All * the regular children are gone, and it's time to tell the ! * bgwriter to do a shutdown checkpoint. */ Assert(Shutdown > NoShutdown); ! /* Start the bgwriter if not running */ ! if (BgWriterPID == 0) ! BgWriterPID = StartBackgroundWriter(); /* And tell it to shut down */ ! if (BgWriterPID != 0) { ! signal_child(BgWriterPID, SIGUSR2); pmState = PM_SHUTDOWN; } else { /* ! * If we failed to fork a bgwriter, just shut down. Any * required cleanup will happen at next restart. We set * FatalError so that an "abnormal shutdown" message gets * logged when we exit. --- 2942,2963 ---- /* * If we get here, we are proceeding with normal shutdown. All * the regular children are gone, and it's time to tell the ! * checkpointer to do a shutdown checkpoint. */ Assert(Shutdown > NoShutdown); ! /* Start the checkpointer if not running */ ! if (CheckpointerPID == 0) ! CheckpointerPID = StartCheckpointer(); /* And tell it to shut down */ ! if (CheckpointerPID != 0) { ! signal_child(CheckpointerPID, SIGUSR2); pmState = PM_SHUTDOWN; } else { /* ! * If we failed to fork a checkpointer, just shut down. Any * required cleanup will happen at next restart. We set * FatalError so that an "abnormal shutdown" message gets * logged when we exit. 
*************** *** 2978,2983 **** PostmasterStateMachine(void) --- 3016,3022 ---- Assert(StartupPID == 0); Assert(WalReceiverPID == 0); Assert(BgWriterPID == 0); + Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); /* syslogger is not considered here */ *************** *** 4157,4162 **** sigusr1_handler(SIGNAL_ARGS) --- 4196,4203 ---- */ Assert(BgWriterPID == 0); BgWriterPID = StartBackgroundWriter(); + Assert(CheckpointerPID == 0); + CheckpointerPID = StartCheckpointer(); pmState = PM_RECOVERY; } *************** *** 4443,4448 **** StartChildProcess(AuxProcType type) --- 4484,4493 ---- ereport(LOG, (errmsg("could not fork background writer process: %m"))); break; + case CheckpointerProcess: + ereport(LOG, + (errmsg("could not fork checkpointer process: %m"))); + break; case WalWriterProcess: ereport(LOG, (errmsg("could not fork WAL writer process: %m"))); *** a/src/backend/storage/buffer/bufmgr.c --- b/src/backend/storage/buffer/bufmgr.c *************** *** 1278,1288 **** BufferSync(int flags) break; /* ! * Perform normal bgwriter duties and sleep to throttle our ! * I/O rate. */ ! CheckpointWriteDelay(flags, ! (double) num_written / num_to_write); } } --- 1278,1286 ---- break; /* ! * Sleep to throttle our I/O rate. */ ! CheckpointWriteDelay(flags, num_written, num_to_write); } } *** a/src/backend/storage/smgr/md.c --- b/src/backend/storage/smgr/md.c *************** *** 38,44 **** /* * Special values for the segno arg to RememberFsyncRequest. * ! * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. * See comments there before making changes here. */ --- 38,44 ---- /* * Special values for the segno arg to RememberFsyncRequest. * ! * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. * See comments there before making changes here.
*/ *************** *** 77,83 **** * Inactive segments are those that once contained data but are currently * not needed because of an mdtruncate() operation. The reason for leaving * them present at size zero, rather than unlinking them, is that other ! * backends and/or the bgwriter might be holding open file references to * such segments. If the relation expands again after mdtruncate(), such * that a deactivated segment becomes active again, it is important that * such file references still be valid --- else data might get written --- 77,83 ---- * Inactive segments are those that once contained data but are currently * not needed because of an mdtruncate() operation. The reason for leaving * them present at size zero, rather than unlinking them, is that other ! * backends and/or the checkpointer might be holding open file references to * such segments. If the relation expands again after mdtruncate(), such * that a deactivated segment becomes active again, it is important that * such file references still be valid --- else data might get written *************** *** 111,117 **** static MemoryContext MdCxt; /* context for all md.c allocations */ /* ! * In some contexts (currently, standalone backends and the bgwriter process) * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash --- 111,117 ---- /* ! * In some contexts (currently, standalone backends and the checkpointer process) * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash *************** *** 123,129 **** static MemoryContext MdCxt; /* context for all md.c allocations */ * a hash table, because we don't expect there to be any duplicate requests. 
* * (Regular backends do not track pending operations locally, but forward ! * them to the bgwriter.) */ typedef struct { --- 123,129 ---- * a hash table, because we don't expect there to be any duplicate requests. * * (Regular backends do not track pending operations locally, but forward ! * them to the checkpointer.) */ typedef struct { *************** *** 194,200 **** mdinit(void) * Create pending-operations hashtable if we need it. Currently, we need * it if we are standalone (not under a postmaster) OR if we are a * bootstrap-mode subprocess of a postmaster (that is, a startup or ! * bgwriter process). */ if (!IsUnderPostmaster || IsBootstrapProcessingMode()) { --- 194,200 ---- * Create pending-operations hashtable if we need it. Currently, we need * it if we are standalone (not under a postmaster) OR if we are a * bootstrap-mode subprocess of a postmaster (that is, a startup or ! * checkpointer process). */ if (!IsUnderPostmaster || IsBootstrapProcessingMode()) { *************** *** 214,223 **** mdinit(void) } /* ! * In archive recovery, we rely on bgwriter to do fsyncs, but we will have * already created the pendingOpsTable during initialization of the startup * process. Calling this function drops the local pendingOpsTable so that ! * subsequent requests will be forwarded to bgwriter. */ void SetForwardFsyncRequests(void) --- 214,223 ---- } /* ! * In archive recovery, we rely on checkpointer to do fsyncs, but we will have * already created the pendingOpsTable during initialization of the startup * process. Calling this function drops the local pendingOpsTable so that ! * subsequent requests will be forwarded to checkpointer. */ void SetForwardFsyncRequests(void) *************** *** 765,773 **** mdnblocks(SMgrRelation reln, ForkNumber forknum) * NOTE: this assumption could only be wrong if another backend has * truncated the relation. We rely on higher code levels to handle that * scenario by closing and re-opening the md fd, which is handled via ! 
* relcache flush. (Since the bgwriter doesn't participate in relcache * flush, it could have segment chain entries for inactive segments; ! * that's OK because the bgwriter never needs to compute relation size.) */ while (v->mdfd_chain != NULL) { --- 765,773 ---- * NOTE: this assumption could only be wrong if another backend has * truncated the relation. We rely on higher code levels to handle that * scenario by closing and re-opening the md fd, which is handled via ! * relcache flush. (Since the checkpointer doesn't participate in relcache * flush, it could have segment chain entries for inactive segments; ! * that's OK because the checkpointer never needs to compute relation size.) */ while (v->mdfd_chain != NULL) { *************** *** 957,963 **** mdsync(void) elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just --- 957,963 ---- elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * If we are in the checkpointer, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just *************** *** 1033,1039 **** mdsync(void) int failures; /* ! * If in bgwriter, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to --- 1033,1039 ---- int failures; /* ! * If in checkpointer, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. 
It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to *************** *** 1070,1078 **** mdsync(void) * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the ! * bgwriter, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path ! * either, which is a good thing since in non-bgwriter cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. --- 1070,1078 ---- * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the ! * checkpointer, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path ! * either, which is a good thing since in non-checkpointer cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. *************** *** 1301,1307 **** register_unlink(RelFileNodeBackend rnode) else { /* ! * Notify the bgwriter about it. If we fail to queue the request * message, we have to sleep and try again, because we can't simply * delete the file now. Ugly, but hopefully won't happen often. * --- 1301,1307 ---- else { /* ! * Notify the checkpointer about it. If we fail to queue the request * message, we have to sleep and try again, because we can't simply * delete the file now. Ugly, but hopefully won't happen often. * *************** *** 1315,1324 **** register_unlink(RelFileNodeBackend rnode) } /* ! 
* RememberFsyncRequest() -- callback from bgwriter side of fsync request * * We stuff most fsync requests into the local hash table for execution ! * during the bgwriter's next checkpoint. UNLINK requests go into a * separate linked list, however, because they get processed separately. * * The range of possible segment numbers is way less than the range of --- 1315,1324 ---- } /* ! * RememberFsyncRequest() -- callback from checkpointer side of fsync request * * We stuff most fsync requests into the local hash table for execution ! * during the checkpointer's next checkpoint. UNLINK requests go into a * separate linked list, however, because they get processed separately. * * The range of possible segment numbers is way less than the range of *************** *** 1460,1479 **** ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) else if (IsUnderPostmaster) { /* ! * Notify the bgwriter about it. If we fail to queue the revoke * message, we have to sleep and try again ... ugly, but hopefully * won't happen often. * * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an * error would leave the no-longer-used file still present on disk, ! * which would be bad, so I'm inclined to assume that the bgwriter * will always empty the queue soon. */ while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) pg_usleep(10000L); /* 10 msec seems a good number */ /* ! * Note we don't wait for the bgwriter to actually absorb the revoke * message; see mdsync() for the implications. */ } --- 1460,1479 ---- else if (IsUnderPostmaster) { /* ! * Notify the checkpointer about it. If we fail to queue the revoke * message, we have to sleep and try again ... ugly, but hopefully * won't happen often. * * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an * error would leave the no-longer-used file still present on disk, ! * which would be bad, so I'm inclined to assume that the checkpointer * will always empty the queue soon. 
*/ while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) pg_usleep(10000L); /* 10 msec seems a good number */ /* ! * Note we don't wait for the checkpointer to actually absorb the revoke * message; see mdsync() for the implications. */ } *** a/src/include/access/xlog_internal.h --- b/src/include/access/xlog_internal.h *************** *** 256,262 **** typedef struct RmgrData extern const RmgrData RmgrTable[]; /* ! * Exported to support xlog switching from bgwriter */ extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); --- 256,262 ---- extern const RmgrData RmgrTable[]; /* ! * Exported to support xlog switching from checkpointer */ extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); *** a/src/include/bootstrap/bootstrap.h --- b/src/include/bootstrap/bootstrap.h *************** *** 22,27 **** typedef enum --- 22,28 ---- BootstrapProcess, StartupProcess, BgWriterProcess, + CheckpointerProcess, WalWriterProcess, WalReceiverProcess, *** a/src/include/postmaster/bgwriter.h --- b/src/include/postmaster/bgwriter.h *************** *** 23,31 **** extern int CheckPointWarning; extern double CheckPointCompletionTarget; extern void BackgroundWriterMain(void); extern void RequestCheckpoint(int flags); ! extern void CheckpointWriteDelay(int flags, double progress); extern bool ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno); --- 23,32 ---- extern double CheckPointCompletionTarget; extern void BackgroundWriterMain(void); + extern void CheckpointerMain(void); extern void RequestCheckpoint(int flags); ! 
extern void CheckpointWriteDelay(int flags, int num_written, int num_to_write); extern bool ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno); *** a/src/include/storage/proc.h --- b/src/include/storage/proc.h *************** *** 190,200 **** extern PROC_HDR *ProcGlobal; * We set aside some extra PGPROC structures for auxiliary processes, * ie things that aren't full-fledged backends but need shmem access. * ! * Background writer and WAL writer run during normal operation. Startup ! * process and WAL receiver also consume 2 slots, but WAL writer is ! * launched only after startup has exited, so we only need 3 slots. */ ! #define NUM_AUXILIARY_PROCS 3 /* configurable options */ --- 190,200 ---- * We set aside some extra PGPROC structures for auxiliary processes, * ie things that aren't full-fledged backends but need shmem access. * ! * Background writer, checkpointer and WAL writer run during normal operation. ! * Startup process and WAL receiver also consume 2 slots, but WAL writer is ! * launched only after startup has exited, so we only need 4 slots. */ ! #define NUM_AUXILIARY_PROCS 4 /* configurable options */ *** a/src/include/storage/procsignal.h --- b/src/include/storage/procsignal.h *************** *** 19,25 **** /* * Reasons for signalling a Postgres child process (a backend or an auxiliary ! * process, like bgwriter). We can cope with concurrent signals for different * reasons. However, if the same reason is signaled multiple times in quick * succession, the process is likely to observe only one notification of it. * This is okay for the present uses. --- 19,25 ---- /* * Reasons for signalling a Postgres child process (a backend or an auxiliary ! * process, like checkpointer). We can cope with concurrent signals for different * reasons. However, if the same reason is signaled multiple times in quick * succession, the process is likely to observe only one notification of it. * This is okay for the present uses.