Home > mailing lists
Re: [HACKERS] Unportable implementation of background worker start - Mailing list pgsql-hackers

From	Tom Lane
Subject	Re: [HACKERS] Unportable implementation of background worker start
Date	April 22, 2017 01:28:47
Msg-id	4905.1492813727@sss.pgh.pa.us Whole thread Raw
In response to	Re: [HACKERS] Unportable implementation of background worker start (Alvaro Herrera <alvherre@2ndquadrant.com>)
List	pgsql-hackers
Tree view
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
> Tom Lane wrote:
>> It also appears to me that do_start_bgworker's treatment of fork
>> failure is completely brain dead.  Did anyone really think about
>> that case?

> Hmm, I probably modelled it on autovacuum without giving that case much
> additional consideration.

Attached is a proposed patch that should make fork failure behave
sanely, ie it works much the same as a worker crash immediately after
launch.  I also refactored things a bit to make do_start_bgworker
fully responsible for updating the RegisteredBgWorker's state,
rather than doing just some of it as before.

I tested this by hot-wiring the fork_process call to fail some of
the time, which showed that the postmaster now seems to recover OK,
but parallel.c's logic is completely innocent of the idea that
worker-startup failure is possible.  The leader backend just freezes,
and nothing short of kill -9 on that backend will get you out of it.
Fixing that seems like material for a separate patch though.

            regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c382345..8461c24 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static void TerminateChildren(int signal
*** 420,425 ****
--- 420,426 ----
  #define SignalChildren(sig)               SignalSomeChildren(sig, BACKEND_TYPE_ALL)

  static int    CountChildren(int target);
+ static bool assign_backendlist_entry(RegisteredBgWorker *rw);
  static void maybe_start_bgworker(void);
  static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
  static pid_t StartChildProcess(AuxProcType type);
*************** bgworker_forkexec(int shmem_slot)
*** 5531,5543 ****
   * Start a new bgworker.
   * Starting time conditions must have been checked already.
   *
   * This code is heavily based on autovacuum.c, q.v.
   */
! static void
  do_start_bgworker(RegisteredBgWorker *rw)
  {
      pid_t        worker_pid;

      ereport(DEBUG1,
              (errmsg("starting background worker process \"%s\"",
                      rw->rw_worker.bgw_name)));
--- 5532,5564 ----
   * Start a new bgworker.
   * Starting time conditions must have been checked already.
   *
+  * Returns true on success, false on failure.
+  * In either case, update the RegisteredBgWorker's state appropriately.
+  *
   * This code is heavily based on autovacuum.c, q.v.
   */
! static bool
  do_start_bgworker(RegisteredBgWorker *rw)
  {
      pid_t        worker_pid;

+     Assert(rw->rw_pid == 0);
+
+     /*
+      * Allocate and assign the Backend element.  Note we must do this before
+      * forking, so that we can handle out of memory properly.
+      *
+      * Treat failure as though the worker had crashed.  That way, the
+      * postmaster will wait a bit before attempting to start it again; if it
+      * tried again right away, most likely it'd find itself repeating the
+      * out-of-memory or fork failure condition.
+      */
+     if (!assign_backendlist_entry(rw))
+     {
+         rw->rw_crashed_at = GetCurrentTimestamp();
+         return false;
+     }
+
      ereport(DEBUG1,
              (errmsg("starting background worker process \"%s\"",
                      rw->rw_worker.bgw_name)));
*************** do_start_bgworker(RegisteredBgWorker *rw
*** 5549,5557 ****
  #endif
      {
          case -1:
              ereport(LOG,
                      (errmsg("could not fork worker process: %m")));
!             return;

  #ifndef EXEC_BACKEND
          case 0:
--- 5570,5586 ----
  #endif
      {
          case -1:
+             /* in postmaster, fork failed ... */
              ereport(LOG,
                      (errmsg("could not fork worker process: %m")));
!             /* undo what assign_backendlist_entry did */
!             ReleasePostmasterChildSlot(rw->rw_child_slot);
!             rw->rw_child_slot = 0;
!             free(rw->rw_backend);
!             rw->rw_backend = NULL;
!             /* mark entry as crashed, so we'll try again later */
!             rw->rw_crashed_at = GetCurrentTimestamp();
!             break;

  #ifndef EXEC_BACKEND
          case 0:
*************** do_start_bgworker(RegisteredBgWorker *rw
*** 5575,5588 ****
              PostmasterContext = NULL;

              StartBackgroundWorker();
              break;
  #endif
          default:
              rw->rw_pid = worker_pid;
              rw->rw_backend->pid = rw->rw_pid;
              ReportBackgroundWorkerPID(rw);
!             break;
      }
  }

  /*
--- 5604,5627 ----
              PostmasterContext = NULL;

              StartBackgroundWorker();
+
+             exit(1);            /* should not get here */
              break;
  #endif
          default:
+             /* in postmaster, fork successful ... */
              rw->rw_pid = worker_pid;
              rw->rw_backend->pid = rw->rw_pid;
              ReportBackgroundWorkerPID(rw);
!             /* add new worker to lists of backends */
!             dlist_push_head(&BackendList, &rw->rw_backend->elem);
! #ifdef EXEC_BACKEND
!             ShmemBackendArrayAdd(rw->rw_backend);
! #endif
!             return true;
      }
+
+     return false;
  }

  /*
*************** bgworker_should_start_now(BgWorkerStartT
*** 5629,5634 ****
--- 5668,5675 ----
   * Allocate the Backend struct for a connected background worker, but don't
   * add it to the list of backends just yet.
   *
+  * On failure, return false without changing any worker state.
+  *
   * Some info from the Backend is copied into the passed rw.
   */
  static bool
*************** assign_backendlist_entry(RegisteredBgWor
*** 5647,5654 ****
          ereport(LOG,
                  (errcode(ERRCODE_INTERNAL_ERROR),
                   errmsg("could not generate random cancel key")));
-
-         rw->rw_crashed_at = GetCurrentTimestamp();
          return false;
      }

--- 5688,5693 ----
*************** assign_backendlist_entry(RegisteredBgWor
*** 5658,5671 ****
          ereport(LOG,
                  (errcode(ERRCODE_OUT_OF_MEMORY),
                   errmsg("out of memory")));
-
-         /*
-          * The worker didn't really crash, but setting this nonzero makes
-          * postmaster wait a bit before attempting to start it again; if it
-          * tried again right away, most likely it'd find itself under the same
-          * memory pressure.
-          */
-         rw->rw_crashed_at = GetCurrentTimestamp();
          return false;
      }

--- 5697,5702 ----
*************** assign_backendlist_entry(RegisteredBgWor
*** 5687,5692 ****
--- 5718,5728 ----
   * As a side effect, the bgworker control variables are set or reset whenever
   * there are more workers to start after this one, and whenever the overall
   * system state requires it.
+  *
+  * The reason we start at most one worker per call is to avoid consuming the
+  * postmaster's attention for too long when many such requests are pending.
+  * As long as StartWorkerNeeded is true, ServerLoop will not block and will
+  * call this function again after dealing with any other issues.
   */
  static void
  maybe_start_bgworker(void)
*************** maybe_start_bgworker(void)
*** 5694,5706 ****
      slist_mutable_iter iter;
      TimestampTz now = 0;

      if (FatalError)
      {
          StartWorkerNeeded = false;
          HaveCrashedWorker = false;
!         return;                    /* not yet */
      }

      HaveCrashedWorker = false;

      slist_foreach_modify(iter, &BackgroundWorkerList)
--- 5730,5748 ----
      slist_mutable_iter iter;
      TimestampTz now = 0;

+     /*
+      * During crash recovery, we have no need to be called until the state
+      * transition out of recovery.
+      */
      if (FatalError)
      {
          StartWorkerNeeded = false;
          HaveCrashedWorker = false;
!         return;
      }

+     /* Don't need to be called again unless we find a reason for it below */
+     StartWorkerNeeded = false;
      HaveCrashedWorker = false;

      slist_foreach_modify(iter, &BackgroundWorkerList)
*************** maybe_start_bgworker(void)
*** 5709,5719 ****

          rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);

!         /* already running? */
          if (rw->rw_pid != 0)
              continue;

!         /* marked for death? */
          if (rw->rw_terminate)
          {
              ForgetBackgroundWorker(&iter);
--- 5751,5761 ----

          rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);

!         /* ignore if already running */
          if (rw->rw_pid != 0)
              continue;

!         /* if marked for death, clean up and remove from list */
          if (rw->rw_terminate)
          {
              ForgetBackgroundWorker(&iter);
*************** maybe_start_bgworker(void)
*** 5735,5746 ****
--- 5777,5790 ----
                  continue;
              }

+             /* read system time only when needed */
              if (now == 0)
                  now = GetCurrentTimestamp();

              if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now,
                                        rw->rw_worker.bgw_restart_time * 1000))
              {
+                 /* Set flag to remember that we have workers to start later */
                  HaveCrashedWorker = true;
                  continue;
              }
*************** maybe_start_bgworker(void)
*** 5748,5782 ****

          if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
          {
!             /* reset crash time before calling assign_backendlist_entry */
              rw->rw_crashed_at = 0;

              /*
!              * Allocate and assign the Backend element.  Note we must do this
!              * before forking, so that we can handle out of memory properly.
               */
!             if (!assign_backendlist_entry(rw))
                  return;
!
!             do_start_bgworker(rw);        /* sets rw->rw_pid */
!
!             dlist_push_head(&BackendList, &rw->rw_backend->elem);
! #ifdef EXEC_BACKEND
!             ShmemBackendArrayAdd(rw->rw_backend);
! #endif

              /*
!              * Have ServerLoop call us again.  Note that there might not
!              * actually *be* another runnable worker, but we don't care all
!              * that much; we will find out the next time we run.
               */
              StartWorkerNeeded = true;
              return;
          }
      }
-
-     /* no runnable worker found */
-     StartWorkerNeeded = false;
  }

  /*
--- 5792,5826 ----

          if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
          {
!             /* reset crash time before trying to start worker */
              rw->rw_crashed_at = 0;

              /*
!              * Try to start the worker.
!              *
!              * On failure, give up processing workers for now, but set
!              * StartWorkerNeeded so we'll come back here on the next iteration
!              * of ServerLoop to try again.  (We don't want to wait, because
!              * there might be additional ready-to-run workers.)  We could set
!              * HaveCrashedWorker as well, since this worker is now marked
!              * crashed, but there's no need because the next run of this
!              * function will do that.
               */
!             if (!do_start_bgworker(rw))
!             {
!                 StartWorkerNeeded = true;
                  return;
!             }

              /*
!              * Quit, but have ServerLoop call us again to look for additional
!              * ready-to-run workers.  There might not be any, but we'll find
!              * out the next time we run.
               */
              StartWorkerNeeded = true;
              return;
          }
      }
  }

  /*

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
pgsql-hackers by date:
From: Robert Haas
Date: 22 April 2017, 01:28:00
Subject: Re: [HACKERS] pg_dump emits ALTER TABLE ONLY partitioned_table
From: Craig Ringer
Date: 22 April 2017, 04:14:50
Subject: Re: [HACKERS] multithreading in Batch/pipelining mode for libpq
Re: [HACKERS] Unportable implementation of background worker start - Mailing list pgsql-hackers

Previous

Next