mpm_event: Fix accounting of active/total processes on ungraceful restart.

Children processes terminated by ap_{reclaim,relieve}_child_processes() were not un-accounted for in total_daemons and active_daemons, which was done in server_main_loop() only. This led to perform_idle_server_maintenance() thinking it was over the limit of children processes and never creating new ones. Have this accounting right in event_note_child_{started,stopped}() which are called both at runtime and reload time. * server/mpm/event/event.c(struct event_retained_data): Rename field max_daemons_limit to max_daemon_used to better describe what it's about and to align with AP_MPMQ_MAX_DAEMON_USED. * server/mpm/event/event.c(event_note_child_stopped): Renamed from event_note_child_killed() to clarify that it's not only called when a child is killed (i.e. on restart) but whenever a child has stopped. * server/mpm/event/event.c(event_note_child_stopped): Move decrementing {active,total}_daemons and marking child's threads as SERVER_DEAD from server_main_loop() so that it's done both at runtime and reload time. Log the current number/state of daemons at APLOG_DEBUG level for each child stopped. * server/mpm/event/event.c(event_note_child_started): Move incrementing {active,total}_daemons from make_child() for symmetry, given that make_child() calls event_note_child_started(). Log the current number/state of daemons at APLOG_DEBUG level for each child started. * server/mpm/event/event.c(perform_idle_server_maintenance): Fix possible miscounting of retained->max_daemon_used across the multiple calls to perform_idle_server_maintenance() if ListenCoresBucketsRatio > 0. Pass an int *max_daemon_used which starts at zero and is bumped consistently for all the buckets, while retained->max_daemon_used is updated only after all the buckets have been maintained. * server/mpm/event/event.c(perform_idle_server_maintenance): Use event_note_child_stopped() to handle exited children processes. 
Fixes: BZ 66004 git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1899777 13f79535-47bb-0310-9956-ffa450edef68
author: Yann Ylavic <ylavic@apache.org> 2022-04-12 12:08:02 +0000
committer: Yann Ylavic <ylavic@apache.org> 2022-04-12 12:08:02 +0000
commit: 4c8b180828d8c39daa7529c659cce5768feb87af (patch)
tree: fd2832cc76e9fa1c8e9cf5f45b2af3b3b424a05d
parent: dbe0033791e1943713dfbc5f988b62be1f97a6d2 (diff)
download: httpd-4c8b180828d8c39daa7529c659cce5768feb87af.tar.gz
1 files changed, 87 insertions, 52 deletions
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index b9f8e73c48..b064d80235 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -417,7 +417,7 @@ typedef struct event_retained_data {
      * We use this value to optimize routines that have to scan the entire
      * scoreboard.
      */
-    int max_daemons_limit;
+    int max_daemon_used;
 
     /*
      * All running workers, active and shutting down, including those that
@@ -678,7 +678,7 @@ static int event_query(int query_code, int *result, apr_status_t *rv)
     *rv = APR_SUCCESS;
     switch (query_code) {
     case AP_MPMQ_MAX_DAEMON_USED:
-        *result = retained->max_daemons_limit;
+        *result = retained->max_daemon_used;
         break;
     case AP_MPMQ_IS_THREADED:
         *result = AP_MPMQ_STATIC;
@@ -738,14 +738,32 @@ static int event_query(int query_code, int *result, apr_status_t *rv)
     return OK;
 }
 
-static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen)
+static void event_note_child_stopped(int slot, pid_t pid, ap_generation_t gen)
 {
-    if (childnum != -1) { /* child had a scoreboard slot? */
-        ap_run_child_status(ap_server_conf,
-                            ap_scoreboard_image->parent[childnum].pid,
-                            ap_scoreboard_image->parent[childnum].generation,
-                            childnum, MPM_CHILD_EXITED);
-        ap_scoreboard_image->parent[childnum].pid = 0;
+    if (slot != -1) { /* child had a scoreboard slot? */
+        process_score *ps = &ap_scoreboard_image->parent[slot];
+        int i;
+
+        pid = ps->pid;
+        gen = ps->generation;
+        for (i = 0; i < threads_per_child; i++) {
+            ap_update_child_status_from_indexes(slot, i, SERVER_DEAD, NULL);
+        }
+        ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_EXITED);
+        if (ps->quiescing != 2) { /* vs perform_idle_server_maintenance() */
+            retained->active_daemons--;
+        }
+        retained->total_daemons--;
+        ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+                     "Child %d stopped: pid %d, gen %d, "
+                     "active %d/%d, total %d/%d/%d, quiescing %d",
+                     slot, (int)pid, (int)gen,
+                     retained->active_daemons, active_daemons_limit,
+                     retained->total_daemons, retained->max_daemon_used,
+                     server_limit, ps->quiescing);
+        ps->not_accepting = 0;
+        ps->quiescing = 0;
+        ps->pid = 0;
     }
     else {
         ap_run_child_status(ap_server_conf, pid, gen, -1, MPM_CHILD_EXITED);
@@ -755,9 +773,19 @@ static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen
 static void event_note_child_started(int slot, pid_t pid)
 {
     ap_generation_t gen = retained->mpm->my_generation;
+
+    retained->total_daemons++;
+    retained->active_daemons++;
     ap_scoreboard_image->parent[slot].pid = pid;
     ap_scoreboard_image->parent[slot].generation = gen;
     ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_STARTED);
+    ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+                 "Child %d started: pid %d, gen %d, "
+                 "active %d/%d, total %d/%d/%d",
+                 slot, (int)pid, (int)gen,
+                 retained->active_daemons, active_daemons_limit,
+                 retained->total_daemons, retained->max_daemon_used,
+                 server_limit);
 }
 
 static const char *event_get_name(void)
@@ -780,7 +808,7 @@ static void clean_child_exit(int code)
     }
 
     if (one_process) {
-        event_note_child_killed(/* slot */ 0, 0, 0);
+        event_note_child_stopped(/* slot */ 0, 0, 0);
     }
 
     exit(code);
@@ -3032,8 +3060,8 @@ static int make_child(server_rec * s, int slot, int bucket)
 {
     int pid;
 
-    if (slot + 1 > retained->max_daemons_limit) {
-        retained->max_daemons_limit = slot + 1;
+    if (slot + 1 > retained->max_daemon_used) {
+        retained->max_daemon_used = slot + 1;
     }
 
     if (ap_scoreboard_image->parent[slot].pid != 0) {
@@ -3101,11 +3129,7 @@ static int make_child(server_rec * s, int slot, int bucket)
         return -1;
     }
 
-    ap_scoreboard_image->parent[slot].quiescing = 0;
-    ap_scoreboard_image->parent[slot].not_accepting = 0;
     event_note_child_started(slot, pid);
-    retained->active_daemons++;
-    retained->total_daemons++;
     return 0;
 }
 
@@ -3125,7 +3149,8 @@ static void startup_children(int number_to_start)
     }
 }
 
-static void perform_idle_server_maintenance(int child_bucket)
+static void perform_idle_server_maintenance(int child_bucket,
+                                            int *max_daemon_used)
 {
     int num_buckets = retained->mpm->num_buckets;
     int idle_thread_count = 0;
@@ -3141,7 +3166,7 @@ static void perform_idle_server_maintenance(int child_bucket)
             /* We only care about child_bucket in this call */
             continue;
         }
-        if (i >= retained->max_daemons_limit &&
+        if (i >= retained->max_daemon_used &&
             free_length == retained->idle_spawn_rate[child_bucket]) {
             /* short cut if all active processes have been examined and
              * enough empty scoreboard slots have been found
@@ -3155,6 +3180,13 @@ static void perform_idle_server_maintenance(int child_bucket)
             if (ps->quiescing == 1) {
                 ps->quiescing = 2;
                 retained->active_daemons--;
+                ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+                             "Child %d quiescing: pid %d, gen %d, "
+                             "active %d/%d, total %d/%d/%d",
+                             i, (int)ps->pid, (int)ps->generation,
+                             retained->active_daemons, active_daemons_limit,
+                             retained->total_daemons, retained->max_daemon_used,
+                             server_limit);
             }
             for (j = 0; j < threads_per_child; j++) {
                 int status = ap_scoreboard_image->servers[i][j].status;
@@ -3184,7 +3216,16 @@ static void perform_idle_server_maintenance(int child_bucket)
         }
     }
 
-    retained->max_daemons_limit = last_non_dead + 1;
+    if (*max_daemon_used < last_non_dead + 1) {
+        *max_daemon_used = last_non_dead + 1;
+
+        /* Below make_child() can grow retained->max_daemon_used, so
+         * be accurate if the one being computed is higher already.
+         */
+        if (retained->max_daemon_used < *max_daemon_used) {
+            retained->max_daemon_used = *max_daemon_used;
+        }
+    }
 
     if (retained->sick_child_detected) {
         if (had_healthy_child) {
@@ -3213,6 +3254,10 @@ static void perform_idle_server_maintenance(int child_bucket)
         }
     }
 
+    AP_DEBUG_ASSERT(retained->active_daemons <= retained->total_daemons
+                    && retained->total_daemons <= retained->max_daemon_used
+                    && retained->max_daemon_used <= server_limit);
+
     if (idle_thread_count > max_spare_threads / num_buckets) {
         /*
          * Child processes that we ask to shut down won't die immediately
@@ -3235,13 +3280,12 @@ static void perform_idle_server_maintenance(int child_bucket)
                            active_daemons_limit));
         ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
                      "%shutting down one child: "
-                     "active daemons %d / active limit %d / "
-                     "total daemons %d / ServerLimit %d / "
-                     "idle threads %d / max workers %d",
+                     "active %d/%d, total %d/%d/%d, "
+                     "idle threads %d, max workers %d",
                      (do_kill) ? "S" : "Not s",
                      retained->active_daemons, active_daemons_limit,
-                     retained->total_daemons, server_limit,
-                     idle_thread_count, max_workers);
+                     retained->total_daemons, retained->max_daemon_used,
+                     server_limit, idle_thread_count, max_workers);
         if (do_kill) {
             ap_mpm_podx_signal(retained->buckets[child_bucket].pod,
                                AP_MPM_PODX_GRACEFUL);
@@ -3290,10 +3334,14 @@ static void perform_idle_server_maintenance(int child_bucket)
                 else {
                     ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
                                  "server is at active daemons limit, spawning "
-                                 "of %d children cancelled: %d/%d active, "
-                                 "rate %d", free_length,
+                                 "of %d children cancelled: active %d/%d, "
+                                 "total %d/%d/%d, rate %d", free_length,
                                  retained->active_daemons, active_daemons_limit,
-                                 retained->idle_spawn_rate[child_bucket]);
+                                 retained->total_daemons, retained->max_daemon_used,
+                                 server_limit, retained->idle_spawn_rate[child_bucket]);
+                    /* reset the spawning rate and prevent its growth below */
+                    retained->idle_spawn_rate[child_bucket] = 1;
+                    ++retained->hold_off_on_exponential_spawning;
                     free_length = 0;
                 }
             }
@@ -3309,11 +3357,6 @@ static void perform_idle_server_maintenance(int child_bucket)
                              retained->total_daemons);
             }
             for (i = 0; i < free_length; ++i) {
-                ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
-                             "Spawning new child: slot %d active / "
-                             "total daemons: %d/%d",
-                             free_slots[i], retained->active_daemons,
-                             retained->total_daemons);
                 make_child(ap_server_conf, free_slots[i], child_bucket);
             }
             /* the next time around we want to spawn twice as many if this
@@ -3340,6 +3383,7 @@ static void perform_idle_server_maintenance(int child_bucket)
 static void server_main_loop(int remaining_children_to_start)
 {
     int num_buckets = retained->mpm->num_buckets;
+    int max_daemon_used = 0;
     int child_slot;
     apr_exit_why_e exitwhy;
     int status, processed_status;
@@ -3385,19 +3429,8 @@ static void server_main_loop(int remaining_children_to_start)
             }
             /* non-fatal death... note that it's gone in the scoreboard. */
             if (child_slot >= 0) {
-                process_score *ps;
-
-                for (i = 0; i < threads_per_child; i++)
-                    ap_update_child_status_from_indexes(child_slot, i,
-                                                        SERVER_DEAD, NULL);
-
-                event_note_child_killed(child_slot, 0, 0);
-                ps = &ap_scoreboard_image->parent[child_slot];
-                if (ps->quiescing != 2)
-                    retained->active_daemons--;
-                ps->quiescing = 0;
-                /* NOTE: We don't dec in the (child_slot < 0) case! */
-                retained->total_daemons--;
+                event_note_child_stopped(child_slot, 0, 0);
+
                 if (processed_status == APEXIT_CHILDSICK) {
                     /* resource shortage, minimize the fork rate */
                     retained->idle_spawn_rate[child_slot % num_buckets] = 1;
@@ -3447,9 +3480,11 @@ static void server_main_loop(int remaining_children_to_start)
             continue;
         }
 
+        max_daemon_used = 0;
         for (i = 0; i < num_buckets; i++) {
-            perform_idle_server_maintenance(i);
+            perform_idle_server_maintenance(i, &max_daemon_used);
         }
+        retained->max_daemon_used = max_daemon_used;
     }
 }
 
@@ -3488,7 +3523,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
                                    active_daemons_limit, AP_MPM_PODX_RESTART);
             }
             ap_reclaim_child_processes(1,  /* Start with SIGTERM */
-                                       event_note_child_killed);
+                                       event_note_child_stopped);
         }
         apr_pool_clear(retained->gen_pool);
         retained->buckets = NULL;
@@ -3637,7 +3672,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
                                active_daemons_limit, AP_MPM_PODX_RESTART);
         }
         ap_reclaim_child_processes(1, /* Start with SIGTERM */
-                                   event_note_child_killed);
+                                   event_note_child_stopped);
 
         if (!child_fatal) {
             /* cleanup pid file on normal shutdown */
@@ -3663,7 +3698,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
             ap_mpm_podx_killpg(retained->buckets[i].pod,
                                active_daemons_limit, AP_MPM_PODX_GRACEFUL);
         }
-        ap_relieve_child_processes(event_note_child_killed);
+        ap_relieve_child_processes(event_note_child_stopped);
 
         if (!child_fatal) {
             /* cleanup pid file on normal shutdown */
@@ -3685,10 +3720,10 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
             apr_sleep(apr_time_from_sec(1));
 
             /* Relieve any children which have now exited */
-            ap_relieve_child_processes(event_note_child_killed);
+            ap_relieve_child_processes(event_note_child_stopped);
 
             active_children = 0;
-            for (index = 0; index < retained->max_daemons_limit; ++index) {
+            for (index = 0; index < retained->max_daemon_used; ++index) {
                 if (ap_mpm_safe_kill(MPM_CHILD_PID(index), 0) == APR_SUCCESS) {
                     active_children = 1;
                     /* Having just one child is enough to stay around */
@@ -3706,7 +3741,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
             ap_mpm_podx_killpg(retained->buckets[i].pod,
                                active_daemons_limit, AP_MPM_PODX_RESTART);
         }
-        ap_reclaim_child_processes(1, event_note_child_killed);
+        ap_reclaim_child_processes(1, event_note_child_stopped);
 
         return DONE;
     }
author	Yann Ylavic <ylavic@apache.org>	2022-04-12 12:08:02 +0000
committer	Yann Ylavic <ylavic@apache.org>	2022-04-12 12:08:02 +0000
commit	4c8b180828d8c39daa7529c659cce5768feb87af (patch)
tree	fd2832cc76e9fa1c8e9cf5f45b2af3b3b424a05d
parent	dbe0033791e1943713dfbc5f988b62be1f97a6d2 (diff)
download	httpd-4c8b180828d8c39daa7529c659cce5768feb87af.tar.gz