summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYann Ylavic <ylavic@apache.org>2022-04-12 12:08:02 +0000
committerYann Ylavic <ylavic@apache.org>2022-04-12 12:08:02 +0000
commit4c8b180828d8c39daa7529c659cce5768feb87af (patch)
treefd2832cc76e9fa1c8e9cf5f45b2af3b3b424a05d
parentdbe0033791e1943713dfbc5f988b62be1f97a6d2 (diff)
downloadhttpd-4c8b180828d8c39daa7529c659cce5768feb87af.tar.gz
mpm_event: Fix accounting of active/total processes on ungraceful restart.
Children processes terminated by ap_{reclaim,relieve}_child_processes() were not un-accounted for total_daemons and active_daemons, which was done in server_main_loop() only. This led to perform_idle_server_maintenance() thinking it was over the limit of children processes and never creating new ones. Have this accounting right in event_note_child_{started,stopped}() which is called both at runtime and reload time. * server/mpm/event/event.c(struct event_retained_data): Rename field max_daemons_limit to max_daemon_used to better describe what it's about and to align with AP_MPMQ_MAX_DAEMON_USED. * server/mpm/event/event.c(event_note_child_stopped): Renamed from event_note_child_killed() to clarify that it's not only called when a child is killed (i.e. on restart) but whenever a child has stopped. * server/mpm/event/event.c(event_note_child_stopped): Move decrementing {active,total}_daemons and marking child's threads as SERVER_DEAD from server_main_loop() so that it's done both at runtime and reload time. Log the current number/state of daemons at APLOG_DEBUG level for each child stopped. * server/mpm/event/event.c(event_note_child_started): Move incrementing {active,total}_daemons from make_child() for symmetry, given that make_child() calls event_note_child_started(). Log the current number/state of daemons at APLOG_DEBUG level for each child started. * server/mpm/event/event.c(perform_idle_server_maintenance): Fix possible miscounting of retained->max_daemon_used across the multiple calls to perform_idle_server_maintenance() if ListenCoresBucketsRatio > 0. Pass an int *max_daemon_used which starts at zero and is bumped consistently for all the buckets, while retained->max_daemon_used is updated only after all the buckets have been maintained. * server/mpm/event/event.c(perform_idle_server_maintenance): Use event_note_child_stopped() to handle exited children processes. 
Fixes: BZ 66004 git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1899777 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--server/mpm/event/event.c139
1 files changed, 87 insertions, 52 deletions
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index b9f8e73c48..b064d80235 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -417,7 +417,7 @@ typedef struct event_retained_data {
* We use this value to optimize routines that have to scan the entire
* scoreboard.
*/
- int max_daemons_limit;
+ int max_daemon_used;
/*
* All running workers, active and shutting down, including those that
@@ -678,7 +678,7 @@ static int event_query(int query_code, int *result, apr_status_t *rv)
*rv = APR_SUCCESS;
switch (query_code) {
case AP_MPMQ_MAX_DAEMON_USED:
- *result = retained->max_daemons_limit;
+ *result = retained->max_daemon_used;
break;
case AP_MPMQ_IS_THREADED:
*result = AP_MPMQ_STATIC;
@@ -738,14 +738,32 @@ static int event_query(int query_code, int *result, apr_status_t *rv)
return OK;
}
-static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen)
+static void event_note_child_stopped(int slot, pid_t pid, ap_generation_t gen)
{
- if (childnum != -1) { /* child had a scoreboard slot? */
- ap_run_child_status(ap_server_conf,
- ap_scoreboard_image->parent[childnum].pid,
- ap_scoreboard_image->parent[childnum].generation,
- childnum, MPM_CHILD_EXITED);
- ap_scoreboard_image->parent[childnum].pid = 0;
+ if (slot != -1) { /* child had a scoreboard slot? */
+ process_score *ps = &ap_scoreboard_image->parent[slot];
+ int i;
+
+ pid = ps->pid;
+ gen = ps->generation;
+ for (i = 0; i < threads_per_child; i++) {
+ ap_update_child_status_from_indexes(slot, i, SERVER_DEAD, NULL);
+ }
+ ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_EXITED);
+ if (ps->quiescing != 2) { /* vs perform_idle_server_maintenance() */
+ retained->active_daemons--;
+ }
+ retained->total_daemons--;
+ ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+ "Child %d stopped: pid %d, gen %d, "
+ "active %d/%d, total %d/%d/%d, quiescing %d",
+ slot, (int)pid, (int)gen,
+ retained->active_daemons, active_daemons_limit,
+ retained->total_daemons, retained->max_daemon_used,
+ server_limit, ps->quiescing);
+ ps->not_accepting = 0;
+ ps->quiescing = 0;
+ ps->pid = 0;
}
else {
ap_run_child_status(ap_server_conf, pid, gen, -1, MPM_CHILD_EXITED);
@@ -755,9 +773,19 @@ static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen
static void event_note_child_started(int slot, pid_t pid)
{
ap_generation_t gen = retained->mpm->my_generation;
+
+ retained->total_daemons++;
+ retained->active_daemons++;
ap_scoreboard_image->parent[slot].pid = pid;
ap_scoreboard_image->parent[slot].generation = gen;
ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_STARTED);
+ ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+ "Child %d started: pid %d, gen %d, "
+ "active %d/%d, total %d/%d/%d",
+ slot, (int)pid, (int)gen,
+ retained->active_daemons, active_daemons_limit,
+ retained->total_daemons, retained->max_daemon_used,
+ server_limit);
}
static const char *event_get_name(void)
@@ -780,7 +808,7 @@ static void clean_child_exit(int code)
}
if (one_process) {
- event_note_child_killed(/* slot */ 0, 0, 0);
+ event_note_child_stopped(/* slot */ 0, 0, 0);
}
exit(code);
@@ -3032,8 +3060,8 @@ static int make_child(server_rec * s, int slot, int bucket)
{
int pid;
- if (slot + 1 > retained->max_daemons_limit) {
- retained->max_daemons_limit = slot + 1;
+ if (slot + 1 > retained->max_daemon_used) {
+ retained->max_daemon_used = slot + 1;
}
if (ap_scoreboard_image->parent[slot].pid != 0) {
@@ -3101,11 +3129,7 @@ static int make_child(server_rec * s, int slot, int bucket)
return -1;
}
- ap_scoreboard_image->parent[slot].quiescing = 0;
- ap_scoreboard_image->parent[slot].not_accepting = 0;
event_note_child_started(slot, pid);
- retained->active_daemons++;
- retained->total_daemons++;
return 0;
}
@@ -3125,7 +3149,8 @@ static void startup_children(int number_to_start)
}
}
-static void perform_idle_server_maintenance(int child_bucket)
+static void perform_idle_server_maintenance(int child_bucket,
+ int *max_daemon_used)
{
int num_buckets = retained->mpm->num_buckets;
int idle_thread_count = 0;
@@ -3141,7 +3166,7 @@ static void perform_idle_server_maintenance(int child_bucket)
/* We only care about child_bucket in this call */
continue;
}
- if (i >= retained->max_daemons_limit &&
+ if (i >= retained->max_daemon_used &&
free_length == retained->idle_spawn_rate[child_bucket]) {
/* short cut if all active processes have been examined and
* enough empty scoreboard slots have been found
@@ -3155,6 +3180,13 @@ static void perform_idle_server_maintenance(int child_bucket)
if (ps->quiescing == 1) {
ps->quiescing = 2;
retained->active_daemons--;
+ ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+ "Child %d quiescing: pid %d, gen %d, "
+ "active %d/%d, total %d/%d/%d",
+ i, (int)ps->pid, (int)ps->generation,
+ retained->active_daemons, active_daemons_limit,
+ retained->total_daemons, retained->max_daemon_used,
+ server_limit);
}
for (j = 0; j < threads_per_child; j++) {
int status = ap_scoreboard_image->servers[i][j].status;
@@ -3184,7 +3216,16 @@ static void perform_idle_server_maintenance(int child_bucket)
}
}
- retained->max_daemons_limit = last_non_dead + 1;
+ if (*max_daemon_used < last_non_dead + 1) {
+ *max_daemon_used = last_non_dead + 1;
+
+ /* Below make_child() can grow retained->max_daemon_used, so
+ * be accurate if the one being computed is higher already.
+ */
+ if (retained->max_daemon_used < *max_daemon_used) {
+ retained->max_daemon_used = *max_daemon_used;
+ }
+ }
if (retained->sick_child_detected) {
if (had_healthy_child) {
@@ -3213,6 +3254,10 @@ static void perform_idle_server_maintenance(int child_bucket)
}
}
+ AP_DEBUG_ASSERT(retained->active_daemons <= retained->total_daemons
+ && retained->total_daemons <= retained->max_daemon_used
+ && retained->max_daemon_used <= server_limit);
+
if (idle_thread_count > max_spare_threads / num_buckets) {
/*
* Child processes that we ask to shut down won't die immediately
@@ -3235,13 +3280,12 @@ static void perform_idle_server_maintenance(int child_bucket)
active_daemons_limit));
ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
"%shutting down one child: "
- "active daemons %d / active limit %d / "
- "total daemons %d / ServerLimit %d / "
- "idle threads %d / max workers %d",
+ "active %d/%d, total %d/%d/%d, "
+ "idle threads %d, max workers %d",
(do_kill) ? "S" : "Not s",
retained->active_daemons, active_daemons_limit,
- retained->total_daemons, server_limit,
- idle_thread_count, max_workers);
+ retained->total_daemons, retained->max_daemon_used,
+ server_limit, idle_thread_count, max_workers);
if (do_kill) {
ap_mpm_podx_signal(retained->buckets[child_bucket].pod,
AP_MPM_PODX_GRACEFUL);
@@ -3290,10 +3334,14 @@ static void perform_idle_server_maintenance(int child_bucket)
else {
ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
"server is at active daemons limit, spawning "
- "of %d children cancelled: %d/%d active, "
- "rate %d", free_length,
+ "of %d children cancelled: active %d/%d, "
+ "total %d/%d/%d, rate %d", free_length,
retained->active_daemons, active_daemons_limit,
- retained->idle_spawn_rate[child_bucket]);
+ retained->total_daemons, retained->max_daemon_used,
+ server_limit, retained->idle_spawn_rate[child_bucket]);
+ /* reset the spawning rate and prevent its growth below */
+ retained->idle_spawn_rate[child_bucket] = 1;
+ ++retained->hold_off_on_exponential_spawning;
free_length = 0;
}
}
@@ -3309,11 +3357,6 @@ static void perform_idle_server_maintenance(int child_bucket)
retained->total_daemons);
}
for (i = 0; i < free_length; ++i) {
- ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
- "Spawning new child: slot %d active / "
- "total daemons: %d/%d",
- free_slots[i], retained->active_daemons,
- retained->total_daemons);
make_child(ap_server_conf, free_slots[i], child_bucket);
}
/* the next time around we want to spawn twice as many if this
@@ -3340,6 +3383,7 @@ static void perform_idle_server_maintenance(int child_bucket)
static void server_main_loop(int remaining_children_to_start)
{
int num_buckets = retained->mpm->num_buckets;
+ int max_daemon_used = 0;
int child_slot;
apr_exit_why_e exitwhy;
int status, processed_status;
@@ -3385,19 +3429,8 @@ static void server_main_loop(int remaining_children_to_start)
}
/* non-fatal death... note that it's gone in the scoreboard. */
if (child_slot >= 0) {
- process_score *ps;
-
- for (i = 0; i < threads_per_child; i++)
- ap_update_child_status_from_indexes(child_slot, i,
- SERVER_DEAD, NULL);
-
- event_note_child_killed(child_slot, 0, 0);
- ps = &ap_scoreboard_image->parent[child_slot];
- if (ps->quiescing != 2)
- retained->active_daemons--;
- ps->quiescing = 0;
- /* NOTE: We don't dec in the (child_slot < 0) case! */
- retained->total_daemons--;
+ event_note_child_stopped(child_slot, 0, 0);
+
if (processed_status == APEXIT_CHILDSICK) {
/* resource shortage, minimize the fork rate */
retained->idle_spawn_rate[child_slot % num_buckets] = 1;
@@ -3447,9 +3480,11 @@ static void server_main_loop(int remaining_children_to_start)
continue;
}
+ max_daemon_used = 0;
for (i = 0; i < num_buckets; i++) {
- perform_idle_server_maintenance(i);
+ perform_idle_server_maintenance(i, &max_daemon_used);
}
+ retained->max_daemon_used = max_daemon_used;
}
}
@@ -3488,7 +3523,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
active_daemons_limit, AP_MPM_PODX_RESTART);
}
ap_reclaim_child_processes(1, /* Start with SIGTERM */
- event_note_child_killed);
+ event_note_child_stopped);
}
apr_pool_clear(retained->gen_pool);
retained->buckets = NULL;
@@ -3637,7 +3672,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
active_daemons_limit, AP_MPM_PODX_RESTART);
}
ap_reclaim_child_processes(1, /* Start with SIGTERM */
- event_note_child_killed);
+ event_note_child_stopped);
if (!child_fatal) {
/* cleanup pid file on normal shutdown */
@@ -3663,7 +3698,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
ap_mpm_podx_killpg(retained->buckets[i].pod,
active_daemons_limit, AP_MPM_PODX_GRACEFUL);
}
- ap_relieve_child_processes(event_note_child_killed);
+ ap_relieve_child_processes(event_note_child_stopped);
if (!child_fatal) {
/* cleanup pid file on normal shutdown */
@@ -3685,10 +3720,10 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
apr_sleep(apr_time_from_sec(1));
/* Relieve any children which have now exited */
- ap_relieve_child_processes(event_note_child_killed);
+ ap_relieve_child_processes(event_note_child_stopped);
active_children = 0;
- for (index = 0; index < retained->max_daemons_limit; ++index) {
+ for (index = 0; index < retained->max_daemon_used; ++index) {
if (ap_mpm_safe_kill(MPM_CHILD_PID(index), 0) == APR_SUCCESS) {
active_children = 1;
/* Having just one child is enough to stay around */
@@ -3706,7 +3741,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s)
ap_mpm_podx_killpg(retained->buckets[i].pod,
active_daemons_limit, AP_MPM_PODX_RESTART);
}
- ap_reclaim_child_processes(1, event_note_child_killed);
+ ap_reclaim_child_processes(1, event_note_child_stopped);
return DONE;
}