diff options
author | Yann Ylavic <ylavic@apache.org> | 2022-04-12 12:08:02 +0000 |
---|---|---|
committer | Yann Ylavic <ylavic@apache.org> | 2022-04-12 12:08:02 +0000 |
commit | 4c8b180828d8c39daa7529c659cce5768feb87af (patch) | |
tree | fd2832cc76e9fa1c8e9cf5f45b2af3b3b424a05d | |
parent | dbe0033791e1943713dfbc5f988b62be1f97a6d2 (diff) | |
download | httpd-4c8b180828d8c39daa7529c659cce5768feb87af.tar.gz |
mpm_event: Fix accounting of active/total processes on ungraceful restart.
Children processes terminated by ap_{reclaim,relieve}_child_processes() were
not un-accounted for in total_daemons and active_daemons, which was done in
server_main_loop() only. This led to perform_idle_server_maintenance() thinking
it was over the limit of children processes and never creating new ones.
Have this accounting right in event_note_child_{started,stopped}() which is
called both at runtime and reload time.
* server/mpm/event/event.c(struct event_retained_data):
Rename field max_daemons_limit to max_daemon_used to better describe what
it's about and to align with AP_MPMQ_MAX_DAEMON_USED.
* server/mpm/event/event.c(event_note_child_stopped):
Renamed from event_note_child_killed() to clarify that it's not only called
when a child is killed (i.e. on restart) but whenever a child has stopped.
* server/mpm/event/event.c(event_note_child_stopped):
Move decrementing {active,total}_daemons and marking child's threads as
SERVER_DEAD from server_main_loop() so that it's done both at runtime and
reload time. Log the current number/state of daemons at APLOG_DEBUG level
for each child stopped.
* server/mpm/event/event.c(event_note_child_started):
Move incrementing {active,total}_daemons from make_child() for symmetry,
given that make_child() calls event_note_child_started(). Log the current
number/state of daemons at APLOG_DEBUG level for each child started.
* server/mpm/event/event.c(perform_idle_server_maintenance):
Fix possible miscounting of retained->max_daemon_used across the multiple
calls to perform_idle_server_maintenance() if ListenCoresBucketsRatio > 0.
Pass an int *max_daemon_used which starts at zero and is bumped consistently
for all the buckets, while retained->max_daemon_used is updated only after
all the buckets have been maintained.
* server/mpm/event/event.c(perform_idle_server_maintenance):
Use event_note_child_stopped() to handle exited children processes.
Fixes: BZ 66004
git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1899777 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | server/mpm/event/event.c | 139 |
1 files changed, 87 insertions, 52 deletions
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index b9f8e73c48..b064d80235 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -417,7 +417,7 @@ typedef struct event_retained_data { * We use this value to optimize routines that have to scan the entire * scoreboard. */ - int max_daemons_limit; + int max_daemon_used; /* * All running workers, active and shutting down, including those that @@ -678,7 +678,7 @@ static int event_query(int query_code, int *result, apr_status_t *rv) *rv = APR_SUCCESS; switch (query_code) { case AP_MPMQ_MAX_DAEMON_USED: - *result = retained->max_daemons_limit; + *result = retained->max_daemon_used; break; case AP_MPMQ_IS_THREADED: *result = AP_MPMQ_STATIC; @@ -738,14 +738,32 @@ static int event_query(int query_code, int *result, apr_status_t *rv) return OK; } -static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen) +static void event_note_child_stopped(int slot, pid_t pid, ap_generation_t gen) { - if (childnum != -1) { /* child had a scoreboard slot? */ - ap_run_child_status(ap_server_conf, - ap_scoreboard_image->parent[childnum].pid, - ap_scoreboard_image->parent[childnum].generation, - childnum, MPM_CHILD_EXITED); - ap_scoreboard_image->parent[childnum].pid = 0; + if (slot != -1) { /* child had a scoreboard slot? 
*/ + process_score *ps = &ap_scoreboard_image->parent[slot]; + int i; + + pid = ps->pid; + gen = ps->generation; + for (i = 0; i < threads_per_child; i++) { + ap_update_child_status_from_indexes(slot, i, SERVER_DEAD, NULL); + } + ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_EXITED); + if (ps->quiescing != 2) { /* vs perform_idle_server_maintenance() */ + retained->active_daemons--; + } + retained->total_daemons--; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "Child %d stopped: pid %d, gen %d, " + "active %d/%d, total %d/%d/%d, quiescing %d", + slot, (int)pid, (int)gen, + retained->active_daemons, active_daemons_limit, + retained->total_daemons, retained->max_daemon_used, + server_limit, ps->quiescing); + ps->not_accepting = 0; + ps->quiescing = 0; + ps->pid = 0; } else { ap_run_child_status(ap_server_conf, pid, gen, -1, MPM_CHILD_EXITED); @@ -755,9 +773,19 @@ static void event_note_child_killed(int childnum, pid_t pid, ap_generation_t gen static void event_note_child_started(int slot, pid_t pid) { ap_generation_t gen = retained->mpm->my_generation; + + retained->total_daemons++; + retained->active_daemons++; ap_scoreboard_image->parent[slot].pid = pid; ap_scoreboard_image->parent[slot].generation = gen; ap_run_child_status(ap_server_conf, pid, gen, slot, MPM_CHILD_STARTED); + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "Child %d started: pid %d, gen %d, " + "active %d/%d, total %d/%d/%d", + slot, (int)pid, (int)gen, + retained->active_daemons, active_daemons_limit, + retained->total_daemons, retained->max_daemon_used, + server_limit); } static const char *event_get_name(void) @@ -780,7 +808,7 @@ static void clean_child_exit(int code) } if (one_process) { - event_note_child_killed(/* slot */ 0, 0, 0); + event_note_child_stopped(/* slot */ 0, 0, 0); } exit(code); @@ -3032,8 +3060,8 @@ static int make_child(server_rec * s, int slot, int bucket) { int pid; - if (slot + 1 > retained->max_daemons_limit) { - 
retained->max_daemons_limit = slot + 1; + if (slot + 1 > retained->max_daemon_used) { + retained->max_daemon_used = slot + 1; } if (ap_scoreboard_image->parent[slot].pid != 0) { @@ -3101,11 +3129,7 @@ static int make_child(server_rec * s, int slot, int bucket) return -1; } - ap_scoreboard_image->parent[slot].quiescing = 0; - ap_scoreboard_image->parent[slot].not_accepting = 0; event_note_child_started(slot, pid); - retained->active_daemons++; - retained->total_daemons++; return 0; } @@ -3125,7 +3149,8 @@ static void startup_children(int number_to_start) } } -static void perform_idle_server_maintenance(int child_bucket) +static void perform_idle_server_maintenance(int child_bucket, + int *max_daemon_used) { int num_buckets = retained->mpm->num_buckets; int idle_thread_count = 0; @@ -3141,7 +3166,7 @@ static void perform_idle_server_maintenance(int child_bucket) /* We only care about child_bucket in this call */ continue; } - if (i >= retained->max_daemons_limit && + if (i >= retained->max_daemon_used && free_length == retained->idle_spawn_rate[child_bucket]) { /* short cut if all active processes have been examined and * enough empty scoreboard slots have been found @@ -3155,6 +3180,13 @@ static void perform_idle_server_maintenance(int child_bucket) if (ps->quiescing == 1) { ps->quiescing = 2; retained->active_daemons--; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "Child %d quiescing: pid %d, gen %d, " + "active %d/%d, total %d/%d/%d", + i, (int)ps->pid, (int)ps->generation, + retained->active_daemons, active_daemons_limit, + retained->total_daemons, retained->max_daemon_used, + server_limit); } for (j = 0; j < threads_per_child; j++) { int status = ap_scoreboard_image->servers[i][j].status; @@ -3184,7 +3216,16 @@ static void perform_idle_server_maintenance(int child_bucket) } } - retained->max_daemons_limit = last_non_dead + 1; + if (*max_daemon_used < last_non_dead + 1) { + *max_daemon_used = last_non_dead + 1; + + /* Below make_child() can grow 
retained->max_daemon_used, so + * be accurate if the one being computed is higher already. + */ + if (retained->max_daemon_used < *max_daemon_used) { + retained->max_daemon_used = *max_daemon_used; + } + } if (retained->sick_child_detected) { if (had_healthy_child) { @@ -3213,6 +3254,10 @@ static void perform_idle_server_maintenance(int child_bucket) } } + AP_DEBUG_ASSERT(retained->active_daemons <= retained->total_daemons + && retained->total_daemons <= retained->max_daemon_used + && retained->max_daemon_used <= server_limit); + if (idle_thread_count > max_spare_threads / num_buckets) { /* * Child processes that we ask to shut down won't die immediately @@ -3235,13 +3280,12 @@ static void perform_idle_server_maintenance(int child_bucket) active_daemons_limit)); ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, "%shutting down one child: " - "active daemons %d / active limit %d / " - "total daemons %d / ServerLimit %d / " - "idle threads %d / max workers %d", + "active %d/%d, total %d/%d/%d, " + "idle threads %d, max workers %d", (do_kill) ? 
"S" : "Not s", retained->active_daemons, active_daemons_limit, - retained->total_daemons, server_limit, - idle_thread_count, max_workers); + retained->total_daemons, retained->max_daemon_used, + server_limit, idle_thread_count, max_workers); if (do_kill) { ap_mpm_podx_signal(retained->buckets[child_bucket].pod, AP_MPM_PODX_GRACEFUL); @@ -3290,10 +3334,14 @@ static void perform_idle_server_maintenance(int child_bucket) else { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, "server is at active daemons limit, spawning " - "of %d children cancelled: %d/%d active, " - "rate %d", free_length, + "of %d children cancelled: active %d/%d, " + "total %d/%d/%d, rate %d", free_length, retained->active_daemons, active_daemons_limit, - retained->idle_spawn_rate[child_bucket]); + retained->total_daemons, retained->max_daemon_used, + server_limit, retained->idle_spawn_rate[child_bucket]); + /* reset the spawning rate and prevent its growth below */ + retained->idle_spawn_rate[child_bucket] = 1; + ++retained->hold_off_on_exponential_spawning; free_length = 0; } } @@ -3309,11 +3357,6 @@ static void perform_idle_server_maintenance(int child_bucket) retained->total_daemons); } for (i = 0; i < free_length; ++i) { - ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, - "Spawning new child: slot %d active / " - "total daemons: %d/%d", - free_slots[i], retained->active_daemons, - retained->total_daemons); make_child(ap_server_conf, free_slots[i], child_bucket); } /* the next time around we want to spawn twice as many if this @@ -3340,6 +3383,7 @@ static void perform_idle_server_maintenance(int child_bucket) static void server_main_loop(int remaining_children_to_start) { int num_buckets = retained->mpm->num_buckets; + int max_daemon_used = 0; int child_slot; apr_exit_why_e exitwhy; int status, processed_status; @@ -3385,19 +3429,8 @@ static void server_main_loop(int remaining_children_to_start) } /* non-fatal death... note that it's gone in the scoreboard. 
*/ if (child_slot >= 0) { - process_score *ps; - - for (i = 0; i < threads_per_child; i++) - ap_update_child_status_from_indexes(child_slot, i, - SERVER_DEAD, NULL); - - event_note_child_killed(child_slot, 0, 0); - ps = &ap_scoreboard_image->parent[child_slot]; - if (ps->quiescing != 2) - retained->active_daemons--; - ps->quiescing = 0; - /* NOTE: We don't dec in the (child_slot < 0) case! */ - retained->total_daemons--; + event_note_child_stopped(child_slot, 0, 0); + if (processed_status == APEXIT_CHILDSICK) { /* resource shortage, minimize the fork rate */ retained->idle_spawn_rate[child_slot % num_buckets] = 1; @@ -3447,9 +3480,11 @@ static void server_main_loop(int remaining_children_to_start) continue; } + max_daemon_used = 0; for (i = 0; i < num_buckets; i++) { - perform_idle_server_maintenance(i); + perform_idle_server_maintenance(i, &max_daemon_used); } + retained->max_daemon_used = max_daemon_used; } } @@ -3488,7 +3523,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) active_daemons_limit, AP_MPM_PODX_RESTART); } ap_reclaim_child_processes(1, /* Start with SIGTERM */ - event_note_child_killed); + event_note_child_stopped); } apr_pool_clear(retained->gen_pool); retained->buckets = NULL; @@ -3637,7 +3672,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) active_daemons_limit, AP_MPM_PODX_RESTART); } ap_reclaim_child_processes(1, /* Start with SIGTERM */ - event_note_child_killed); + event_note_child_stopped); if (!child_fatal) { /* cleanup pid file on normal shutdown */ @@ -3663,7 +3698,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) ap_mpm_podx_killpg(retained->buckets[i].pod, active_daemons_limit, AP_MPM_PODX_GRACEFUL); } - ap_relieve_child_processes(event_note_child_killed); + ap_relieve_child_processes(event_note_child_stopped); if (!child_fatal) { /* cleanup pid file on normal shutdown */ @@ -3685,10 +3720,10 @@ static int event_run(apr_pool_t * _pconf, 
apr_pool_t * plog, server_rec * s) apr_sleep(apr_time_from_sec(1)); /* Relieve any children which have now exited */ - ap_relieve_child_processes(event_note_child_killed); + ap_relieve_child_processes(event_note_child_stopped); active_children = 0; - for (index = 0; index < retained->max_daemons_limit; ++index) { + for (index = 0; index < retained->max_daemon_used; ++index) { if (ap_mpm_safe_kill(MPM_CHILD_PID(index), 0) == APR_SUCCESS) { active_children = 1; /* Having just one child is enough to stay around */ @@ -3706,7 +3741,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) ap_mpm_podx_killpg(retained->buckets[i].pod, active_daemons_limit, AP_MPM_PODX_RESTART); } - ap_reclaim_child_processes(1, event_note_child_killed); + ap_reclaim_child_processes(1, event_note_child_stopped); return DONE; } |