summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRory Byrne <rory@nybek.com>2015-05-28 19:41:25 +0100
committerSiri Hansen <siri@erlang.org>2016-02-03 12:14:11 +0100
commit264a5e804d6e23bf6c6916887669760cbf243caa (patch)
tree36c811242327adf2da0cde5003ac486a75f70602
parent4422cb32637a341eac2dc03172e4aa859be67f34 (diff)
downloaderlang-264a5e804d6e23bf6c6916887669760cbf243caa.tar.gz
Speed up supervisor:count_children/1; simple_one_for_one
Speed up supervisor:count_children/1 for simple_one_for_one supervisors. This is achieved by avoiding looping through all the child process and verifying that each one is alive. For a supervisor with 100,000 'temporary' children the count-time will drop from approx 25ms to about 0.005ms. For a supervisor with 100,000 'permanent' or 'transient' children the count-time will drop from approx 30ms to about 0.005ms. This avoids having the supervisor block for an extended period while the count takes place. Under normal circumstances the accuracy of the result should also improve since the duration is too short for many processes to die during the count.
-rw-r--r--lib/sasl/test/release_handler_SUITE.erl2
-rw-r--r--lib/stdlib/doc/src/supervisor.xml5
-rw-r--r--lib/stdlib/src/supervisor.erl49
-rw-r--r--lib/stdlib/test/supervisor_SUITE.erl52
-rw-r--r--lib/stdlib/test/supervisor_deadlock.erl14
5 files changed, 90 insertions, 32 deletions
diff --git a/lib/sasl/test/release_handler_SUITE.erl b/lib/sasl/test/release_handler_SUITE.erl
index d57de2593a..ee620dcdb4 100644
--- a/lib/sasl/test/release_handler_SUITE.erl
+++ b/lib/sasl/test/release_handler_SUITE.erl
@@ -1363,7 +1363,7 @@ upgrade_supervisor(Conf) when is_list(Conf) ->
%% Check that the restart strategy and child spec is updated
{status, _, {module, _}, [_, _, _, _, [_,_,{data,[{"State",State}]}]]} =
rpc:call(Node,sys,get_status,[a_sup]),
- {state,_,RestartStrategy,[Child],_,_,_,_,_,_} = State,
+ {state,_,RestartStrategy,[Child],_,_,_,_,_,_,_} = State,
one_for_all = RestartStrategy, % changed from one_for_one
{child,_,_,_,_,brutal_kill,_,_} = Child, % changed from timeout 2000
diff --git a/lib/stdlib/doc/src/supervisor.xml b/lib/stdlib/doc/src/supervisor.xml
index 24ff251ce3..815bf4a489 100644
--- a/lib/stdlib/doc/src/supervisor.xml
+++ b/lib/stdlib/doc/src/supervisor.xml
@@ -543,7 +543,10 @@
</item>
<item>
<p><c>active</c> - the count of all actively running child processes
- managed by this supervisor.</p>
+ managed by this supervisor. In the case of <c>simple_one_for_one</c>
+ supervisors, no check is carried out to ensure that each child process
+ is still alive, though the result provided here is likely to be very
+ accurate unless the supervisor is heavily overloaded.</p>
</item>
<item>
<p><c>supervisors</c> - the count of all children marked as
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl
index 6a5005b4b6..cecdebd0c8 100644
--- a/lib/stdlib/src/supervisor.erl
+++ b/lib/stdlib/src/supervisor.erl
@@ -116,6 +116,7 @@
intensity :: non_neg_integer(),
period :: pos_integer(),
restarts = [],
+ dynamic_restarts = 0 :: non_neg_integer(),
module,
args}).
-type state() :: #state{}.
@@ -518,39 +519,26 @@ handle_call(which_children, _From, State) ->
handle_call(count_children, _From, #state{children = [#child{restart_type = temporary,
child_type = CT}]} = State)
when ?is_simple(State) ->
- {Active, Count} =
- ?SETS:fold(fun(Pid, {Alive, Tot}) ->
- case is_pid(Pid) andalso is_process_alive(Pid) of
- true ->{Alive+1, Tot +1};
- false ->
- {Alive, Tot + 1}
- end
- end, {0, 0}, dynamics_db(temporary, State#state.dynamics)),
+ Sz = ?SETS:size(dynamics_db(temporary, State#state.dynamics)),
Reply = case CT of
- supervisor -> [{specs, 1}, {active, Active},
- {supervisors, Count}, {workers, 0}];
- worker -> [{specs, 1}, {active, Active},
- {supervisors, 0}, {workers, Count}]
+ supervisor -> [{specs, 1}, {active, Sz},
+ {supervisors, Sz}, {workers, 0}];
+ worker -> [{specs, 1}, {active, Sz},
+ {supervisors, 0}, {workers, Sz}]
end,
{reply, Reply, State};
-handle_call(count_children, _From, #state{children = [#child{restart_type = RType,
+handle_call(count_children, _From, #state{dynamic_restarts = Restarts,
+ children = [#child{restart_type = RType,
child_type = CT}]} = State)
when ?is_simple(State) ->
- {Active, Count} =
- ?DICTS:fold(fun(Pid, _Val, {Alive, Tot}) ->
- case is_pid(Pid) andalso is_process_alive(Pid) of
- true ->
- {Alive+1, Tot +1};
- false ->
- {Alive, Tot + 1}
- end
- end, {0, 0}, dynamics_db(RType, State#state.dynamics)),
+ Sz = ?DICTS:size(dynamics_db(RType, State#state.dynamics)),
+ Active = Sz - Restarts,
Reply = case CT of
supervisor -> [{specs, 1}, {active, Active},
- {supervisors, Count}, {workers, 0}];
+ {supervisors, Sz}, {workers, 0}];
worker -> [{specs, 1}, {active, Active},
- {supervisors, 0}, {workers, Count}]
+ {supervisors, 0}, {workers, Sz}]
end,
{reply, Reply, State};
@@ -820,8 +808,15 @@ restart(Child, State) ->
{shutdown, remove_child(Child, NState)}
end.
-restart(simple_one_for_one, Child, State) ->
+restart(simple_one_for_one, Child, State0) ->
#child{pid = OldPid, mfargs = {M, F, A}} = Child,
+ State = case OldPid of
+ ?restarting(_) ->
+ NRes = State0#state.dynamic_restarts - 1,
+ State0#state{dynamic_restarts = NRes};
+ _ ->
+ State0
+ end,
Dynamics = ?DICTS:erase(OldPid, dynamics_db(Child#child.restart_type,
State#state.dynamics)),
case do_start_child_i(M, F, A) of
@@ -832,7 +827,9 @@ restart(simple_one_for_one, Child, State) ->
NState = State#state{dynamics = ?DICTS:store(Pid, A, Dynamics)},
{ok, NState};
{error, Error} ->
- NState = State#state{dynamics = ?DICTS:store(restarting(OldPid), A,
+ NRestarts = State#state.dynamic_restarts + 1,
+ NState = State#state{dynamic_restarts = NRestarts,
+ dynamics = ?DICTS:store(restarting(OldPid), A,
Dynamics)},
report_error(start_error, Error, Child, State#state.name),
{try_again, NState}
diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl
index 8cb2a5194a..903ca76575 100644
--- a/lib/stdlib/test/supervisor_SUITE.erl
+++ b/lib/stdlib/test/supervisor_SUITE.erl
@@ -67,6 +67,7 @@
%% Misc tests
-export([child_unlink/1, tree/1, count_children/1,
+ count_restarting_children/1,
do_not_save_start_parameters_for_temporary_children/1,
do_not_save_child_specs_for_temporary_children/1,
simple_one_for_one_scale_many_temporary_children/1,
@@ -90,7 +91,8 @@ all() ->
{group, normal_termination},
{group, shutdown_termination},
{group, abnormal_termination}, child_unlink, tree,
- count_children, do_not_save_start_parameters_for_temporary_children,
+ count_children, count_restarting_children,
+ do_not_save_start_parameters_for_temporary_children,
do_not_save_child_specs_for_temporary_children,
simple_one_for_one_scale_many_temporary_children, temporary_bystander,
simple_global_supervisor, hanging_restart_loop, hanging_restart_loop_simple,
@@ -1459,6 +1461,54 @@ count_children(Config) when is_list(Config) ->
[1,0,0,0] = get_child_counts(sup_test).
%%-------------------------------------------------------------------------
+%% Test count_children when some children are restarting
+count_restarting_children(Config) when is_list(Config) ->
+ process_flag(trap_exit, true),
+ Child = {child, {supervisor_deadlock, start_child_noreg, []},
+ permanent, brutal_kill, worker, []},
+ %% 2 sek delay on failing restart (see supervisor_deadlock.erl) ->
+ %% MaxR=20, MaxT=10 should ensure a restart loop when starting and
+ %% restarting 3 instances of the child (as below)
+ {ok, SupPid} = start_link({ok, {{simple_one_for_one, 20, 10}, [Child]}}),
+
+ %% Ets table with state read by supervisor_deadlock.erl
+ ets:new(supervisor_deadlock,[set,named_table,public]),
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+
+ [1,0,0,0] = get_child_counts(SupPid),
+ {ok, Ch1_1} = supervisor:start_child(SupPid, []),
+ [1,1,0,1] = get_child_counts(SupPid),
+ {ok, Ch1_2} = supervisor:start_child(SupPid, []),
+ [1,2,0,2] = get_child_counts(SupPid),
+ {ok, Ch1_3} = supervisor:start_child(SupPid, []),
+ [1,3,0,3] = get_child_counts(SupPid),
+
+ supervisor_deadlock:restart_child(Ch1_1),
+ supervisor_deadlock:restart_child(Ch1_2),
+ supervisor_deadlock:restart_child(Ch1_3),
+ test_server:sleep(400),
+ [1,3,0,3] = get_child_counts(SupPid),
+ [Ch2_1, Ch2_2, Ch2_3] = [C || {_,C,_,_} <- supervisor:which_children(SupPid)],
+
+ ets:insert(supervisor_deadlock,{fail_start,true}),
+ supervisor_deadlock:restart_child(Ch2_1),
+ supervisor_deadlock:restart_child(Ch2_2),
+ test_server:sleep(4000), % allow restart to happen before proceeding
+ [1,1,0,3] = get_child_counts(SupPid),
+
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+ test_server:sleep(4000), % allow restart to happen before proceeding
+ [1,3,0,3] = get_child_counts(SupPid),
+
+ ok = supervisor:terminate_child(SupPid, Ch2_3),
+ [1,2,0,2] = get_child_counts(SupPid),
+ [Ch3_1, Ch3_2] = [C || {_,C,_,_} <- supervisor:which_children(SupPid)],
+ ok = supervisor:terminate_child(SupPid, Ch3_1),
+ [1,1,0,1] = get_child_counts(SupPid),
+ ok = supervisor:terminate_child(SupPid, Ch3_2),
+ [1,0,0,0] = get_child_counts(SupPid).
+
+%%-------------------------------------------------------------------------
%% Temporary children shall not be restarted so they should not save
%% start parameters, as it potentially can take up a huge amount of
%% memory for no purpose.
diff --git a/lib/stdlib/test/supervisor_deadlock.erl b/lib/stdlib/test/supervisor_deadlock.erl
index 288547a972..8d3d1c6f30 100644
--- a/lib/stdlib/test/supervisor_deadlock.erl
+++ b/lib/stdlib/test/supervisor_deadlock.erl
@@ -11,9 +11,11 @@ init([child]) ->
%% terminates immediately
{ok, []};
[{fail_start, true}] ->
- %% Restart frequency is MaxR=8, MaxT=10, so this will
- %% ensure that restart intensity is not reached -> restart
- %% loop
+ %% A restart frequency of MaxR=8, MaxT=10 should ensure
+ %% that restart intensity is not reached -> restart loop.
+ %% (Note that if we use simple_one_for_one, and start
+ %% 'many' child instances, the restart frequency must be
+ %% ajusted accordingly.)
timer:sleep(2000), % NOTE: this could be a gen_server call timeout
{stop, error}
@@ -41,5 +43,11 @@ code_change(_OldVsn, State, _Extra) ->
start_child() ->
gen_server:start_link({local, ?MODULE}, ?MODULE, [child], []).
+start_child_noreg() ->
+ gen_server:start_link(?MODULE, [child], []).
+
restart_child() ->
gen_server:cast(supervisor_deadlock, restart).
+
+restart_child(Pid) ->
+ gen_server:cast(Pid, restart).