From adf6677168f7ecdf92020cddb5d702a7b5620bf8 Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Wed, 17 Apr 2013 16:55:39 +0100 Subject: Rename states to hopefully be clearer; add more comments. --- src/rabbit_autoheal.erl | 54 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 4a6307fc..fc3ca1e8 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -46,6 +46,28 @@ %% not announce its decision (and thus cue other nodes to restart) %% until it has seen a request from every node that has experienced a %% partition. +%% +%% Possible states: +%% +%% not_healing +%% - the default +%% +%% {leader_wait_for_winner_requests, OutstandingRequests, Notify} +%% - we are the leader and are waiting to hear requests from all +%% other partitioned nodes +%% +%% wait_for_winner +%% - we are not the leader and are waiting to see what it has to say +%% +%% {winner_wait_for_stops, OutstandingStops, Notify} +%% - we are the winner and are waiting for all losing nodes to stop +%% before telling them they can restart +%% +%% restarting +%% - we are restarting. Of course the node monitor immediately dies +%% then so this state does not last long. We therefore send the +%% autoheal_safe_to_start message to the rabbit_outside_app_process +%% instead. %%---------------------------------------------------------------------------- @@ -68,7 +90,7 @@ maybe_start(State) -> enabled() -> {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling). -node_down(_Node, {wait_for, _Nodes, _Notify} = Autoheal) -> +node_down(_Node, {winner_wait_for_stops, _Nodes, _Notify} = Autoheal) -> Autoheal; node_down(_Node, not_healing) -> not_healing; @@ -76,6 +98,7 @@ node_down(Node, _State) -> rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), not_healing. +%% By receiving this message we become the leader handle_msg({request_winner, Node}, not_healing, Partitions) -> case rabbit_node_monitor:all_nodes_up() of @@ -97,33 +120,38 @@ handle_msg({request_winner, Node}, "Autoheal leader start; partitioned nodes are ~p~n", [Partitioned1]), handle_msg({request_winner, Node}, - {wait_for_winner_reqs, Partitioned1, Partitioned1}, + {leader_wait_for_winner_requests, + Partitioned1, Partitioned1}, Partitions) end; +%% This is the leader receiving its last winner request - all +%% partitioned nodes have checked in handle_msg({request_winner, Node}, - {wait_for_winner_reqs, [Node], Notify}, Partitions) -> + {leader_wait_for_winner_requests, [Node], Notify}, Partitions) -> AllPartitions = all_partitions(Partitions), Winner = select_winner(AllPartitions), rabbit_log:info("Autoheal request winner from ~p~n" " Partitions were determined to be ~p~n" " Winner is ~p~n", [Node, AllPartitions, Winner]), - [send(N, {winner, Winner}) || N <- Notify], + [send(N, {winner_is, Winner}) || N <- Notify], wait_for_winner; +%% This is the leader receiving any other winner request handle_msg({request_winner, Node}, - {wait_for_winner_reqs, Nodes, Notify}, _Partitions) -> + {leader_wait_for_winner_requests, Nodes, Notify}, _Partitions) -> rabbit_log:info("Autoheal request winner from ~p~n", [Node]), - {wait_for_winner_reqs, Nodes -- [Node], Notify}; + {leader_wait_for_winner_requests, Nodes -- [Node], Notify}; -handle_msg({winner, Winner}, +handle_msg({winner_is, Winner}, wait_for_winner, Partitions) -> case lists:member(Winner, Partitions) of false -> case node() of Winner -> rabbit_log:info( "Autoheal: waiting for nodes to stop: ~p~n", [Partitions]), - {wait_for, Partitions, Partitions}; + {winner_wait_for_stops, + Partitions, Partitions}; _ -> rabbit_log:info( "Autoheal: nothing to do~n", []), not_healing @@ -132,19 +160,21 @@ handle_msg({winner, Winner}, restarting end; -handle_msg({winner, _Winner}, State, _Partitions) -> +handle_msg({winner_is, _Winner}, State, _Partitions) -> %% ignore, we already cancelled the autoheal process State; +%% This is the winner receiving its last notification that a node has +%% stopped - all nodes can now start again handle_msg({node_stopped, Node}, - {wait_for, [Node], Notify}, _Partitions) -> + {winner_wait_for_stops, [Node], Notify}, _Partitions) -> rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]), [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], not_healing; handle_msg({node_stopped, Node}, - {wait_for, WaitFor, Notify}, _Partitions) -> - {wait_for, WaitFor -- [Node], Notify}; + {winner_wait_for_stops, WaitFor, Notify}, _Partitions) -> + {winner_wait_for_stops, WaitFor -- [Node], Notify}; handle_msg({node_stopped, _Node}, State, _Partitions) -> %% ignore, we already cancelled the autoheal process -- cgit v1.2.1