summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Sebastien Pedron <jean-sebastien@rabbitmq.com>2014-12-09 19:14:28 +0100
committerJean-Sebastien Pedron <jean-sebastien@rabbitmq.com>2014-12-09 19:14:28 +0100
commita755fcfcd6bfd9435534de2dc07288144bdf85a9 (patch)
tree3bde494cd06a95b675af77ad49c9aaf75de940d9
parent1749f8c8e0c65a4c09df720ade033706c6d37468 (diff)
downloadrabbitmq-server-a755fcfcd6bfd9435534de2dc07288144bdf85a9.tar.gz
Autoheal: The losing leader must wait for the winner_is message
Like any other losing node, the leader must wait for the winner_is message instead of restarting immediately. The previous behaviour caused transient failures in the autoheal process if the leader was in the middle of its restart at the time the winner checked that all losing nodes were up and running.
-rw-r--r-- src/rabbit_autoheal.erl 9
1 files changed, 7 insertions, 2 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 90458741..7089911c 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -54,6 +54,10 @@
%% - we are the winner and are waiting for all losing nodes to stop
%% before telling them they can restart
%%
+%% about_to_heal
+%% - we are the leader, and have already assigned the winner and losers.
+%% We are part of the losers and we wait for the winner_is announcement.
+%%
%% {leader_waiting, OutstandingStops}
%% - we are the leader, and have already assigned the winner and losers.
%% We are neither but need to ignore further requests to autoheal.
@@ -135,7 +139,7 @@ handle_msg({request_start, Node},
true -> Continue({become_winner, Losers});
false -> send(Winner, {become_winner, Losers}), %% [0]
case lists:member(node(), Losers) of
- true -> Continue({winner_is, Winner});
+ true -> about_to_heal;
false -> {leader_waiting, Losers}
end
end
@@ -163,7 +167,8 @@ handle_msg({become_winner, Losers},
end;
handle_msg({winner_is, Winner},
- not_healing, _Partitions) ->
+ State, _Partitions)
+ when State =:= not_healing orelse State =:= about_to_heal ->
rabbit_log:warning(
"Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
rabbit_node_monitor:run_outside_applications(