diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-10-14 15:21:24 +0100 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-10-14 15:21:24 +0100 |
commit | 5b8dbdb7f1f1265802324e6a56923a21f833e576 (patch) | |
tree | fd7f61ce60c063f3a283dfc992f74b20a88689d4 | |
parent | 6178f678ef85a7a94bc00368d36343163703dea1 (diff) | |
download | rabbitmq-server-5b8dbdb7f1f1265802324e6a56923a21f833e576.tar.gz |
Check whether the cluster is fully connected before trying to autoheal, and ignore autoheal requests if it isn't.
-rw-r--r-- | src/rabbit_autoheal.erl | 66 | ||||
-rw-r--r-- | src/rabbit_node_monitor.erl | 10 |
2 files changed, 50 insertions, 26 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 259e6ec2..fd592ea3 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -117,27 +117,28 @@ node_down(Node, _State) -> handle_msg({request_start, Node}, not_healing, Partitions) -> rabbit_log:info("Autoheal request received from ~p~n", [Node]), - rabbit_node_monitor:ping_all(), - case rabbit_node_monitor:all_rabbit_nodes_up() of - false -> not_healing; - true -> AllPartitions = all_partitions(Partitions), - {Winner, Losers} = make_decision(AllPartitions), - rabbit_log:info("Autoheal decision~n" - " * Partitions: ~p~n" - " * Winner: ~p~n" - " * Losers: ~p~n", - [AllPartitions, Winner, Losers]), - Continue = fun(Msg) -> - handle_msg(Msg, not_healing, Partitions) - end, - case node() =:= Winner of - true -> Continue({become_winner, Losers}); - false -> send(Winner, {become_winner, Losers}), %% [0] - case lists:member(node(), Losers) of - true -> Continue({winner_is, Winner}); - false -> {leader_waiting, Losers} - end - end + case check_other_nodes(Partitions) of + {error, E} -> + rabbit_log:info("Autoheal request denied: ~s~n", [fmt_error(E)]), + not_healing; + {ok, AllPartitions} -> + {Winner, Losers} = make_decision(AllPartitions), + rabbit_log:info("Autoheal decision~n" + " * Partitions: ~p~n" + " * Winner: ~p~n" + " * Losers: ~p~n", + [AllPartitions, Winner, Losers]), + Continue = fun(Msg) -> + handle_msg(Msg, not_healing, Partitions) + end, + case node() =:= Winner of + true -> Continue({become_winner, Losers}); + false -> send(Winner, {become_winner, Losers}), %% [0] + case lists:member(node(), Losers) of + true -> Continue({winner_is, Winner}); + false -> {leader_waiting, Losers} + end + end end; %% [0] If we are a loser we will never receive this message - but it %% won't stick in the mailbox as we are restarting anyway @@ -211,11 +212,21 @@ partition_value(Partition) -> %% We have our local understanding of what partitions exist; but we %% only know which nodes we have been partitioned from, not which %% nodes are partitioned from each other. -all_partitions(PartitionedWith) -> +check_other_nodes(LocalPartitions) -> Nodes = rabbit_mnesia:cluster_nodes(all), - Partitions = [{node(), PartitionedWith} | - rabbit_node_monitor:partitions(Nodes -- [node()])], - all_partitions(Partitions, [Nodes]). + {Results, Bad} = rabbit_node_monitor:status(Nodes -- [node()]), + RemotePartitions = [{Node, proplists:get_value(partitions, Res)} + || {Node, Res} <- Results], + RemoteDown = [{Node, Down} + || {Node, Res} <- Results, + Down <- [Nodes -- proplists:get_value(nodes, Res)], + Down =/= []], + case {Bad, RemoteDown} of + {[], []} -> Partitions = [{node(), LocalPartitions} | RemotePartitions], + {ok, all_partitions(Partitions, [Nodes])}; + {[], _} -> {error, {remote_down, RemoteDown}}; + {_, _} -> {error, {nodes_down, Bad}} + end. all_partitions([], Partitions) -> Partitions; @@ -230,3 +241,8 @@ all_partitions([{Node, CantSee} | Rest], Partitions) -> _ -> [A, B | Others] end, all_partitions(Rest, Partitions1). + +fmt_error({remote_down, RemoteDown}) -> + rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]); +fmt_error({nodes_down, NodesDown}) -> + rabbit_misc:format("Local nodes down: ~p", [NodesDown]). diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index a948115d..02aac125 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -24,7 +24,7 @@ write_cluster_status/1, read_cluster_status/0, update_cluster_status/0, reset_cluster_status/0]). -export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]). --export([partitions/0, partitions/1, subscribe/1]). +-export([partitions/0, partitions/1, status/1, subscribe/1]). -export([pause_minority_guard/0]). %% gen_server callbacks @@ -62,6 +62,7 @@ -spec(partitions/0 :: () -> [node()]). -spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]). +-spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}). -spec(subscribe/1 :: (pid()) -> 'ok'). -spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing'). @@ -186,6 +187,9 @@ partitions(Nodes) -> {Replies, _} = gen_server:multi_call(Nodes, ?SERVER, partitions, infinity), Replies. +status(Nodes) -> + gen_server:multi_call(Nodes, ?SERVER, status, infinity). + subscribe(Pid) -> gen_server:cast(?SERVER, {subscribe, Pid}). @@ -252,6 +256,10 @@ init([]) -> handle_call(partitions, _From, State = #state{partitions = Partitions}) -> {reply, Partitions, State}; +handle_call(status, _From, State = #state{partitions = Partitions}) -> + {reply, [{partitions, Partitions}, + {nodes, [node() | nodes()]}], State}; + handle_call(_Request, _From, State) -> {noreply, State}. |