summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2014-10-14 15:21:24 +0100
committerSimon MacMullen <simon@rabbitmq.com>2014-10-14 15:21:24 +0100
commit5b8dbdb7f1f1265802324e6a56923a21f833e576 (patch)
treefd7f61ce60c063f3a283dfc992f74b20a88689d4
parent6178f678ef85a7a94bc00368d36343163703dea1 (diff)
downloadrabbitmq-server-5b8dbdb7f1f1265802324e6a56923a21f833e576.tar.gz
Check whether the cluster is fully connected before trying to autoheal, and ignore autoheal requests if it isn't.
-rw-r--r--src/rabbit_autoheal.erl66
-rw-r--r--src/rabbit_node_monitor.erl10
2 files changed, 50 insertions, 26 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 259e6ec2..fd592ea3 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -117,27 +117,28 @@ node_down(Node, _State) ->
handle_msg({request_start, Node},
not_healing, Partitions) ->
rabbit_log:info("Autoheal request received from ~p~n", [Node]),
- rabbit_node_monitor:ping_all(),
- case rabbit_node_monitor:all_rabbit_nodes_up() of
- false -> not_healing;
- true -> AllPartitions = all_partitions(Partitions),
- {Winner, Losers} = make_decision(AllPartitions),
- rabbit_log:info("Autoheal decision~n"
- " * Partitions: ~p~n"
- " * Winner: ~p~n"
- " * Losers: ~p~n",
- [AllPartitions, Winner, Losers]),
- Continue = fun(Msg) ->
- handle_msg(Msg, not_healing, Partitions)
- end,
- case node() =:= Winner of
- true -> Continue({become_winner, Losers});
- false -> send(Winner, {become_winner, Losers}), %% [0]
- case lists:member(node(), Losers) of
- true -> Continue({winner_is, Winner});
- false -> {leader_waiting, Losers}
- end
- end
+ case check_other_nodes(Partitions) of
+ {error, E} ->
+ rabbit_log:info("Autoheal request denied: ~s~n", [fmt_error(E)]),
+ not_healing;
+ {ok, AllPartitions} ->
+ {Winner, Losers} = make_decision(AllPartitions),
+ rabbit_log:info("Autoheal decision~n"
+ " * Partitions: ~p~n"
+ " * Winner: ~p~n"
+ " * Losers: ~p~n",
+ [AllPartitions, Winner, Losers]),
+ Continue = fun(Msg) ->
+ handle_msg(Msg, not_healing, Partitions)
+ end,
+ case node() =:= Winner of
+ true -> Continue({become_winner, Losers});
+ false -> send(Winner, {become_winner, Losers}), %% [0]
+ case lists:member(node(), Losers) of
+ true -> Continue({winner_is, Winner});
+ false -> {leader_waiting, Losers}
+ end
+ end
end;
%% [0] If we are a loser we will never receive this message - but it
%% won't stick in the mailbox as we are restarting anyway
@@ -211,11 +212,21 @@ partition_value(Partition) ->
%% We have our local understanding of what partitions exist; but we
%% only know which nodes we have been partitioned from, not which
%% nodes are partitioned from each other.
-all_partitions(PartitionedWith) ->
+check_other_nodes(LocalPartitions) ->
Nodes = rabbit_mnesia:cluster_nodes(all),
- Partitions = [{node(), PartitionedWith} |
- rabbit_node_monitor:partitions(Nodes -- [node()])],
- all_partitions(Partitions, [Nodes]).
+ {Results, Bad} = rabbit_node_monitor:status(Nodes -- [node()]),
+ RemotePartitions = [{Node, proplists:get_value(partitions, Res)}
+ || {Node, Res} <- Results],
+ RemoteDown = [{Node, Down}
+ || {Node, Res} <- Results,
+ Down <- [Nodes -- proplists:get_value(nodes, Res)],
+ Down =/= []],
+ case {Bad, RemoteDown} of
+ {[], []} -> Partitions = [{node(), LocalPartitions} | RemotePartitions],
+ {ok, all_partitions(Partitions, [Nodes])};
+ {[], _} -> {error, {remote_down, RemoteDown}};
+ {_, _} -> {error, {nodes_down, Bad}}
+ end.
all_partitions([], Partitions) ->
Partitions;
@@ -230,3 +241,8 @@ all_partitions([{Node, CantSee} | Rest], Partitions) ->
_ -> [A, B | Others]
end,
all_partitions(Rest, Partitions1).
+
+fmt_error({remote_down, RemoteDown}) ->
+ rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]);
+fmt_error({nodes_down, NodesDown}) ->
+ rabbit_misc:format("Local nodes down: ~p", [NodesDown]).
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index a948115d..02aac125 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -24,7 +24,7 @@
write_cluster_status/1, read_cluster_status/0,
update_cluster_status/0, reset_cluster_status/0]).
-export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
--export([partitions/0, partitions/1, subscribe/1]).
+-export([partitions/0, partitions/1, status/1, subscribe/1]).
-export([pause_minority_guard/0]).
%% gen_server callbacks
@@ -62,6 +62,7 @@
-spec(partitions/0 :: () -> [node()]).
-spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]).
+-spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}).
-spec(subscribe/1 :: (pid()) -> 'ok').
-spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing').
@@ -186,6 +187,9 @@ partitions(Nodes) ->
{Replies, _} = gen_server:multi_call(Nodes, ?SERVER, partitions, infinity),
Replies.
+status(Nodes) ->
+ gen_server:multi_call(Nodes, ?SERVER, status, infinity).
+
subscribe(Pid) ->
gen_server:cast(?SERVER, {subscribe, Pid}).
@@ -252,6 +256,10 @@ init([]) ->
handle_call(partitions, _From, State = #state{partitions = Partitions}) ->
{reply, Partitions, State};
+handle_call(status, _From, State = #state{partitions = Partitions}) ->
+ {reply, [{partitions, Partitions},
+ {nodes, [node() | nodes()]}], State};
+
handle_call(_Request, _From, State) ->
{noreply, State}.