diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-11-21 13:35:22 +0000 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-11-21 13:35:22 +0000 |
commit | 6bc445afbc6b1f63e8c89613c115accfc5c1ec07 (patch) | |
tree | a3d93e843a8854c78f67dbb5e74dafed5233b3ab | |
parent | bed348597806dbd80cf42d6f9bdf68361a86faa5 (diff) | |
download | rabbitmq-server-6bc445afbc6b1f63e8c89613c115accfc5c1ec07.tar.gz |
Be a bit more careful before declaring a partial partition.
-rw-r--r-- | src/rabbit_node_monitor.erl | 21 |
1 file changed, 20 insertions, 1 deletion
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index e6069387..5f453053 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -317,10 +317,29 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID}, node_guids = GUIDs}) -> case lists:member(Node, rabbit_mnesia:cluster_nodes(running)) andalso orddict:find(Node, GUIDs) =:= {ok, NodeGUID} of - true -> cast(Rep, {partial_partition, Node, node(), RepGUID}); + true -> spawn_link( %%[1] + fun () -> + case rpc:call(Node, rabbit, is_running, []) of + {badrpc, _} -> + ok; + _ -> + cast(Rep, {partial_partition, + Node, node(), RepGUID}) + end + end); false -> ok end, {noreply, State}; +%% [1] We checked that we haven't heard the node go down - but we +%% really should make sure we can actually communicate with +%% it. Otherwise there's a race where we falsely detect a partial +%% partition. +%% +%% Now of course the rpc:call/4 may take a long time to return if +%% connectivity with the node is actually interrupted - but that's OK, +%% we only really want to do something in a timely manner if +%% connectivity is OK. However, of course as always we must not block +%% the node monitor, so we do the check in a separate process. handle_cast({check_partial_partition, _Node, _Reporter, _NodeGUID, _GUID, _ReporterGUID}, State) -> |