summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2014-09-30 16:14:39 +0100
committerSimon MacMullen <simon@rabbitmq.com>2014-09-30 16:14:39 +0100
commit8d3f832d16e6d9e3e2a0982c4a90701eb3cc0cae (patch)
tree935142d696238206225f331dcc3663b91e462c6a
parentfb200997e97af5d4f15166a53beca72101602a8d (diff)
downloadrabbitmq-server-8d3f832d16e6d9e3e2a0982c4a90701eb3cc0cae.tar.gz
In the event of a partial partition in pause_minority mode, pause until everything comes back - otherwise we stand a chance of just reconnecting and still being in a partial partition.
-rw-r--r--src/rabbit_node_monitor.erl39
1 files changed, 25 insertions, 14 deletions
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index aa2cba2b..935b40c9 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -290,14 +290,25 @@ handle_cast({check_partial_partition, _Node, _NodeGUID, _Reporter, _GUID},
handle_cast({partial_partition, GUID, Reporter, Proxy},
State = #state{guid = GUID}) ->
- rabbit_log:error(
- "Partial partition detected:~n"
- " * This node was reported DOWN by ~s~n"
- " * We can still see ~s via ~s~n~n"
- "We will therefore intentionally disconnect from ~s~n",
- [Reporter, Reporter, Proxy, Proxy]),
- erlang:disconnect_node(Proxy),
- {noreply, State};
+ FmtBase = "Partial partition detected:~n"
+ " * This node was reported DOWN by ~s~n"
+ " * We can still see ~s which can see ~s~n",
+ ArgsBase = [Reporter, Proxy, Reporter],
+ case application:get_env(rabbit, cluster_partition_handling) of
+ {ok, pause_minority} ->
+ rabbit_log:error(
+ FmtBase ++ " * pause_minority mode enabled~n"
+ "We will therefore pause until the *entire* cluster recovers~n",
+ ArgsBase),
+ await_cluster_recovery(fun all_nodes_up/0),
+ {noreply, State};
+ {ok, _} ->
+ rabbit_log:error(
+ FmtBase ++ "We will therefore intentionally disconnect from ~s~n",
+ ArgsBase ++ [Proxy]),
+ erlang:disconnect_node(Proxy),
+ {noreply, State}
+ end;
handle_cast({partial_partition, _GUID, _Reporter, _Proxy}, State) ->
{noreply, State};
@@ -457,7 +468,7 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
{ok, pause_minority} ->
case majority() of
true -> ok;
- false -> await_cluster_recovery()
+ false -> await_cluster_recovery(fun majority/0)
end,
State;
{ok, ignore} ->
@@ -470,12 +481,12 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
State
end.
-await_cluster_recovery() ->
+await_cluster_recovery(Condition) ->
rabbit_log:warning("Cluster minority status detected - awaiting recovery~n",
[]),
run_outside_applications(fun () ->
rabbit:stop(),
- wait_for_cluster_recovery()
+ wait_for_cluster_recovery(Condition)
end),
ok.
@@ -492,12 +503,12 @@ run_outside_applications(Fun) ->
end
end).
-wait_for_cluster_recovery() ->
+wait_for_cluster_recovery(Condition) ->
ping_all(),
- case majority() of
+ case Condition() of
true -> rabbit:start();
false -> timer:sleep(?RABBIT_DOWN_PING_INTERVAL),
- wait_for_cluster_recovery()
+ wait_for_cluster_recovery(Condition)
end.
handle_dead_rabbit(Node, State = #state{partitions = Partitions,