Switch pause_minority mode to make its decisions based entirely on node-upness, not rabbit-application-upness, and explain why. (bug25471)

author: Simon MacMullen <simon@rabbitmq.com> 2013-04-22 16:23:54 +0100
committer: Simon MacMullen <simon@rabbitmq.com> 2013-04-22 16:23:54 +0100
commit: d35534e6a7a0a04fc2ed68061d9c2508864a0865 (patch)
tree: b1f40193c6edefdd551d9f35a355c3402da8b1a4
parent: 8b7df16ae4b8014181c32b29db2cba53c35b2c24 (diff)
download: rabbitmq-server-bug25471.tar.gz
1 file changed, 18 insertions, 2 deletions
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index ca8e6dbd..7d844c72 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -200,6 +200,7 @@ init([]) ->
     %% writing out the cluster status files - bad things can then
     %% happen.
     process_flag(trap_exit, true),
+    net_kernel:monitor_nodes(true),
     {ok, _} = mnesia:subscribe(system),
     {ok, #state{monitors    = pmon:new(),
                 subscribers = pmon:new(),
@@ -265,6 +266,10 @@ handle_info({'DOWN', _MRef, process, Pid, _Reason},
             State = #state{subscribers = Subscribers}) ->
     {noreply, State#state{subscribers = pmon:erase(Pid, Subscribers)}};
 
+handle_info({nodedown, Node}, State) ->
+    ok = handle_dead_node(Node),
+    {noreply, State};
+
 handle_info({mnesia_system_event,
              {inconsistent_database, running_partitioned_network, Node}},
             State = #state{partitions = Partitions,
@@ -333,6 +338,18 @@ handle_dead_rabbit(Node) ->
     ok = rabbit_amqqueue:on_node_down(Node),
     ok = rabbit_alarm:on_node_down(Node),
     ok = rabbit_mnesia:on_node_down(Node),
+    ok.
+
+handle_dead_node(_Node) ->
+    %% In general in rabbit_node_monitor we care about whether the
+    %% rabbit application is up rather than the node; we do this so
+    %% that we can respond in the same way to "rabbitmqctl stop_app"
+    %% and "rabbitmqctl stop" as much as possible.
+    %%
+    %% However, for pause_minority mode we can't do this, since we
+    %% depend on looking at whether other nodes are up to decide
+    %% whether to come back up ourselves - if we decide that based on
+    %% the rabbit application we would go down and never come back.
     case application:get_env(rabbit, cluster_partition_handling) of
         {ok, pause_minority} ->
             case majority() of
@@ -347,8 +364,7 @@ handle_dead_rabbit(Node) ->
             rabbit_log:warning("cluster_partition_handling ~p unrecognised, "
                                "assuming 'ignore'~n", [Term]),
             ok
-    end,
-    ok.
+    end.
 
 await_cluster_recovery() ->
     rabbit_log:warning("Cluster minority status detected - awaiting recovery~n",
author	Simon MacMullen <simon@rabbitmq.com>	2013-04-22 16:23:54 +0100
committer	Simon MacMullen <simon@rabbitmq.com>	2013-04-22 16:23:54 +0100
commit	d35534e6a7a0a04fc2ed68061d9c2508864a0865 (patch)
tree	b1f40193c6edefdd551d9f35a355c3402da8b1a4
parent	8b7df16ae4b8014181c32b29db2cba53c35b2c24 (diff)
download	rabbitmq-server-bug25471.tar.gz