From 264fd053a141db698a6b078f3fdf2e65378424c9 Mon Sep 17 00:00:00 2001
From: Simon MacMullen <simon@rabbitmq.com>
Date: Wed, 8 Oct 2014 15:41:31 +0100
Subject: Defend against partitions at the wrong time causing badness.

---
 src/rabbit_mirror_queue_sync.erl | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
index e3fae4c0..d1ef5f30 100644
--- a/src/rabbit_mirror_queue_sync.erl
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -156,18 +156,30 @@ syncer(Ref, Log, MPid, SPids) ->
     %% We wait for a reply from the slaves so that we know they are in
     %% a receive block and will thus receive messages we send to them
     %% *without* those messages ending up in their gen_server2 pqueue.
-    case [SPid || SPid <- SPids,
-                  receive
-                      {sync_ready, Ref, SPid}       -> true;
-                      {sync_deny,  Ref, SPid}       -> false;
-                      {'DOWN', _, process, SPid, _} -> false
-                  end] of
+    case await_slaves(Ref, SPids) of
         []     -> Log("all slaves already synced", []);
         SPids1 -> MPid ! {ready, self()},
                   Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
                   syncer_loop(Ref, MPid, SPids1)
     end.
 
+await_slaves(Ref, SPids) ->
+    Nodes = rabbit_mnesia:cluster_nodes(running),
+    [SPid || SPid <- SPids,
+             lists:member(node(SPid), Nodes) andalso %% [0]
+                 receive
+                     {sync_ready, Ref, SPid}       -> true;
+                     {sync_deny,  Ref, SPid}       -> false;
+                     {'DOWN', _, process, SPid, _} -> false
+                 end].
+%% [0] This check is in case there's been a partition which has then
+%% healed in between the master retrieving the slave pids from Mnesia
+%% and sending 'sync_start' over GM. If so there might be slaves on the
+%% other side of the partition which we can monitor (since they have
+%% rejoined the distributed system with us) but which did not get the
+%% 'sync_start' and so will not reply. We need to act as though they are
+%% down.
+
 syncer_loop(Ref, MPid, SPids) ->
     MPid ! {next, Ref},
     receive
-- 
cgit v1.2.1