summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2014-10-10 16:21:09 +0100
committerSimon MacMullen <simon@rabbitmq.com>2014-10-10 16:21:09 +0100
commitd1a4b8ebebcc981b790163d66cc1478d53cbf7ca (patch)
tree0bb95417eecfa44e47becf8eb291cbac212ed509
parent16d135e7ff952ee3efeacab5b1d34134f67de2ce (diff)
parent264fd053a141db698a6b078f3fdf2e65378424c9 (diff)
downloadrabbitmq-server-d1a4b8ebebcc981b790163d66cc1478d53cbf7ca.tar.gz
Merge in default
-rw-r--r--src/rabbit_mirror_queue_sync.erl24
1 files changed, 18 insertions, 6 deletions
diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
index e3fae4c0..d1ef5f30 100644
--- a/src/rabbit_mirror_queue_sync.erl
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -156,18 +156,30 @@ syncer(Ref, Log, MPid, SPids) ->
%% We wait for a reply from the slaves so that we know they are in
%% a receive block and will thus receive messages we send to them
%% *without* those messages ending up in their gen_server2 pqueue.
- case [SPid || SPid <- SPids,
- receive
- {sync_ready, Ref, SPid} -> true;
- {sync_deny, Ref, SPid} -> false;
- {'DOWN', _, process, SPid, _} -> false
- end] of
+ case await_slaves(Ref, SPids) of
[] -> Log("all slaves already synced", []);
SPids1 -> MPid ! {ready, self()},
Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
syncer_loop(Ref, MPid, SPids1)
end.
+await_slaves(Ref, SPids) ->
+ Nodes = rabbit_mnesia:cluster_nodes(running),
+ [SPid || SPid <- SPids,
+ lists:member(node(SPid), Nodes) andalso %% [0]
+ receive
+ {sync_ready, Ref, SPid} -> true;
+ {sync_deny, Ref, SPid} -> false;
+ {'DOWN', _, process, SPid, _} -> false
+ end].
+%% [0] This check is in case there's been a partition which has then
+%% healed in between the master retrieving the slave pids from Mnesia
+%% and sending 'sync_start' over GM. If so there might be slaves on the
+%% other side of the partition which we can monitor (since they have
+%% rejoined the distributed system with us) but which did not get the
+%% 'sync_start' and so will not reply. We need to act as though they are
+%% down.
+
syncer_loop(Ref, MPid, SPids) ->
MPid ! {next, Ref},
receive