From 264fd053a141db698a6b078f3fdf2e65378424c9 Mon Sep 17 00:00:00 2001
From: Simon MacMullen
Date: Wed, 8 Oct 2014 15:41:31 +0100
Subject: Defend against partitions at the wrong time causing badness.

---
 src/rabbit_mirror_queue_sync.erl | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
index e3fae4c0..d1ef5f30 100644
--- a/src/rabbit_mirror_queue_sync.erl
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -156,18 +156,30 @@ syncer(Ref, Log, MPid, SPids) ->
     %% We wait for a reply from the slaves so that we know they are in
     %% a receive block and will thus receive messages we send to them
     %% *without* those messages ending up in their gen_server2 pqueue.
-    case [SPid || SPid <- SPids,
-                  receive
-                      {sync_ready, Ref, SPid} -> true;
-                      {sync_deny, Ref, SPid} -> false;
-                      {'DOWN', _, process, SPid, _} -> false
-                  end] of
+    case await_slaves(Ref, SPids) of
         [] -> Log("all slaves already synced", []);
         SPids1 -> MPid ! {ready, self()},
                   Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
                   syncer_loop(Ref, MPid, SPids1)
     end.
 
+await_slaves(Ref, SPids) ->
+    Nodes = rabbit_mnesia:cluster_nodes(running),
+    [SPid || SPid <- SPids,
+             lists:member(node(SPid), Nodes) andalso %% [0]
+                 receive
+                     {sync_ready, Ref, SPid} -> true;
+                     {sync_deny, Ref, SPid} -> false;
+                     {'DOWN', _, process, SPid, _} -> false
+                 end].
+%% [0] This check is in case there's been a partition which has then
+%% healed in between the master retrieving the slave pids from Mnesia
+%% and sending 'sync_start' over GM. If so there might be slaves on the
+%% other side of the partition which we can monitor (since they have
+%% rejoined the distributed system with us) but which did not get the
+%% 'sync_start' and so will not reply. We need to act as though they are
+%% down.
+
 syncer_loop(Ref, MPid, SPids) ->
     MPid ! {next, Ref},
     receive
-- 
cgit v1.2.1