author     Erlang/OTP <otp@erlang.org>   2022-03-28 12:39:16 +0200
committer  Erlang/OTP <otp@erlang.org>   2022-03-28 12:39:16 +0200
commit     a41a194004efd69f43355b65456a0e2756fc2ffd (patch)
tree       83f002a343e1c58c4b1aaa59f6f38c02da7a7f70
parent     42df73ae0e05a6382c5dec45e73d3635c36e4c79 (diff)
parent     1c0e9f0421e5a8356560d37aee500587d21f3309 (diff)
download   erlang-a41a194004efd69f43355b65456a0e2756fc2ffd.tar.gz
Merge branch 'rickard/prevent-overlapping-partitions/22.3.4/ERIERL-732/OTP-17843' into maint-22
* rickard/prevent-overlapping-partitions/22.3.4/ERIERL-732/OTP-17843:
  global: Preventing overlapping partitions fix
  global: Propagate and save version between all nodes
-rw-r--r--  bootstrap/lib/kernel/ebin/global.beam         bin  28700 -> 33588 bytes
-rw-r--r--  bootstrap/lib/kernel/ebin/global_group.beam   bin  15708 -> 15744 bytes
-rw-r--r--  erts/emulator/test/distribution_SUITE.erl     83
-rw-r--r--  erts/preloaded/ebin/init.beam                 bin  50224 -> 50808 bytes
-rw-r--r--  erts/preloaded/src/erts.app.src               2
-rw-r--r--  erts/preloaded/src/init.erl                   20
-rw-r--r--  lib/kernel/doc/src/global.xml                 33
-rw-r--r--  lib/kernel/doc/src/kernel_app.xml             13
-rw-r--r--  lib/kernel/src/global.erl                     534
-rw-r--r--  lib/kernel/src/global_group.erl               26
-rw-r--r--  lib/kernel/src/kernel.app.src                 5
-rw-r--r--  lib/kernel/test/erl_distribution_SUITE.erl    19
-rw-r--r--  lib/kernel/test/global_SUITE.erl              997
-rw-r--r--  lib/kernel/test/pg2_SUITE.erl                 262
-rw-r--r--  lib/kernel/test/rpc_SUITE.erl                 50
15 files changed, 1343 insertions, 701 deletions
diff --git a/bootstrap/lib/kernel/ebin/global.beam b/bootstrap/lib/kernel/ebin/global.beam
index aadb5bf080..537f1c8a77 100644
--- a/bootstrap/lib/kernel/ebin/global.beam
+++ b/bootstrap/lib/kernel/ebin/global.beam
Binary files differ
diff --git a/bootstrap/lib/kernel/ebin/global_group.beam b/bootstrap/lib/kernel/ebin/global_group.beam
index 83885cdeb5..364d2621ae 100644
--- a/bootstrap/lib/kernel/ebin/global_group.beam
+++ b/bootstrap/lib/kernel/ebin/global_group.beam
Binary files differ
diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index 2b8f69b2b9..f9f7f847f1 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -1073,21 +1073,24 @@ dist_auto_connect_relay(Parent) ->
dist_parallel_send(Config) when is_list(Config) ->
- {ok, RNode} = start_node(dist_parallel_receiver),
- {ok, SNode} = start_node(dist_parallel_sender),
- WatchDog = spawn_link(
- fun () ->
- TRef = erlang:start_timer((2*60*1000), self(), oops),
- receive
- {timeout, TRef, _ } ->
- spawn(SNode, fun () -> abort(timeout) end),
- spawn(RNode, fun () -> abort(timeout) end)
- %% rpc:cast(SNode, erlang, halt,
- %% ["Timetrap (sender)"]),
- %% rpc:cast(RNode, erlang, halt,
- %% ["Timetrap (receiver)"])
- end
- end),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, RNode} = start_node(dist_parallel_receiver, "-connect_all false"),
+ {ok, SNode} = start_node(dist_parallel_sender, "-connect_all false"),
+
+ %% WatchDog = spawn_link(
+ %% fun () ->
+ %% TRef = erlang:start_timer((2*60*1000), self(), oops),
+ %% receive
+ %% {timeout, TRef, _ } ->
+ %% spawn(SNode, fun () -> abort(timeout) end),
+ %% spawn(RNode, fun () -> abort(timeout) end)
+ %% %% rpc:cast(SNode, erlang, halt,
+ %% %% ["Timetrap (sender)"]),
+ %% %% rpc:cast(RNode, erlang, halt,
+ %% %% ["Timetrap (receiver)"])
+ %% end
+ %% end),
+
MkSndrs = fun (Receiver) ->
lists:map(fun (_) ->
spawn_link(SNode,
@@ -1096,18 +1099,23 @@ dist_parallel_send(Config) when is_list(Config) ->
[self(), Receiver, 1000])
end, lists:seq(1, 64))
end,
+ Parent = self(),
SndrsStart = fun (Sndrs) ->
- Parent = self(),
spawn_link(SNode,
fun () ->
lists:foreach(fun (P) ->
P ! {go, Parent}
- end, Sndrs)
+ end, Sndrs),
+ unlink(Parent)
end)
end,
SndrsWait = fun (Sndrs) ->
lists:foreach(fun (P) ->
- receive {P, done} -> ok end
+ receive
+ {P, done} ->
+ unlink(P),
+ ok
+ end
end, Sndrs)
end,
DPR = spawn_link(RNode, ?MODULE, dist_parallel_receiver, []),
@@ -1124,8 +1132,8 @@ dist_parallel_send(Config) when is_list(Config) ->
unlink(DEPR),
exit(DEPR, bang),
- unlink(WatchDog),
- exit(WatchDog, bang),
+ %% unlink(WatchDog),
+ %% exit(WatchDog, bang),
stop_node(RNode),
stop_node(SNode),
@@ -1765,8 +1773,9 @@ start_link(Offender,P) ->
bad_dist_structure(Config) when is_list(Config) ->
ct:timetrap({seconds, 15}),
- {ok, Offender} = start_node(bad_dist_structure_offender),
- {ok, Victim} = start_node(bad_dist_structure_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_structure_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_structure_victim, "-connect_all false"),
start_node_monitors([Offender,Victim]),
Parent = self(),
P = spawn(Victim,
@@ -1860,8 +1869,9 @@ bad_dist_structure(Config) when is_list(Config) ->
bad_dist_fragments(Config) when is_list(Config) ->
ct:timetrap({seconds, 15}),
- {ok, Offender} = start_node(bad_dist_fragment_offender),
- {ok, Victim} = start_node(bad_dist_fragment_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_fragment_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_fragment_victim, "-connect_all false"),
Msg = iolist_to_binary(dmsg_ext(lists:duplicate(255,255))),
@@ -1997,8 +2007,9 @@ send_bad_fragments(Offender,VictimNode,Victim,Ctrl,WhereToPutSelf,Fragments) ->
end.
bad_dist_ext_receive(Config) when is_list(Config) ->
- {ok, Offender} = start_node(bad_dist_ext_receive_offender),
- {ok, Victim} = start_node(bad_dist_ext_receive_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_ext_receive_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_ext_receive_victim, "-connect_all false"),
start_node_monitors([Offender,Victim]),
Parent = self(),
@@ -2069,8 +2080,9 @@ bad_dist_ext_receive(Config) when is_list(Config) ->
bad_dist_ext_process_info(Config) when is_list(Config) ->
- {ok, Offender} = start_node(bad_dist_ext_process_info_offender),
- {ok, Victim} = start_node(bad_dist_ext_process_info_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_ext_process_info_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_ext_process_info_victim, "-connect_all false"),
start_node_monitors([Offender,Victim]),
Parent = self(),
@@ -2129,8 +2141,9 @@ bad_dist_ext_process_info(Config) when is_list(Config) ->
stop_node(Victim).
bad_dist_ext_control(Config) when is_list(Config) ->
- {ok, Offender} = start_node(bad_dist_ext_control_offender),
- {ok, Victim} = start_node(bad_dist_ext_control_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_ext_control_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_ext_control_victim, "-connect_all false"),
start_node_monitors([Offender,Victim]),
pong = rpc:call(Victim, net_adm, ping, [Offender]),
@@ -2148,8 +2161,9 @@ bad_dist_ext_control(Config) when is_list(Config) ->
stop_node(Victim).
bad_dist_ext_connection_id(Config) when is_list(Config) ->
- {ok, Offender} = start_node(bad_dist_ext_connection_id_offender),
- {ok, Victim} = start_node(bad_dist_ext_connection_id_victim),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_ext_connection_id_offender, "-connect_all false"),
+ {ok, Victim} = start_node(bad_dist_ext_connection_id_victim, "-connect_all false"),
start_node_monitors([Offender,Victim]),
Parent = self(),
@@ -2215,10 +2229,11 @@ bad_dist_ext_connection_id(Config) when is_list(Config) ->
%% OTP-14661: Bad message is discovered by erts_msg_attached_data_size
bad_dist_ext_size(Config) when is_list(Config) ->
- {ok, Offender} = start_node(bad_dist_ext_process_info_offender),
+ %% Disabled "connect all" so global won't interfere...
+ {ok, Offender} = start_node(bad_dist_ext_process_info_offender, "-connect_all false"),
%%Prog = "Prog=/home/uabseri/src/otp_new3/bin/cerl -rr -debug",
Prog = [],
- {ok, Victim} = start_node(bad_dist_ext_process_info_victim, [], Prog),
+ {ok, Victim} = start_node(bad_dist_ext_process_info_victim, "-connect_all false", Prog),
start_node_monitors([Offender,Victim]),
Parent = self(),
diff --git a/erts/preloaded/ebin/init.beam b/erts/preloaded/ebin/init.beam
index 6f4d981219..cd161f4dd7 100644
--- a/erts/preloaded/ebin/init.beam
+++ b/erts/preloaded/ebin/init.beam
Binary files differ
diff --git a/erts/preloaded/src/erts.app.src b/erts/preloaded/src/erts.app.src
index d3dbe0f2d1..4f11613d0e 100644
--- a/erts/preloaded/src/erts.app.src
+++ b/erts/preloaded/src/erts.app.src
@@ -42,7 +42,7 @@
{registered, []},
{applications, []},
{env, []},
- {runtime_dependencies, ["stdlib-3.5", "kernel-6.5.1", "sasl-3.3"]}
+ {runtime_dependencies, ["stdlib-3.5", "kernel-@OTP-17843@", "sasl-3.3"]}
]}.
%% vim: ft=erlang
diff --git a/erts/preloaded/src/init.erl b/erts/preloaded/src/init.erl
index b36b4b5599..b2b16678f0 100644
--- a/erts/preloaded/src/init.erl
+++ b/erts/preloaded/src/init.erl
@@ -588,11 +588,31 @@ stop_heart(State) ->
shutdown_pids(Heart,Logger,BootPid,State) ->
Timer = shutdown_timer(State#state.flags),
+ global_prepare_shutdown(),
catch shutdown(State#state.kernel,BootPid,Timer,State),
kill_all_pids(Heart,Logger), % Even the shutdown timer.
kill_all_ports(Heart), % Logger has no ports
flush_timout(Timer).
+global_prepare_shutdown() ->
+ %% Inform global that we are shutting down, so it won't
+ %% send 'lost_connection' messages when connections
+ %% go down...
+ case whereis(global_name_server) of
+ undefined ->
+ ok;
+ Pid ->
+ Mon = erlang:monitor(process, Pid),
+ Pid ! {prepare_shutdown, self(), Mon},
+ receive
+ {Mon, ok} ->
+ erlang:demonitor(Mon, [flush]),
+ ok;
+ {'DOWN', Mon, process, Pid, _Reason} ->
+ ok
+ end
+ end.
+
get_heart(Kernel) ->
get_kernelpid(heart,Kernel).
diff --git a/lib/kernel/doc/src/global.xml b/lib/kernel/doc/src/global.xml
index 28b59c20bd..b8385436b5 100644
--- a/lib/kernel/doc/src/global.xml
+++ b/lib/kernel/doc/src/global.xml
@@ -37,6 +37,39 @@
<item>Global locks</item>
<item>Maintenance of the fully connected network</item>
</list>
+ <marker id="prevent_overlapping_partitions"/>
+ <warning>
+ <p>
+ By default <c>global</c> does <i>not</i> take any actions to restore
+ a fully connected network when connections are lost due to network
+ issues. This is problematic for all applications expecting a fully
+ connected network to be provided, such as for example <c>mnesia</c>,
+ but also for <c>global</c> itself. A network of overlapping partitions
+ might cause the internal state of <c>global</c> to become inconsistent.
+ Such an inconsistency can remain even after such partitions have been
+ brought together to form a fully connected network again. The effect
+ on other applications that expect a fully connected network to be
+ maintained may vary, but they might misbehave in very subtle,
+ hard-to-detect ways during such a partitioning.
+ </p>
+ <p>
+ In order to prevent such issues, we have introduced a <i>prevent
+ overlapping partitions</i> fix which can be enabled using the
+ <seealso marker="kernel_app#prevent_overlapping_partitions">
+ <c>prevent_overlapping_partitions</c></seealso> <c>kernel(6)</c>
+ parameter. When this fix has been enabled, <c>global</c> will actively
+ disconnect from nodes that report that they have lost connections to
+ other nodes. This will cause fully connected partitions to form
+ instead of leaving the network in a state with overlapping partitions.
+ Note that this fix <i>has</i> to be enabled on <i>all</i> nodes in the
+ network in order to work properly. Since this quite substantially
+ changes the behavior, this fix is currently disabled by default. Because
+ you might get hard-to-detect issues without this fix, you are, however,
+ <i>strongly</i> advised to enable this fix in order to avoid issues
+ such as the ones described above. As of OTP 25, this fix will be
+ enabled by default.
+ </p>
+ </warning>
<p>These services are controlled through the process
<c>global_name_server</c> that exists on every node. The global
name server starts automatically when a node is started.
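For reference, enabling the fix described in the warning above amounts to setting the new kernel parameter on every node. A minimal sketch (not part of this commit) using a sys.config file or the command line:

    %% sys.config -- enable the "prevent overlapping partitions" fix
    [{kernel, [{prevent_overlapping_partitions, true}]}].

    %% equivalent command-line form, using the standard per-application flag:
    %% erl -kernel prevent_overlapping_partitions true

As the warning notes, the parameter has to be set on all nodes in the network for the fix to work properly.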
diff --git a/lib/kernel/doc/src/kernel_app.xml b/lib/kernel/doc/src/kernel_app.xml
index 49e007aa09..aef5a9fbe2 100644
--- a/lib/kernel/doc/src/kernel_app.xml
+++ b/lib/kernel/doc/src/kernel_app.xml
@@ -336,6 +336,19 @@ MaxT = TickTime + TickTime / 4</code>
<p>Normally, a terminating node is detected immediately by the transport
protocol (like TCP/IP).</p>
</item>
+ <tag><marker id="prevent_overlapping_partitions"/>
+ <c>prevent_overlapping_partitions = true | false</c></tag>
+ <item>
+ <p>
+ If enabled (<c>true</c>), <c>global</c> will actively prevent
+ overlapping partitions from forming when connections are lost
+ between nodes. This fix is, however, currently disabled by
+ default. See the
+ <seealso marker="kernel:global#prevent_overlapping_partitions">
+ <c>global(3)</c></seealso> documentation for more
+ information.
+ </p>
+ </item>
<tag><c>shutdown_timeout = integer() | infinity</c></tag>
<item>
<p>Specifies the time <c>application_controller</c> waits
diff --git a/lib/kernel/src/global.erl b/lib/kernel/src/global.erl
index 491f60688a..764a0bfd0f 100644
--- a/lib/kernel/src/global.erl
+++ b/lib/kernel/src/global.erl
@@ -32,7 +32,7 @@
whereis_name/1, register_name/2,
register_name/3, register_name_external/2, register_name_external/3,
unregister_name_external/1,re_register_name/2, re_register_name/3,
- unregister_name/1, registered_names/0, send/2, node_disconnected/1,
+ unregister_name/1, registered_names/0, send/2,
set_lock/1, set_lock/2, set_lock/3,
del_lock/1, del_lock/2,
trans/2, trans/3, trans/4,
@@ -54,6 +54,10 @@
-define(N_CONNECT_RETRIES, global_connect_retries).
-define(DEFAULT_N_CONNECT_RETRIES, 0).
+%% Time that we keep information about multicasted lost_connection
+%% messages...
+-define(lost_conn_info_cleanup_time, 60*60*1000).
+
%%% In certain places in the server, calling io:format hangs everything,
%%% so we'd better use erlang:display/1.
%%% my_tracer is used in testsuites
@@ -70,6 +74,9 @@
-define(trace(_), ok).
-endif.
+-define(MAX_64BIT_SMALL_INT, ((1 bsl 59) - 1)).
+-define(MIN_64BIT_SMALL_INT, (-(1 bsl 59))).
+
%% These are the protocol versions:
%% Vsn 1 is the original protocol.
%% Vsn 2 is enhanced with code to take care of registration of names from
@@ -82,15 +89,26 @@
%% nodes in the own partition. (R11B-)
%% Vsn 6 does not send any message between locker processes on different
%% nodes, but uses the server as a proxy.
+%% Vsn 7 - propagate global versions between nodes, so we always know
+%% versions of known nodes
+%% - optional "prevent overlapping partitions" fix supported
%% Current version of global does not support vsn 4 or earlier.
--define(vsn, 6).
+-define(vsn, 7).
+
+%% Version when the "prevent overlapping partitions" fix was introduced.
+-define(pop_vsn, 7).
+%% Version when the "propagate global protocol versions" feature
+%% was introduced.
+-define(pgpv_vsn, 7).
%%-----------------------------------------------------------------
%% connect_all = boolean() - true if we are supposed to set up a
%% fully connected net
-%% known = [Node] - all nodes known to us
+%% known = #{} - Map of known nodes including protocol version
+%% as well as some other information. See state
+%% record declaration below for more info.
%% synced = [Node] - all nodes that have the same names as us
%% resolvers = [{Node, MyTag, Resolver}] -
%% the tag separating different synch sessions,
@@ -112,8 +130,21 @@
%% {sync_tag_his, Node} = The Node's tag, used at synchronization
%% {lock_id, Node} = The resource locking the partitions
%%-----------------------------------------------------------------
--record(state, {connect_all :: boolean(),
- known = [] :: [node()],
+-record(conf, {connect_all :: boolean(),
+ prevent_over_part :: boolean()
+ }).
+
+-record(state, {conf = #conf{} :: #conf{},
+ known = #{} :: #{
+ %% Known connected node with protocol
+ %% version as value
+ node() => non_neg_integer(),
+ %% Connecting node, not yet known, with
+ %% protocol version as value
+ {pending, node()} => non_neg_integer(),
+ %% Node currently being removed
+ {removing, node()} => yes
+ },
synced = [] :: [node()],
resolvers = [],
syncers = [] :: [pid()],
@@ -142,6 +173,12 @@
%%% Resources locked by Pid.
%%% ref() is the same ref() as in global_locks.
%%%
+%%% global_lost_connections (set):
+%%% {{NodeA, NodeB}, {ExtendedCreationA, OpIdA, Timer}}
+%%% Information about lost connections (reported by NodeA) used by
+%%% the "prevent overlapping partitions" fix. The timer is used is
+%%% used to remove the entry when not needed anymore.
+%%%
%%% global_pid_names is a 'bag' for backward compatibility.
%%% (Before vsn 5 more than one name could be registered for a process.)
%%%
@@ -207,9 +244,6 @@ send(Name, Msg) ->
whereis_name(Name) ->
where(Name).
-node_disconnected(Node) ->
- global_name_server ! {nodedown, Node}.
-
%%-----------------------------------------------------------------
%% Method = function(Name, Pid1, Pid2) -> Pid | Pid2 | none
%% Method is called if a name conflict is detected when two nodes
@@ -452,6 +486,7 @@ init([]) ->
_ = ets:new(global_pid_names, [bag, named_table, protected]),
_ = ets:new(global_pid_ids, [bag, named_table, protected]),
+ _ = ets:new(global_lost_connections, [set, named_table, protected]),
%% This is for troubleshooting only.
DoTrace = os:getenv("GLOBAL_HIGH_LEVEL_TRACE") =:= "TRUE",
@@ -469,10 +504,28 @@ init([]) ->
_ ->
true
end,
+ POP = case application:get_env(kernel,
+ prevent_overlapping_partitions) of
+ {ok, Bool} when Bool == true; Bool == false ->
+ Bool;
+ {ok, Invalid} ->
+ error({invalid_parameter_value,
+ prevent_overlapping_partitions,
+ Invalid});
+ undefined ->
+ false
+ end,
S = #state{the_locker = start_the_locker(DoTrace),
trace = T0,
the_registrar = start_the_registrar(),
- connect_all = Ca},
+ conf = #conf{connect_all = Ca,
+ prevent_over_part = POP}},
+ _ = rand:seed(exsss,
+ (erlang:monotonic_time(nanosecond) rem 1000000000)
+ + (erlang:system_time(nanosecond) rem 1000000000)),
+ CreX = ((rand:uniform(?MAX_64BIT_SMALL_INT - ?MIN_64BIT_SMALL_INT)
+ - 1) band (bnot ((1 bsl 2) -1))),
+ put(creation_extension, CreX),
{ok, trace_message(S, {init, node()}, [])}.
%%-----------------------------------------------------------------
@@ -593,6 +646,51 @@ init([]) ->
%% sent by each node to all new nodes (Node becomes known to them)
%%-----------------------------------------------------------------
+%% ----------------------------------------------------------------
+%% Prevent Overlapping Partitions Algorithm
+%% ========================================
+%%
+%% 1. When a node loses connection to another node, it sends a
+%% {lost_connection, LostConnNode, OtherNode} message to all
+%% other nodes that it knows of.
+%% 2. When a lost_connection message is received the receiver
+%% first checks if it has seen this message before. If so, it
+%% just ignores it. If it has not seen it before, it sends the
+%% message to all nodes it knows of. This is in order to ensure
+%% that all connected nodes will receive this message. It then
+%% sends a {remove_connection, LostConnRecvNode} message (where
+%% LostConnRecvNode is its own node name) to OtherNode and
+%% clears all information about OtherNode so OtherNode won't be
+%% part of ReceiverNode's cluster anymore. When this information
+%% has been cleared, no lost_connection will be triggered when
+%% a nodedown message for the connection to OtherNode is
+%% received.
+%% 3. When a {remove_connection, LostConnRecvNode} message is
+%% received, the receiver node takes down the connection to
+%% LostConnRecvNode and clears its information about
+%% LostConnRecvNode so it is not part of its cluster anymore.
+%% Both nodes will receive a nodedown message due to the
+%% connection being closed, but none of them will send
+%% lost_connection messages since they have cleared information
+%% about the other node.
+%%
+%% This will take down more connections than the minimum amount
+%% of connections to remove in order to form fully connected
+%% partitions. For example, if the user takes down a connection
+%% between two nodes, the rest of the nodes will disconnect from
+%% both of these nodes instead of just one. This is due to:
+%% * We do not want to partition a remaining network when a node
+%% has halted. When you receive a nodedown and/or lost_connection
+%% messages you don't know if the corresponding node has halted
+%% or if there are network issues.
+%% * We need to decide which connection to take down as soon as
+%% we receive a lost_connection message in order to prevent
+%% inconsistencies from entering global's state.
+%% * All nodes need to make the same choices independent of
+%% each other.
+%%
+%% ----------------------------------------------------------------
+
-spec handle_call(term(), {pid(), term()}, state()) ->
{'noreply', state()} |
{'reply', term(), state()} |
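To make the receiver side of step 2 in the algorithm above concrete, here is a minimal, hypothetical sketch (module and function names are made up; it ignores protocol-version checks and the corner case where the receiver is OtherNode itself, which the real handler later in this diff deals with):

    -module(pop_sketch).
    -export([on_lost_connection/3]).

    %% Self is this node, Known is the list of nodes it currently knows of,
    %% and the message says NodeA has lost its connection to OtherNode.
    %% Returns a list of abstract actions for the node to perform.
    on_lost_connection(Self, Known, {lost_connection, NodeA, OtherNode}) ->
        %% Step 2: forward the message to every node we know of (except the
        %% reporter) so the whole cluster learns about the lost connection...
        Forward = [{send, N, {lost_connection, NodeA, OtherNode}}
                   || N <- Known, N =/= NodeA],
        %% ...then ask OtherNode to drop its connection to us and clear our
        %% own information about OtherNode, so it leaves our cluster.
        Forward ++ [{send, OtherNode, {remove_connection, Self}},
                    {forget, OtherNode}].

A node that performs these actions will later get a plain nodedown for OtherNode, and since its information about OtherNode has already been cleared, it will not emit a lost_connection of its own (step 3 above).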
@@ -625,7 +723,7 @@ handle_call({del_lock, Lock}, {Pid, _Tag}, S0) ->
{reply, true, S};
handle_call(get_known, _From, S) ->
- {reply, S#state.known, S};
+ {reply, mk_known_list(0, S), S};
handle_call(get_synced, _From, S) ->
{reply, S#state.synced, S};
@@ -676,27 +774,28 @@ handle_call(Request, From, S) ->
-spec handle_cast(term(), state()) -> {'noreply', state()}.
-handle_cast({init_connect, Vsn, Node, InitMsg}, S) ->
+handle_cast({init_connect, Vsn, Node, InitMsg}, S0) ->
%% Sent from global_name_server at Node.
?trace({'####', init_connect, {vsn, Vsn}, {node,Node},{initmsg,InitMsg}}),
- case Vsn of
- %% It is always the responsibility of newer versions to understand
- %% older versions of the protocol.
- {HisVsn, HisTag} when HisVsn > ?vsn ->
- init_connect(?vsn, Node, InitMsg, HisTag, S#state.resolvers, S);
- {HisVsn, HisTag} ->
- init_connect(HisVsn, Node, InitMsg, HisTag, S#state.resolvers, S);
- %% To be future compatible
- Tuple when is_tuple(Tuple) ->
- List = tuple_to_list(Tuple),
- [_HisVsn, HisTag | _] = List,
- %% use own version handling if his is newer.
- init_connect(?vsn, Node, InitMsg, HisTag, S#state.resolvers, S);
- _ ->
- Txt = io_lib:format("Illegal global protocol version ~p Node: ~p\n",
- [Vsn, Node]),
- error_logger:info_report(lists:flatten(Txt))
- end,
+ S = case Vsn of
+ %% It is always the responsibility of newer versions to understand
+ %% older versions of the protocol.
+ {HisVsn, HisTag} when HisVsn > ?vsn ->
+ init_connect(?vsn, Node, InitMsg, HisTag, HisVsn, S0);
+ {HisVsn, HisTag} ->
+ init_connect(HisVsn, Node, InitMsg, HisTag, HisVsn, S0);
+ %% To be future compatible
+ Tuple when is_tuple(Tuple) ->
+ List = tuple_to_list(Tuple),
+ [HisVsn, HisTag | _] = List,
+ %% use own version handling if his is newer.
+ init_connect(?vsn, Node, InitMsg, HisTag, HisVsn, S0);
+ _ ->
+ Txt = io_lib:format("Illegal global protocol version ~p Node: ~p\n",
+ [Vsn, Node]),
+ error_logger:info_report(lists:flatten(Txt)),
+ S0
+ end,
{noreply, S};
%%=======================================================================
@@ -745,10 +844,10 @@ handle_cast({exchange_ops, Node, MyTag, Ops, Resolved}, S0) ->
S = trace_message(S0, {exit_resolver, Node}, [MyTag]),
case get({sync_tag_my, Node}) of
MyTag ->
- Known = S#state.known,
+ Known = mk_known_list(node_vsn(Node, S), S),
gen_server:cast({global_name_server, Node},
{resolved, node(), Resolved, Known,
- Known,get_names_ext(),get({sync_tag_his,Node})}),
+ unused,get_names_ext(),get({sync_tag_his,Node})}),
case get({save_ops, Node}) of
{resolved, HisKnown, Names_ext, HisResolved} ->
put({save_ops, Node}, Ops),
@@ -768,7 +867,7 @@ handle_cast({exchange_ops, Node, MyTag, Ops, Resolved}, S0) ->
%%
%% Here the name clashes are resolved.
%%========================================================================
-handle_cast({resolved, Node, HisResolved, HisKnown, _HisKnown_v2,
+handle_cast({resolved, Node, HisResolved, HisKnown, _Unused,
Names_ext, MyTag}, S) ->
%% Sent from global_name_server at Node.
?trace({'####', resolved, {his_resolved,HisResolved}, {node,Node}}),
@@ -805,7 +904,7 @@ handle_cast({new_nodes, Node, Ops, Names_ext, Nodes, ExtraInfo}, S) ->
%%
%% We are in sync with this node (from the other node's known world).
%%========================================================================
-handle_cast({in_sync, Node, _IsKnown}, S) ->
+handle_cast({in_sync, Node, _IsKnown}, #state{known = Known} = S) ->
%% Sent from global_name_server at Node (in the other partition).
?trace({'####', in_sync, {Node, _IsKnown}}),
lists:foreach(fun(Pid) -> Pid ! {synced, [Node]} end, S#state.syncers),
@@ -815,7 +914,8 @@ handle_cast({in_sync, Node, _IsKnown}, S) ->
true -> Synced;
false -> [Node | Synced]
end,
- {noreply, NewS#state{synced = NSynced}};
+ {noreply, NewS#state{known = maps:remove({pending, Node}, Known),
+ synced = NSynced}};
%% Called when Pid on other node crashed
handle_cast({async_del_name, _Name, _Pid}, S) ->
@@ -868,13 +968,20 @@ handle_info({nodedown, Node}, S) when Node =:= S#state.node_name ->
handle_info({nodedown, Node}, S0) ->
?trace({'####', nodedown, {node,Node}}),
S1 = trace_message(S0, {nodedown, Node}, []),
- S = handle_nodedown(Node, S1),
+ S = handle_nodedown(Node, S1, disconnected),
{noreply, S};
handle_info({extra_nodedown, Node}, S0) ->
?trace({'####', extra_nodedown, {node,Node}}),
S1 = trace_message(S0, {extra_nodedown, Node}, []),
- S = handle_nodedown(Node, S1),
+ S = handle_nodedown(Node, S1, disconnected),
+ {noreply, S};
+
+handle_info({ignore_node, Node}, S0) ->
+ %% global_group wants us to ignore this node...
+ ?trace({'####', ignore_node, {node,Node}}),
+ S1 = trace_message(S0, {ignore_node, Node}, []),
+ S = handle_nodedown(Node, S1, ignore_node),
{noreply, S};
handle_info({nodeup, Node}, S) when Node =:= node() ->
@@ -883,11 +990,12 @@ handle_info({nodeup, Node}, S) when Node =:= node() ->
%% references to old node name ('nonode@nohost') to Node.
{noreply, change_our_node_name(Node, S)};
-handle_info({nodeup, _Node}, S) when not S#state.connect_all ->
+handle_info({nodeup, _Node},
+ #state{conf = #conf{connect_all = false}} = S) ->
{noreply, S};
-handle_info({nodeup, Node}, S0) when S0#state.connect_all ->
- IsKnown = lists:member(Node, S0#state.known) or
+handle_info({nodeup, Node}, S0) ->
+ IsKnown = maps:is_key(Node, S0#state.known) or
%% This one is only for double nodeups (shouldn't occur!)
lists:keymember(Node, 1, S0#state.resolvers),
?trace({'####', nodeup, {node,Node}, {isknown,IsKnown}}),
@@ -928,6 +1036,110 @@ handle_info({whereis, Name, From}, S) ->
do_whereis(Name, From),
{noreply, S};
+handle_info({lost_connection, NodeA, XCreationA, OpIdA, NodeB} = Msg,
+ #state{conf = #conf{connect_all = true,
+ prevent_over_part = true}} = S0) ->
+ %% Message introduced in protocol version ?pop_vsn
+ %%
+ %% NodeA reports that it lost connection to NodeB. If we got a
+ %% connection to NodeB, we need to disconnect it in order to
+ %% prevent overlapping partitions...
+ LcKey = {NodeA, NodeB},
+ S1 = case get_lost_connection_info(LcKey) of
+ {XCreationA, OpId, _Tmr} when OpIdA =< OpId ->
+ %% We have already handled this lost connection
+ %% message...
+ S0;
+ {_, _, Tmr} ->
+ %% Inform all other nodes we know of about this as well. This
+ %% in order to prevent that some nodes in this cluster wont
+ %% get the original message from NodeA. This ensures that all
+ %% nodes will know of this connection loss, even if some nodes
+ %% lose connection to NodeA while NodeA is multicasting that
+ %% it lost the connection to NodeB.
+ gns_volatile_multicast(Msg, NodeA, ?pop_vsn, true, S0),
+
+ %% Save info about this lost_connection message...
+ save_lost_connection_info(LcKey, XCreationA, OpIdA, Tmr),
+
+ RmNode = case node() == NodeB of
+ false ->
+ NodeB;
+ true ->
+ %% This, together with NodeA being known by
+ %% us, probably is unusual, but can happen
+ %% since lost_connection messages are
+ %% repeated by receiving nodes. All other
+ %% nodes will remove us, so there is no
+ %% need to make them remove NodeA as well.
+ %% We therefore request removal from NodeA
+ %% and wait for the nodedown which
+ %% eventually will come since NodeA reported
+ %% this...
+ NodeA
+ end,
+
+ case is_node_potentially_known(RmNode, S0) of
+ false ->
+ S0;
+ true ->
+ case node_vsn(RmNode, S0) of
+ Vsn when Vsn < ?pop_vsn ->
+ erlang:disconnect_node(RmNode),
+ error_logger:warning_msg(
+ "'global' at node ~p disconnected old "
+ "node ~p in order to prevent overlapping "
+ "partitions",
+ [node(), RmNode]),
+ ok;
+ _Vsn ->
+ gns_volatile_send(RmNode,
+ {remove_connection, node()}),
+ error_logger:warning_msg(
+ "'global' at node ~p requested disconnect "
+ "from node ~p in order to prevent "
+ "overlapping partitions",
+ [node(), RmNode]),
+ ok
+ end,
+ handle_nodedown(RmNode, S0, remove_connection)
+ end
+ end,
+
+ {noreply, S1};
+
+handle_info({lost_connection, _NodeA, _XCreationA, _OpIdA, _NodeB}, S) ->
+ %% Message introduced in protocol version ?pop_vsn
+ {noreply, S};
+
+handle_info({timeout, _, _} = TmoMsg, S) ->
+ %% Instance specific message
+ remove_lost_connection_info(TmoMsg),
+ {noreply, S};
+
+handle_info({remove_connection, Node}, S0) ->
+ %% Message introduced in protocol version ?pop_vsn
+ S2 = case is_node_potentially_known(Node, S0) of
+ false ->
+ S0;
+ true ->
+ erlang:disconnect_node(Node),
+ S1 = handle_nodedown(Node, S0, remove_connection),
+ error_logger:warning_msg(
+ "'global' at node ~p disconnected node ~p in order to "
+ "prevent overlapping partitions", [node(), Node]),
+ S1
+ end,
+ {noreply, S2};
+
+handle_info({prepare_shutdown, From, Ref}, S0) ->
+ %% Prevent lost_connection messages from being sent due to
+ %% connections being taken down during the shutdown...
+ S1 = S0#state{conf = #conf{connect_all = false,
+ prevent_over_part = false}},
+ From ! {Ref, ok},
+ {noreply, S1};
+
handle_info(known, S) ->
io:format(">>>> ~p\n",[S#state.known]),
{noreply, S};
@@ -1067,7 +1279,8 @@ check_replies([], _Id, _Replies) ->
%% Another node wants to synchronize its registered names with us.
%% Both nodes must have a lock before they are allowed to continue.
%%========================================================================
-init_connect(Vsn, Node, InitMsg, HisTag, Resolvers, S) ->
+init_connect(Vsn, Node, InitMsg, HisTag, HisVsn,
+ #state{resolvers = Resolvers, known = Known} = S) ->
%% It is always the responsibility of newer versions to understand
%% older versions of the protocol.
put({prot_vsn, Node}, Vsn),
@@ -1083,7 +1296,9 @@ init_connect(Vsn, Node, InitMsg, HisTag, Resolvers, S) ->
false ->
?trace({init_connect,{pre_connect,Node},{histag,HisTag}}),
put({pre_connect, Node}, {Vsn, InitMsg, HisTag})
- end.
+ end,
+ S#state{known = maps:put({pending, Node}, HisVsn, Known)}.
+
%%========================================================================
%% In the simple case, we'll get lock_is_set before we get exchange,
@@ -1129,8 +1344,9 @@ resolved(Node, HisResolved, HisKnown, Names_ext, S0) ->
%% Known may have shrunk since the lock was taken (due to nodedowns).
Known = S0#state.known,
Synced = S0#state.synced,
- NewNodes = [Node | HisKnown],
- sync_others(HisKnown),
+ NewNodes = make_node_vsn_list([Node | HisKnown], S0),
+ HisKnownNodes = node_list(HisKnown),
+ sync_others(HisKnownNodes),
ExtraInfo = [{vsn,get({prot_vsn, Node})}, {lock, get({lock_id, Node})}],
S = do_ops(Ops, node(), Names_ext, ExtraInfo, S0),
%% I am synced with Node, but not with HisKnown yet
@@ -1138,47 +1354,57 @@ resolved(Node, HisResolved, HisKnown, Names_ext, S0) ->
S3 = lists:foldl(fun(Node1, S1) ->
F = fun(Tag) -> cancel_locker(Node1,S1,Tag) end,
cancel_resolved_locker(Node1, F)
- end, S, HisKnown),
+ end, S, HisKnownNodes),
%% The locker that took the lock is asked to send
%% the {new_nodes, ...} message. This ensures that
%% {del_lock, ...} is received after {new_nodes, ...}
- %% (except when abcast spawns process(es)...).
+ NewNodesMsg = {new_nodes, node(), Ops, Names_ext, NewNodes, ExtraInfo},
NewNodesF = fun() ->
- gen_server:abcast(Known, global_name_server,
- {new_nodes, node(), Ops, Names_ext,
- NewNodes, ExtraInfo})
+ lists:foreach(
+ fun (N) when is_atom(N) ->
+ case maps:get(N, Known) of
+ V when V >= ?pgpv_vsn ->
+ gen_server:cast({global_name_server, N},
+ NewNodesMsg);
+ _OldV ->
+ gen_server:cast({global_name_server, N},
+ {new_nodes, node(), Ops,
+ Names_ext, node_list(NewNodes),
+ ExtraInfo})
+ end;
+ (_) ->
+ ok
+ end,
+ maps:keys(Known))
end,
F = fun(Tag) -> cancel_locker(Node, S3, Tag, NewNodesF) end,
S4 = cancel_resolved_locker(Node, F),
%% See (*) below... we're node b in that description
- AddedNodes = (NewNodes -- Known),
- NewKnown = Known ++ AddedNodes,
- S4#state.the_locker ! {add_to_known, AddedNodes},
- NewS = trace_message(S4, {added, AddedNodes},
- [{new_nodes, NewNodes}, {abcast, Known}, {ops,Ops}]),
- NewS#state{known = NewKnown, synced = [Node | Synced]}.
+ {AddedNodes, S5} = add_to_known(NewNodes, S4),
+ S5#state.the_locker ! {add_to_known, AddedNodes},
+ S6 = trace_message(S5, {added, AddedNodes},
+ [{new_nodes, NewNodes}, {abcast, Known}, {ops,Ops}]),
+ S6#state{synced = [Node | Synced]}.
cancel_resolved_locker(Node, CancelFun) ->
Tag = get({sync_tag_my, Node}),
?trace({calling_cancel_locker,Tag,get()}),
S = CancelFun(Tag),
reset_node_state(Node),
- S.
+ S#state{known = maps:remove({pending, Node}, S#state.known)}.
new_nodes(Ops, ConnNode, Names_ext, Nodes, ExtraInfo, S0) ->
- Known = S0#state.known,
%% (*) This one requires some thought...
%% We're node a, other nodes b and c:
%% The problem is that {in_sync, a} may arrive before {resolved, [a]} to
%% b from c, leading to b sending {new_nodes, [a]} to us (node a).
%% Therefore, we make sure we never get duplicates in Known.
- AddedNodes = lists:delete(node(), Nodes -- Known),
+ {AddedNodes, S1} = add_to_known(Nodes, S0),
sync_others(AddedNodes),
- S = do_ops(Ops, ConnNode, Names_ext, ExtraInfo, S0),
+ S2 = do_ops(Ops, ConnNode, Names_ext, ExtraInfo, S1),
?trace({added_nodes_in_sync,{added_nodes,AddedNodes}}),
- S#state.the_locker ! {add_to_known, AddedNodes},
- S1 = trace_message(S, {added, AddedNodes}, [{ops,Ops}]),
- S1#state{known = Known ++ AddedNodes}.
+ S2#state.the_locker ! {add_to_known, AddedNodes},
+ trace_message(S2, {added, AddedNodes}, [{ops,Ops}]).
do_whereis(Name, From) ->
case is_global_lock_set() of
@@ -2070,15 +2296,195 @@ pid_locks(Ref) ->
ref_is_locking(Ref, PidRefs) ->
lists:keyfind(Ref, 2, PidRefs) =/= false.
-handle_nodedown(Node, S) ->
+handle_nodedown(Node, #state{synced = Syncs,
+ known = Known0} = S, What) ->
%% DOWN signals from monitors have removed locks and registered names.
- #state{known = Known, synced = Syncs} = S,
NewS = cancel_locker(Node, S, get({sync_tag_my, Node})),
NewS#state.the_locker ! {remove_from_known, Node},
reset_node_state(Node),
- NewS#state{known = lists:delete(Node, Known),
+ Known1 = case What of
+ remove_connection ->
+ maps:put({removing, Node}, yes, Known0);
+ ignore_node ->
+ maps:remove({removing, Node}, Known0);
+ disconnected ->
+ case maps:get({removing, Node}, Known0, no) of
+ yes ->
+ maps:remove({removing, Node}, Known0);
+ no ->
+ inform_connection_loss(Node, S),
+ Known0
+ end
+ end,
+ NewS#state{known = maps:remove(Node,
+ maps:remove({pending, Node}, Known1)),
synced = lists:delete(Node, Syncs)}.
+inform_connection_loss(Node,
+ #state{conf = #conf{connect_all = true,
+ prevent_over_part = true}} = S) ->
+ Msg = {lost_connection,
+ node(),
+ (get(creation_extension)
+ + erlang:system_info(creation)
+ - ?MIN_64BIT_SMALL_INT),
+ erlang:unique_integer([monotonic]),
+ Node},
+ gns_volatile_multicast(Msg, Node, ?pop_vsn, true, S);
+inform_connection_loss(_Node, #state{}) ->
+ ok.
+
+%%
+%% Volatile send (does not bring up connections and does not
+%% preserve signal order) of Msg to global name server at Node...
+%%
+gns_volatile_send(Node, Msg) ->
+ To = {global_name_server, Node},
+ case erlang:send(To, Msg, [nosuspend, noconnect]) of
+ ok ->
+ ok;
+ noconnect ->
+ ok;
+ nosuspend ->
+ _ = spawn(fun () ->
+ _ = erlang:send(To, Msg, [noconnect])
+ end),
+ ok
+ end.
+
+%%
+%% Volatile multicast of Msg to all global name servers on known nodes
+%% (and pending known nodes if AlsoPend is true) using protocol version
+%% MinVer or larger...
+%%
+gns_volatile_multicast(Msg, IgnoreNode, MinVer,
+ AlsoPend, #state{known = Known}) ->
+ Send = fun (Key, Node) ->
+ case maps:get(Key, Known) of
+ Ver when Ver < MinVer -> ok;
+ _Ver -> gns_volatile_send(Node, Msg)
+ end
+ end,
+ lists:foreach(fun (Node) when is_atom(Node), Node =/= IgnoreNode ->
+ Send(Node, Node);
+ ({pending, Node} = Key) when AlsoPend == true,
+ Node =/= IgnoreNode ->
+ Send(Key, Node);
+ (_) ->
+ ok
+ end, maps:keys(Known)).
+
+is_node_potentially_known(Node, #state{known = Known}) ->
+ maps:is_key(Node, Known) orelse maps:is_key({pending, Node}, Known).
+
+node_vsn(Node, #state{}) when node() == Node ->
+ ?vsn;
+node_vsn(Node, #state{known = Known}) ->
+ case maps:find(Node, Known) of
+ {ok, Ver} ->
+ Ver;
+ error ->
+ case maps:find({pending, Node}, Known) of
+ {ok, Ver} ->
+ Ver;
+ error ->
+ 0
+ end
+ end.
+
+node_list(NList) ->
+ lists:map(fun (N) when is_atom(N) ->
+ N;
+ ({N, _V}) when is_atom(N) ->
+ N
+ end, NList).
+
+
+make_node_vsn_list(NList, #state{} = S) ->
+ lists:map(fun ({N, 0}) when is_atom(N) ->
+ {N, node_vsn(N, S)};
+ (N) when is_atom(N) ->
+ {N, node_vsn(N, S)};
+ ({N, V} = NV) when is_atom(N),
+ is_integer(V) ->
+ NV
+ end, NList).
+
+mk_known_list(Vsn, #state{known = Known}) when Vsn < ?pgpv_vsn ->
+ lists:foldl(fun ({N, _V}, Ns) when is_atom(N) ->
+ [N | Ns];
+ (_, Ns) ->
+ Ns
+ end,
+ [],
+ maps:to_list(Known));
+mk_known_list(_Vsn, #state{known = Known}) ->
+ lists:foldl(fun ({N, _V} = NV, NVs) when is_atom(N) ->
+ [NV | NVs];
+ (_, Ns) ->
+ Ns
+ end,
+ [],
+ maps:to_list(Known)).
+
+add_to_known(AddKnown, #state{known = Known} = S) ->
+ Fun = fun (N, Acc) when N == node() ->
+ Acc;
+ ({N, _V}, Acc) when N == node() ->
+ Acc;
+ (N, {A, K} = Acc) when is_atom(N) ->
+ case maps:is_key(N, K) of
+ true -> Acc;
+ false -> {[N|A], maps:put(N, 0, K)}
+ end;
+ ({N, V}, {A, K} = Acc) ->
+ case maps:find(N, K) of
+ error ->
+ {[N|A], maps:put(N, V, K)};
+ {ok, NV} when NV >= 0 ->
+ Acc;
+ {ok, _UnknownVsn} ->
+ %% Update version, but don't count
+ %% it as an added node...
+ {A, maps:put(N, V, K)}
+ end
+ end,
+
+ {Added, NewKnown} = lists:foldl(Fun, {[], Known}, AddKnown),
+ {Added, S#state{known = NewKnown}}.
+
+get_lost_connection_info(LcKey) ->
+ case ets:lookup(global_lost_connections, LcKey) of
+ [{LcKey, LcValue}] ->
+ LcValue;
+ _ ->
+ {undefined, undefined, undefined}
+ end.
+
+save_lost_connection_info(LcKey, XCre, OpId, undefined) ->
+ %% Make sure we clean up old unused information about
+ %% lost connections...
+ Tmr = erlang:start_timer(?lost_conn_info_cleanup_time,
+ self(), {lost_connection, LcKey}),
+ Value = {XCre, OpId, Tmr},
+ _ = ets:insert(global_lost_connections, {LcKey, Value}),
+ ok;
+save_lost_connection_info(LcKey, XCre, OpId, OldTmr) ->
+ %% Cancel lost connection info cleanup timer for info being replaced...
+ _ = erlang:cancel_timer(OldTmr, [{async, true}, {info, false}]),
+ save_lost_connection_info(LcKey, XCre, OpId, undefined).
+
+remove_lost_connection_info({timeout, Tmr, {lost_connection, LcKey}}) ->
+ case ets:lookup(global_lost_connections, LcKey) of
+ [{LcKey, {_, _, Tmr}}] ->
+ _ = ets:delete(global_lost_connections, LcKey),
+ ok;
+ _ ->
+ ok
+ end;
+remove_lost_connection_info(_) ->
+ ok.
+
get_names() ->
ets:select(global_names,
ets:fun2ms(fun({Name, Pid, Method, _Ref}) ->
diff --git a/lib/kernel/src/global_group.erl b/lib/kernel/src/global_group.erl
index f5ead2a4c5..3e4ad1e685 100644
--- a/lib/kernel/src/global_group.erl
+++ b/lib/kernel/src/global_group.erl
@@ -925,15 +925,20 @@ handle_info({nodedown, Node}, S) when S#state.sync_state =:= no_conf ->
handle_info({nodedown, Node}, S) ->
% io:format("~p>>>>> nodedown, Node ~p ~n",[node(), Node]),
send_monitor(S#state.monitor, {nodedown, Node}, S#state.sync_state),
- global_name_server ! {nodedown, Node},
NN = lists:delete(Node, S#state.nodes),
NSE = lists:delete(Node, S#state.sync_error),
- NNC = case {lists:member(Node, get_own_nodes()),
- lists:member(Node, S#state.no_contact)} of
- {true, false} ->
- [Node | S#state.no_contact];
- _ ->
- S#state.no_contact
+ NNC = case lists:member(Node, get_own_nodes()) of
+ false ->
+ global_name_server ! {ignore_node, Node},
+ S#state.no_contact;
+ true ->
+ global_name_server ! {nodedown, Node},
+ case lists:member(Node, S#state.no_contact) of
+ false ->
+ [Node | S#state.no_contact];
+ true ->
+ S#state.no_contact
+ end
end,
{noreply, S#state{nodes = NN, no_contact = NNC, sync_error = NSE}};
@@ -950,8 +955,7 @@ handle_info({disconnect_node, Node}, S) ->
_ ->
cont
end,
- global_name_server ! {nodedown, Node}, %% nodedown is used to inform global of the
- %% disconnected node
+ global_name_server ! {ignore_node, Node},
NN = lists:delete(Node, S#state.nodes),
NNC = lists:delete(Node, S#state.no_contact),
NSE = lists:delete(Node, S#state.sync_error),
@@ -1264,7 +1268,7 @@ kill_global_group_check() ->
disconnect_nodes(DisconnectNodes) ->
lists:foreach(fun(Node) ->
{global_group, Node} ! {disconnect_node, node()},
- global:node_disconnected(Node)
+ global_name_server ! {ignore_node, Node}
end,
DisconnectNodes).
@@ -1275,7 +1279,7 @@ disconnect_nodes(DisconnectNodes) ->
force_nodedown(DisconnectNodes) ->
lists:foreach(fun(Node) ->
erlang:disconnect_node(Node),
- global:node_disconnected(Node)
+ global_name_server ! {ignore_node, Node}
end,
DisconnectNodes).
diff --git a/lib/kernel/src/kernel.app.src b/lib/kernel/src/kernel.app.src
index 35c0c3f88e..234d71f745 100644
--- a/lib/kernel/src/kernel.app.src
+++ b/lib/kernel/src/kernel.app.src
@@ -146,9 +146,10 @@
pg2]},
{applications, []},
{env, [{logger_level, notice},
- {logger_sasl_compatible, false}
+ {logger_sasl_compatible, false},
+ {prevent_overlapping_partitions, false}
]},
{mod, {kernel, []}},
- {runtime_dependencies, ["erts-10.6", "stdlib-3.5", "sasl-3.0"]}
+ {runtime_dependencies, ["erts-@OTP-17843@", "stdlib-3.5", "sasl-3.0"]}
]
}.
diff --git a/lib/kernel/test/erl_distribution_SUITE.erl b/lib/kernel/test/erl_distribution_SUITE.erl
index 6f0064f0fa..2b84a68c52 100644
--- a/lib/kernel/test/erl_distribution_SUITE.erl
+++ b/lib/kernel/test/erl_distribution_SUITE.erl
@@ -121,6 +121,11 @@ tick(Config) when is_list(Config) ->
run_dist_configs(fun tick/2, Config).
tick(DCfg, _Config) ->
+ %%
+ %% This test case uses disabled "connect all" so that
+ %% global won't interfere...
+ %%
+
%% First check that the normal case is OK!
[Name1, Name2] = get_nodenames(2, dist_test),
{ok, Node} = start_node(DCfg, Name1),
@@ -148,11 +153,11 @@ tick(DCfg, _Config) ->
%% node doesn't tick the client node within the interval ...
{ok, ServNode} = start_node(DCfg, Name2,
- "-kernel net_ticktime 100"),
+ "-kernel net_ticktime 100 -connect_all false"),
rpc:call(ServNode, erl_distribution_SUITE, tick_serv_test, [Node, node()]),
{ok, Node} = start_node(DCfg, Name1,
- "-kernel net_ticktime 12"),
+ "-kernel net_ticktime 12 -connect_all false"),
rpc:call(Node, erl_distribution_SUITE, tick_cli_test, [ServNode]),
spawn_link(erl_distribution_SUITE, keep_conn, [Node]),
@@ -585,6 +590,10 @@ tick_change(Config) when is_list(Config) ->
run_dist_configs(fun tick_change/2, Config).
tick_change(DCfg, _Config) ->
+ %%
+ %% This test case uses disabled "connect all" so that
+ %% global won't interfere...
+ %%
[BN, CN] = get_nodenames(2, tick_change),
DefaultTT = net_kernel:get_net_ticktime(),
unchanged = net_kernel:set_net_ticktime(DefaultTT, 60),
@@ -601,7 +610,7 @@ tick_change(DCfg, _Config) ->
end,
wait_until(fun () -> 10 == net_kernel:get_net_ticktime() end),
- {ok, B} = start_node(DCfg, BN, "-kernel net_ticktime 10"),
+ {ok, B} = start_node(DCfg, BN, "-kernel net_ticktime 10 -connect_all false"),
{ok, C} = start_node(DCfg, CN, "-kernel net_ticktime 10 -hidden"),
OTE = process_flag(trap_exit, true),
@@ -663,7 +672,7 @@ run_tick_change_test(DCfg, B, C, PrevTT, TT) ->
wait_for_nodedowns(Tester, Ref)
end,
- {ok, D} = start_node(DCfg, DN, "-kernel net_ticktime "
+ {ok, D} = start_node(DCfg, DN, "-connect_all false -kernel net_ticktime "
++ integer_to_list(PrevTT)),
NMA = spawn_link(fun () -> MonitorNodes([B, C, D]) end),
@@ -697,7 +706,7 @@ run_tick_change_test(DCfg, B, C, PrevTT, TT) ->
sleep(7),
change_initiated = rpc:call(C,net_kernel,set_net_ticktime,[TT,10]),
- {ok, E} = start_node(DCfg, EN, "-kernel net_ticktime "
+ {ok, E} = start_node(DCfg, EN, "-connect_all false -kernel net_ticktime "
++ integer_to_list(TT)),
NME = spawn_link(E, fun () -> MonitorNodes([node(), B, C, D]) end),
NMA2 = spawn_link(fun () -> MonitorNodes([E]) end),
diff --git a/lib/kernel/test/global_SUITE.erl b/lib/kernel/test/global_SUITE.erl
index 3837a44c64..feae9ba4db 100644
--- a/lib/kernel/test/global_SUITE.erl
+++ b/lib/kernel/test/global_SUITE.erl
@@ -40,7 +40,11 @@
both_known_1/1,
lost_unregister/1,
mass_death/1,
- garbage_messages/1]).
+ garbage_messages/1,
+ ring_line/1,
+ lost_connection/1,
+ lost_connection2/1
+ ]).
-export([global_load/3, lock_global/2, lock_global2/2]).
@@ -77,7 +81,8 @@ all() ->
simple_resolve2, simple_resolve3, leftover_name,
re_register_name, name_exit, external_nodes, many_nodes,
sync_0, global_groups_change, register_1, both_known_1,
- lost_unregister, mass_death, garbage_messages]
+ lost_unregister, mass_death, garbage_messages,
+ lost_connection, lost_connection2]
end.
groups() ->
@@ -217,6 +222,14 @@ lock_global(Parent, Config) ->
%%% to obtain a lock for 'global' on node 3, which would keep the
%%% name registry from ever becoming consistent again.
both_known_1(Config) when is_list(Config) ->
+ case prevent_overlapping_partitions() of
+ true ->
+ {skipped, "Prevent overlapping partitions enabled"};
+ false ->
+ both_known_1_test(Config)
+ end.
+
+both_known_1_test(Config) when is_list(Config) ->
Timeout = 30,
ct:timetrap({seconds,Timeout}),
init_high_level_trace(Timeout),
@@ -299,6 +312,14 @@ both_known_1(Config) when is_list(Config) ->
%% OTP-6428. An unregistered name reappears.
lost_unregister(Config) when is_list(Config) ->
+ case prevent_overlapping_partitions() of
+ true ->
+ {skipped, "Prevent overlapping partitions enabled"};
+ false ->
+ lost_unregister_test(Config)
+ end.
+
+lost_unregister_test(Config) when is_list(Config) ->
Timeout = 30,
ct:timetrap({seconds,Timeout}),
init_high_level_trace(Timeout),
@@ -339,6 +360,120 @@ lost_unregister(Config) when is_list(Config) ->
init_condition(Config),
ok.
+lost_connection(Config) when is_list(Config) ->
+ case prevent_overlapping_partitions() of
+ true ->
+ lost_connection_test(Config);
+ false ->
+ {skipped, "Prevent overlapping partitions disabled"}
+ end.
+
+lost_connection_test(Config) when is_list(Config) ->
+ %% OTP-17843: Registered names could become inconsistent due to
+ %% overlapping partitions. This has been solved by
+ %% global actively disconnecting nodes to prevent
+ %% overlapping partitions.
+ ct:timetrap({seconds, 15}),
+
+ [Cp1, Cp2] = start_nodes([cp1, cp2], peer, Config),
+
+ PartCtrlr = setup_partitions(Config, [[node(), Cp1, Cp2]]),
+
+ wait_for_ready_net(Config),
+
+ {Gurka, yes} = start_proc_basic(gurka),
+
+ check_everywhere([node(), Cp1, Cp2], gurka, Config),
+
+ Gurka = global:whereis_name(gurka),
+
+ erlang:disconnect_node(Cp2), %% lost connection previously causing issues...
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ ok = rfcall(
+ Cp2,
+ fun () ->
+ {AltGurka, yes} = start_proc_basic(gurka),
+ AltGurka = global:whereis_name(gurka),
+ ok
+ end),
+ timer:sleep(1000),
+ rpc:cast(Cp2, erlang, halt, []),
+ wait_until(fun () -> not lists:member(Cp2, nodes(hidden)) end)
+ end),
+
+ Reconnected = case lists:member(Cp1, nodes()) of
+ true ->
+ false;
+ false ->
+ erlang:display("reconnecting Cp1"),
+ pong = net_adm:ping(Cp1),
+ timer:sleep(500),
+ true
+ end,
+
+ check_everywhere([node(), Cp1], gurka, Config),
+
+ Gurka = global:whereis_name(gurka),
+
+ ok = global:unregister_name(gurka),
+
+ stop_node(Cp1),
+ stop_partition_controller(PartCtrlr),
+
+ {comment, case Reconnected of
+ true -> "Re-connected Cp1";
+ false -> "No re-connection of Cp1 needed"
+ end}.
+
+lost_connection2(Config) when is_list(Config) ->
+ case prevent_overlapping_partitions() of
+ true ->
+ lost_connection2_test(Config);
+ false ->
+ {skipped, "Prevent overlapping partitions disabled"}
+ end.
+
+lost_connection2_test(Config) when is_list(Config) ->
+ %% OTP-17843: Registered names could become inconsistent due to
+ %% overlapping partitions. This has been solved by
+ %% global actively disconnecting nodes to prevent
+ %% overlapping partitions.
+ ct:timetrap({seconds, 15}),
+
+ [Cp1, Cp2, Cp3, Cp4] = start_nodes([cp1, cp2, cp3, cp4], peer, Config),
+
+ PartCtrlr = setup_partitions(Config, [[node(), Cp1, Cp2, Cp3, Cp4]]),
+
+ wait_for_ready_net(Config),
+
+ {Gurka, yes} = start_proc_basic(gurka),
+
+ check_everywhere([node(), Cp1, Cp2], gurka, Config),
+
+ Gurka = global:whereis_name(gurka),
+
+ disconnect_nodes(PartCtrlr, Cp3, Cp4),
+
+ Nodes = nodes(),
+ true = lists:member(Cp1, Nodes),
+ true = lists:member(Cp2, Nodes),
+ false = lists:member(Cp3, Nodes),
+ false = lists:member(Cp4, Nodes),
+
+ pong = net_adm:ping(Cp3),
+ pong = net_adm:ping(Cp4),
+
+ stop_node(Cp1),
+ stop_node(Cp2),
+ stop_node(Cp3),
+ stop_node(Cp4),
+ stop_partition_controller(PartCtrlr),
+
+ ok.
+
-define(UNTIL_LOOP, 300).
-define(end_tag, 'end at').
@@ -963,12 +1098,9 @@ name_die(Config) when is_list(Config) ->
T1 = node(),
Part1 = [T1],
Part2 = [Cp1],
- rpc_cast(Cp1,
- ?MODULE, part_2_2, [Config,
- Part1,
- Part2,
- []]),
- ?UNTIL(is_ready_partition(Config)),
+
+ PartCtrlr = setup_partitions(Config, [Part1, Part2]),
+
?UNTIL(undefined =:= global:whereis_name(Name)),
yes = global:register_name(Name, Pid),
@@ -986,12 +1118,9 @@ name_die(Config) when is_list(Config) ->
KillFile = filename:join([Dir, "kill.txt"]),
file:delete(KillFile),
erlang:spawn(Cp1, fun() -> kill_pid(Pid2, KillFile, Config) end),
- rpc_cast(Cp1,
- ?MODULE, part_2_2, [Config,
- Part1,
- Part2,
- []]),
- ?UNTIL(is_ready_partition(Config)),
+
+ setup_partitions(PartCtrlr, [Part1, Part2]),
+
?UNTIL(undefined =:= global:whereis_name(Name)),
yes = global:register_name(Name, Pid2),
touch(KillFile, "kill"),
@@ -1001,6 +1130,7 @@ name_die(Config) when is_list(Config) ->
?UNTIL(OrigNames =:= global:registered_names()),
write_high_level_trace(Config),
stop_nodes(Cps),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -1023,16 +1153,26 @@ basic_partition(Config) when is_list(Config) ->
wait_for_ready_net(Config),
%% make cp2 and cp3 connected, partitioned from us and cp1
- rpc_cast(Cp2, ?MODULE, part1, [Config, node(), Cp1, Cp3]),
- ?UNTIL(is_ready_partition(Config)),
+ PCtrlr = setup_partitions(Config, [[node(), Cp1], [Cp2, Cp3]]),
%% start different processes in both partitions
{Pid, yes} = start_proc(test),
+ %% Reach into the other partition via PCtrlr...
+ ok = rfcall(
+ PCtrlr,
+ fun () ->
+ {_, yes} = rpc:call(Cp2, ?MODULE,
+ start_proc, [test2]),
+ {_, yes} = rpc:call(Cp1, ?MODULE,
+ start_proc, [test4]),
+ ok
+ end),
+
%% connect to other partition
pong = net_adm:ping(Cp2),
pong = net_adm:ping(Cp3),
- [Cp1, Cp2, Cp3] = lists:sort(nodes()),
+ wait_until(fun () -> [Cp1, Cp2, Cp3] == lists:sort(nodes()) end),
%% check names
?UNTIL(Pid =:= rpc:call(Cp2, global, whereis_name, [test])),
@@ -1059,6 +1199,7 @@ basic_partition(Config) when is_list(Config) ->
stop_node(Cp1),
stop_node(Cp2),
stop_node(Cp3),
+ stop_node(PCtrlr),
init_condition(Config),
ok.
@@ -1091,12 +1232,23 @@ basic_name_partition(Config) when is_list(Config) ->
%% cp2: register name12
%% cp3: register name03
- rpc_cast(Cp2, ?MODULE, part1_5, [Config, node(), Cp1, Cp3]),
- ?UNTIL(is_ready_partition(Config)),
+ PCtrlr = setup_partitions(Config, [[node(), Cp1], [Cp2, Cp3]]),
%% start different processes in both partitions
{_, yes} = start_proc_basic(name03),
{_, yes} = rpc:call(Cp1, ?MODULE, start_proc_basic, [name12]),
+
+ %% Reach into the other partition via PCtrlr...
+ ok = rfcall(
+ PCtrlr,
+ fun () ->
+ {_, yes} = rpc:call(Cp2, ?MODULE,
+ start_proc_basic, [name12]),
+ {_, yes} = rpc:call(Cp3, ?MODULE,
+ start_proc_basic, [name03]),
+ ok
+ end),
+
ct:sleep(1000),
%% connect to other partition
@@ -1135,6 +1287,7 @@ basic_name_partition(Config) when is_list(Config) ->
stop_node(Cp1),
stop_node(Cp2),
stop_node(Cp3),
+ stop_node(PCtrlr),
init_condition(Config),
ok.
@@ -1160,24 +1313,36 @@ advanced_partition(Config) when is_list(Config) ->
init_condition(Config),
OrigNames = global:registered_names(),
+ Parent = self(),
+
[Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6]
= start_nodes([cp0, cp1, cp2, cp3, cp4, cp5, cp6], peer, Config),
Nodes = lists:sort([node(), Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6]),
wait_for_ready_net(Config),
%% make cp3-cp6 connected, partitioned from us and cp0-cp2
- rpc_cast(Cp3, ?MODULE, part2,
- [Config, self(), node(), Cp0, Cp1, Cp2, Cp3, Cp4, Cp5,Cp6]),
- ?UNTIL(is_ready_partition(Config)),
+ PCntrlr = setup_partitions(Config, [[node(), Cp0, Cp1, Cp2], [Cp3, Cp4, Cp5, Cp6]]),
+
+ %% start processes in other partition (via partition controller)...
+ ok = rfcall(PCntrlr,
+ fun () ->
+ rfcall(Cp3,
+ fun () ->
+ start_procs(Parent, Cp4, Cp5, Cp6, Config),
+ ok
+ end)
+ end),
%% start different processes in this partition
start_procs(self(), Cp0, Cp1, Cp2, Config),
%% connect to other partition
pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
- pong = net_adm:ping(Cp6),
+
+ wait_until(fun () ->
+ CurrNodes = lists:sort(?NODES),
+ io:format("CurrNodes: ~p~n", [CurrNodes]),
+ Nodes == CurrNodes end),
wait_for_ready_net(Config),
@@ -1240,6 +1405,7 @@ advanced_partition(Config) when is_list(Config) ->
stop_node(Cp4),
stop_node(Cp5),
stop_node(Cp6),
+ stop_node(PCntrlr),
init_condition(Config),
ok.
@@ -1261,16 +1427,44 @@ stress_partition(Config) when is_list(Config) ->
init_condition(Config),
OrigNames = global:registered_names(),
- [Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6]
- = start_nodes([cp0, cp1, cp2, cp3, cp4, cp5, cp6], peer, Config),
+ Parent = self(),
+
+ [Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6a, Cp6b]
+ = start_nodes([cp0, cp1, cp2, cp3, cp4, cp5, cp6a, cp6b], peer, Config),
wait_for_ready_net(Config),
%% make cp3-cp5 connected, partitioned from us and cp0-cp2
%% cp6 is alone (single node). cp6 pings cp0 and cp3 in 12 secs...
- rpc_cast(Cp3, ?MODULE, part3,
- [Config, self(), node(), Cp0, Cp1, Cp2, Cp3, Cp4, Cp5,Cp6]),
- ?UNTIL(is_ready_partition(Config)),
+ PCntrlr = setup_partitions(Config, [[node(), Cp0, Cp1, Cp2], [Cp3, Cp4, Cp5, Cp6a], [Cp6b]]),
+
+ %% start processes in other partition (via partition controller)...
+ ok = rfcall(PCntrlr,
+ fun () ->
+ ok = rfcall(Cp3,
+ fun () ->
+ start_procs(Parent, Cp4, Cp5, Cp6a, Config),
+ ok
+ end),
+ ok = rfcall(Cp6b,
+ fun () ->
+ Pid1 = start_proc3(test1),
+ assert_pid(Pid1),
+ Pid2 = start_proc3(test3),
+ assert_pid(Pid2),
+ yes = global:register_name(
+ test1, Pid1),
+ yes = global:register_name(
+ test3, Pid2,
+ fun global:random_notify_name/3),
+ ok
+ end),
+ %% Make Cp5 crash
+ rpc:cast(Cp5, ?MODULE, crash, [12000]),
+ %% Make Cp6b alone
+ rpc:cast(Cp6b, ?MODULE, alone, [Cp0, Cp3]),
+ ok
+ end),
%% start different processes in this partition
start_procs(self(), Cp0, Cp1, Cp2, Config),
@@ -1289,17 +1483,19 @@ stress_partition(Config) when is_list(Config) ->
%% connect to other partition
pong = net_adm:ping(Cp3),
+ pong = net_adm:ping(Cp4),
rpc_cast(Cp2, ?MODULE, crash, [0]),
%% Start new nodes
{ok, Cp7} = start_peer_node(cp7, Config),
{ok, Cp2_2} = start_peer_node(cp2, Config),
- Nodes = lists:sort([node(), Cp0, Cp1, Cp2_2, Cp3, Cp4, Cp6, Cp7, Cp8]),
+ Nodes = lists:sort([node(), Cp0, Cp1, Cp2_2, Cp3, Cp4, Cp6a, Cp6b, Cp7, Cp8]),
put(?nodes_tag, Nodes),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp6),
- pong = net_adm:ping(Cp8),
+ %% We don't know how the crashes partition the net, so
+ %% we cast through all nodes so we get them all connected
+ %% again...
+ cast_line(Nodes),
wait_for_ready_net(Nodes, Config),
@@ -1332,9 +1528,11 @@ stress_partition(Config) when is_list(Config) ->
stop_node(Cp3),
stop_node(Cp4),
stop_node(Cp5),
- stop_node(Cp6),
+ stop_node(Cp6a),
+ stop_node(Cp6b),
stop_node(Cp7),
stop_node(Cp8),
+ stop_node(PCntrlr),
init_condition(Config),
ok.
@@ -1373,40 +1571,27 @@ ring(Config) when is_list(Config) ->
wait_for_ready_net(Config),
- Time = msec() + 7000,
-
- rpc_cast(Cp0, ?MODULE, single_node, [Time, Cp8, Config]),
- rpc_cast(Cp1, ?MODULE, single_node, [Time, Cp0, Config]),
- rpc_cast(Cp2, ?MODULE, single_node, [Time, Cp1, Config]),
- rpc_cast(Cp3, ?MODULE, single_node, [Time, Cp2, Config]),
- rpc_cast(Cp4, ?MODULE, single_node, [Time, Cp3, Config]),
- rpc_cast(Cp5, ?MODULE, single_node, [Time, Cp4, Config]),
- rpc_cast(Cp6, ?MODULE, single_node, [Time, Cp5, Config]),
- rpc_cast(Cp7, ?MODULE, single_node, [Time, Cp6, Config]),
- rpc_cast(Cp8, ?MODULE, single_node, [Time, Cp7, Config]),
-
- %% sleep to make the partitioned net ready
- sleep(Time - msec()),
-
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
- pong = net_adm:ping(Cp6),
- pong = net_adm:ping(Cp7),
- pong = net_adm:ping(Cp8),
+ PartCtrlr = setup_partitions(Config,
+ [[node()], [Cp0], [Cp1], [Cp2], [Cp3],
+ [Cp4], [Cp5], [Cp6], [Cp7], [Cp8]]),
+ Time = msec() + 1000,
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rpc:cast(Cp0, ?MODULE, single_node, [Time, Cp8]), % ping ourself!
+ rpc:cast(Cp1, ?MODULE, single_node, [Time, Cp0]),
+ rpc:cast(Cp2, ?MODULE, single_node, [Time, Cp1]),
+ rpc:cast(Cp3, ?MODULE, single_node, [Time, Cp2]),
+ rpc:cast(Cp4, ?MODULE, single_node, [Time, Cp3]),
+ rpc:cast(Cp5, ?MODULE, single_node, [Time, Cp4]),
+ rpc:cast(Cp6, ?MODULE, single_node, [Time, Cp5]),
+ rpc:cast(Cp7, ?MODULE, single_node, [Time, Cp6]),
+ rpc:cast(Cp8, ?MODULE, single_node, [Time, Cp7]),
+ ok
+ end),
pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
- pong = net_adm:ping(Cp6),
- pong = net_adm:ping(Cp7),
- pong = net_adm:ping(Cp8),
wait_for_ready_net(Nodes, Config),
@@ -1437,6 +1622,7 @@ ring(Config) when is_list(Config) ->
stop_node(Cp6),
stop_node(Cp7),
stop_node(Cp8),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -1459,31 +1645,24 @@ simple_ring(Config) when is_list(Config) ->
wait_for_ready_net(Config),
- Time = msec() + 5000,
-
- rpc_cast(Cp0, ?MODULE, single_node, [Time, Cp5, Config]),
- rpc_cast(Cp1, ?MODULE, single_node, [Time, Cp0, Config]),
- rpc_cast(Cp2, ?MODULE, single_node, [Time, Cp1, Config]),
- rpc_cast(Cp3, ?MODULE, single_node, [Time, Cp2, Config]),
- rpc_cast(Cp4, ?MODULE, single_node, [Time, Cp3, Config]),
- rpc_cast(Cp5, ?MODULE, single_node, [Time, Cp4, Config]),
-
- %% sleep to make the partitioned net ready
- sleep(Time - msec()),
-
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
+ PartCtrlr = setup_partitions(Config,
+ [[node()], [Cp0], [Cp1], [Cp2], [Cp3],
+ [Cp4], [Cp5]]),
+ Time = msec() + 1000,
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rpc:cast(Cp0, ?MODULE, single_node, [Time, Cp5]), % ping ourself!
+ rpc:cast(Cp1, ?MODULE, single_node, [Time, Cp0]),
+ rpc:cast(Cp2, ?MODULE, single_node, [Time, Cp1]),
+ rpc:cast(Cp3, ?MODULE, single_node, [Time, Cp2]),
+ rpc:cast(Cp4, ?MODULE, single_node, [Time, Cp3]),
+ rpc:cast(Cp5, ?MODULE, single_node, [Time, Cp4]),
+ ok
+ end),
pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
wait_for_ready_net(Nodes, Config),
@@ -1511,6 +1690,7 @@ simple_ring(Config) when is_list(Config) ->
stop_node(Cp3),
stop_node(Cp4),
stop_node(Cp5),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -1531,41 +1711,27 @@ line(Config) when is_list(Config) ->
wait_for_ready_net(Config),
- Time = msec() + 7000,
-
- rpc_cast(Cp0, ?MODULE, single_node,
- [Time, Cp0, Config]), % ping ourself!
- rpc_cast(Cp1, ?MODULE, single_node, [Time, Cp0, Config]),
- rpc_cast(Cp2, ?MODULE, single_node, [Time, Cp1, Config]),
- rpc_cast(Cp3, ?MODULE, single_node, [Time, Cp2, Config]),
- rpc_cast(Cp4, ?MODULE, single_node, [Time, Cp3, Config]),
- rpc_cast(Cp5, ?MODULE, single_node, [Time, Cp4, Config]),
- rpc_cast(Cp6, ?MODULE, single_node, [Time, Cp5, Config]),
- rpc_cast(Cp7, ?MODULE, single_node, [Time, Cp6, Config]),
- rpc_cast(Cp8, ?MODULE, single_node, [Time, Cp7, Config]),
-
- %% Sleep to make the partitioned net ready
- sleep(Time - msec()),
-
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
- pong = net_adm:ping(Cp6),
- pong = net_adm:ping(Cp7),
- pong = net_adm:ping(Cp8),
+ PartCtrlr = setup_partitions(Config,
+ [[node()], [Cp0], [Cp1], [Cp2], [Cp3],
+ [Cp4], [Cp5], [Cp6], [Cp7], [Cp8]]),
+ ThisNode = node(),
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
- pong = net_adm:ping(Cp6),
- pong = net_adm:ping(Cp7),
- pong = net_adm:ping(Cp8),
+ Time = msec() + 1000,
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rpc:cast(Cp0, ?MODULE, single_node, [Time, Cp0]), % ping ourself!
+ rpc:cast(Cp1, ?MODULE, single_node, [Time, Cp0]),
+ rpc:cast(Cp2, ?MODULE, single_node, [Time, Cp1]),
+ rpc:cast(Cp3, ?MODULE, single_node, [Time, Cp2]),
+ rpc:cast(Cp4, ?MODULE, single_node, [Time, Cp3]),
+ rpc:cast(Cp5, ?MODULE, single_node, [Time, Cp4]),
+ rpc:cast(Cp6, ?MODULE, single_node, [Time, Cp5]),
+ rpc:cast(Cp7, ?MODULE, single_node, [Time, Cp6]),
+ rpc:cast(Cp8, ?MODULE, single_node, [Time, Cp7]),
+ rpc:cast(ThisNode, ?MODULE, single_node, [Time, Cp8]),
+ ok
+ end),
wait_for_ready_net(Nodes, Config),
@@ -1596,6 +1762,7 @@ line(Config) when is_list(Config) ->
stop_node(Cp6),
stop_node(Cp7),
stop_node(Cp8),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -1618,32 +1785,24 @@ simple_line(Config) when is_list(Config) ->
wait_for_ready_net(Config),
- Time = msec() + 5000,
-
- rpc_cast(Cp0, ?MODULE, single_node,
- [Time, Cp0, Config]), % ping ourself!
- rpc_cast(Cp1, ?MODULE, single_node, [Time, Cp0, Config]),
- rpc_cast(Cp2, ?MODULE, single_node, [Time, Cp1, Config]),
- rpc_cast(Cp3, ?MODULE, single_node, [Time, Cp2, Config]),
- rpc_cast(Cp4, ?MODULE, single_node, [Time, Cp3, Config]),
- rpc_cast(Cp5, ?MODULE, single_node, [Time, Cp4, Config]),
-
- %% sleep to make the partitioned net ready
- sleep(Time - msec()),
-
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
+ PartCtrlr = setup_partitions(Config,
+ [[node()], [Cp0], [Cp1], [Cp2], [Cp3],
+ [Cp4], [Cp5]]),
+ ThisNode = node(),
- pong = net_adm:ping(Cp0),
- pong = net_adm:ping(Cp1),
- pong = net_adm:ping(Cp2),
- pong = net_adm:ping(Cp3),
- pong = net_adm:ping(Cp4),
- pong = net_adm:ping(Cp5),
+ Time = msec() + 1000,
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rpc:cast(Cp0, ?MODULE, single_node, [Time, Cp0]), % ping ourself!
+ rpc:cast(Cp1, ?MODULE, single_node, [Time, Cp0]),
+ rpc:cast(Cp2, ?MODULE, single_node, [Time, Cp1]),
+ rpc:cast(Cp3, ?MODULE, single_node, [Time, Cp2]),
+ rpc:cast(Cp4, ?MODULE, single_node, [Time, Cp3]),
+ rpc:cast(Cp5, ?MODULE, single_node, [Time, Cp4]),
+ rpc:cast(ThisNode, ?MODULE, single_node, [Time, Cp5]),
+ ok
+ end),
wait_for_ready_net(Nodes, Config),
@@ -1671,6 +1830,7 @@ simple_line(Config) when is_list(Config) ->
stop_node(Cp3),
stop_node(Cp4),
stop_node(Cp5),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -1756,9 +1916,9 @@ otp_1849(Config) when is_list(Config) ->
%% Test ticket: Deadlock in global.
otp_3162(Config) when is_list(Config) ->
StartFun = fun() ->
- {ok, Cp1} = start_node(cp1, Config),
- {ok, Cp2} = start_node(cp2, Config),
- {ok, Cp3} = start_node(cp3, Config),
+ {ok, Cp1} = start_node(cp1, peer, Config),
+ {ok, Cp2} = start_node(cp2, peer, Config),
+ {ok, Cp3} = start_node(cp3, peer, Config),
[Cp1, Cp2, Cp3]
end,
do_otp_3162(StartFun, Config).
@@ -1772,6 +1932,10 @@ do_otp_3162(StartFun, Config) ->
wait_for_ready_net(Config),
+ ThisNode = node(),
+
+ PartCntrlr = setup_partitions(Config, [[ThisNode, Cp1, Cp2, Cp3]]),
+
%% start procs on each node
Pid1 = rpc:call(Cp1, ?MODULE, start_proc4, [kalle]),
assert_pid(Pid1),
@@ -1780,40 +1944,22 @@ do_otp_3162(StartFun, Config) ->
Pid3 = rpc:call(Cp3, ?MODULE, start_proc4, [vera]),
assert_pid(Pid3),
- rpc_disconnect_node(Cp1, Cp2, Config),
-
- ?UNTIL
- ([Cp3] =:= lists:sort(rpc:call(Cp1, erlang, nodes, [])) -- [node()]),
-
- ?UNTIL([kalle, vera] =:=
- lists:sort(rpc:call(Cp1, global, registered_names, []))),
- ?UNTIL
- ([Cp3] =:= lists:sort(rpc:call(Cp2, erlang, nodes, [])) -- [node()]),
- ?UNTIL([stina, vera] =:=
- lists:sort(rpc:call(Cp2, global, registered_names, []))),
- ?UNTIL
- ([Cp1, Cp2] =:=
- lists:sort(rpc:call(Cp3, erlang, nodes, [])) -- [node()]),
- ?UNTIL([kalle, stina, vera] =:=
- lists:sort(rpc:call(Cp3, global, registered_names, []))),
-
- pong = rpc:call(Cp2, net_adm, ping, [Cp1]),
-
- ?UNTIL
- ([Cp2, Cp3] =:=
- lists:sort(rpc:call(Cp1, erlang, nodes, [])) -- [node()]),
- ?UNTIL(begin
- NN = lists:sort(rpc:call(Cp1, global, registered_names, [])),
- [kalle, stina, vera] =:= NN
- end),
- ?UNTIL
- ([Cp1, Cp3] =:=
- lists:sort(rpc:call(Cp2, erlang, nodes, [])) -- [node()]),
- ?UNTIL([kalle, stina, vera] =:=
- lists:sort(rpc:call(Cp2, global, registered_names, []))),
- ?UNTIL
- ([Cp1, Cp2] =:=
- lists:sort(rpc:call(Cp3, erlang, nodes, [])) -- [node()]),
+ disconnect_nodes(PartCntrlr, Cp1, Cp2),
+    %% Nowadays we do not know how the net has been partitioned after the
+    %% disconnect, so we cannot perform all the tests this testcase originally
+    %% did. Instead we only test that the end result is ok after we have
+    %% reconnected all nodes.
+
+ ok = rfcall(PartCntrlr,
+ fun () ->
+ rpc:cast(Cp1, net_kernel, connect_node, [Cp3]),
+ rpc:cast(Cp2, net_kernel, connect_node, [Cp3]),
+ rpc:cast(ThisNode, net_kernel, connect_node, [Cp3]),
+ ok
+ end),
+
+ ?UNTIL(lists:sort([ThisNode, Cp1, Cp2]) =:=
+ lists:sort(rpc:call(Cp3, erlang, nodes, []))),
?UNTIL([kalle, stina, vera] =:=
lists:sort(rpc:call(Cp3, global, registered_names, []))),
@@ -1821,6 +1967,7 @@ do_otp_3162(StartFun, Config) ->
stop_node(Cp1),
stop_node(Cp2),
stop_node(Cp3),
+ stop_partition_controller(PartCntrlr),
init_condition(Config),
ok.
@@ -1949,6 +2096,8 @@ simple_disconnect(Config) when is_list(Config) ->
[Cp1, Cp2] = Cps = start_nodes([n_1, n_2], peer, Config),
wait_for_ready_net(Config),
+ PartCtrlr = start_partition_controller(Config),
+
Nodes = lists:sort([node() | Cps]),
lists:foreach(fun(N) -> rpc:call(N, ?MODULE, start_tracer, []) end,Nodes),
@@ -1962,16 +2111,21 @@ simple_disconnect(Config) when is_list(Config) ->
ct:sleep(100),
%% Disconnect test_server and Cp2.
- true = erlang:disconnect_node(Cp2),
- ct:sleep(500),
+ disconnect_nodes(PartCtrlr, node(), Cp2),
+    %% We might have been disconnected from Cp1 as well. Ensure we are
+    %% connected to Cp1...
+ pong = net_adm:ping(Cp1),
%% _Pid is registered on Cp1. The exchange of names between Cp2 and
%% test_server sees two identical pids.
pong = net_adm:ping(PingNode),
+
?UNTIL(Cps =:= lists:sort(nodes())),
{_, Trace0} = collect_tracers(Nodes),
Resolvers = [P || {_Node,new_resolver,{pid,P}} <- Trace0],
+ check_everywhere(Nodes, Name, Config),
+ true = undefined /= global:whereis_name(Name),
lists:foreach(fun(P) -> P ! die end, Resolvers),
lists:foreach(fun(P) -> wait_for_exit(P) end, Resolvers),
check_everywhere(Nodes, Name, Config),
@@ -1989,30 +2143,10 @@ simple_disconnect(Config) when is_list(Config) ->
OrigNames = global:registered_names(),
write_high_level_trace(Config),
stop_nodes(Cps),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
-%% Not used right now.
-simple_dis(Nodes0, Name, Resolver, Config) ->
- Nodes = [node() | Nodes0],
- NN = lists:zip(Nodes, lists:seq(1, length(Nodes))),
- [{_Node,Other} | Dis] =
- [{N,[N1 || {N1,I1} <- NN, I1 > I + 1]} || {N,I} <- NN],
- lists:foreach(
- fun({Node, DisNodes}) ->
- Args = [Node, DisNodes, Name, Resolver],
- ok = rpc:call(Node, ?MODULE, simple_dis_node, Args)
- end, Dis),
- ok = simple_dis_node(node(), Other, Name, Resolver, Config).
-
-simple_dis_node(_Node, DisNodes, _Name, _Resolver, Config) ->
- lists:foreach(
- fun(OtherNode) -> _ = erlang:disconnect_node(OtherNode) end, DisNodes),
- ?UNTIL(DisNodes -- nodes() =:= DisNodes),
- ok.
-
-
-
%%%-----------------------------------------------------------------
%%% Testing resolve of name. Many combinations with four nodes.
%%%-----------------------------------------------------------------
@@ -2023,7 +2157,8 @@ simple_dis_node(_Node, DisNodes, _Name, _Resolver, Config) ->
n2, % node starting registered process in partition 2
nodes, % nodes expected to exist after ping
n_res, % expected number of resolvers after ping
- config
+ config,
+ ctrlr
}).
-define(RES(F), {F, fun ?MODULE:F/3}).
@@ -2040,6 +2175,8 @@ simple_resolve(Config) when is_list(Config) ->
Nodes = lists:sort([node() | Cps]),
wait_for_ready_net(Config),
+ PartCtrlr = start_partition_controller(Config),
+
lists:foreach(fun(N) ->
rpc:call(N, ?MODULE, start_tracer, [])
end, Nodes),
@@ -2049,7 +2186,8 @@ simple_resolve(Config) when is_list(Config) ->
%% name 'link' remains...
Cf = #cf{link = none, ping = A2, n1 = node(), n2 = A2,
- nodes = [node(), N1, A2, Z2], n_res = 2, config = Config},
+ nodes = [node(), N1, A2, Z2], n_res = 2, config = Config,
+ ctrlr = PartCtrlr},
%% There is no test with a resolver that deletes a pid (like
%% global_exit_name does). The resulting DOWN signal just clears
@@ -2157,6 +2295,7 @@ simple_resolve(Config) when is_list(Config) ->
OrigNames = global:registered_names(),
write_high_level_trace(Config),
stop_nodes(Cps),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -2175,12 +2314,15 @@ simple_resolve2(Config) when is_list(Config) ->
wait_for_ready_net(Config),
Nodes = lists:sort([node() | Cps]),
+ PartCtrlr = start_partition_controller(Config),
+
lists:foreach(fun(N) ->
rpc:call(N, ?MODULE, start_tracer, [])
end, Nodes),
Cf = #cf{link = none, ping = A2, n1 = node(), n2 = A2,
- nodes = [node(), N1, A2, Z2], n_res = 2, config = Config},
+ nodes = [node(), N1, A2, Z2], n_res = 2, config = Config,
+ ctrlr = PartCtrlr},
%% Halt z_2.
res(?RES(halt_second), Cps, Cf#cf{link = N1, n1 = N1, n2 = Z2, ping = A2,
@@ -2193,6 +2335,7 @@ simple_resolve2(Config) when is_list(Config) ->
OrigNames = global:registered_names(),
write_high_level_trace(Config),
stop_nodes(Cps), % Not all nodes may be present, but it works anyway.
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -2210,12 +2353,15 @@ simple_resolve3(Config) when is_list(Config) ->
wait_for_ready_net(Config),
Nodes = lists:sort([node() | Cps]),
+ PartCtrlr = start_partition_controller(Config),
+
lists:foreach(fun(N) ->
rpc:call(N, ?MODULE, start_tracer, [])
end, Nodes),
Cf = #cf{link = none, ping = A2, n1 = node(), n2 = A2,
- nodes = [node(), N1, A2, Z2], n_res = 2, config = Config},
+ nodes = [node(), N1, A2, Z2], n_res = 2, config = Config,
+ ctrlr = PartCtrlr},
%% Halt a_2.
res(?RES(halt_second), Cps, Cf#cf{link = node(), n2 = A2,
@@ -2228,13 +2374,14 @@ simple_resolve3(Config) when is_list(Config) ->
OrigNames = global:registered_names(),
write_high_level_trace(Config),
stop_nodes(Cps), % Not all nodes may be present, but it works anyway.
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
res({Res,Resolver}, [N1, A2, Z2], Cf) ->
%% Note: there are no links anymore, but monitors.
#cf{link = LinkedNode, ping = PingNode, n1 = Res1, n2 = OtherNode,
- nodes = Nodes0, n_res = NRes, config = Config} = Cf,
+ nodes = Nodes0, n_res = NRes, config = Config, ctrlr = PartCtrlr} = Cf,
io:format("~n~nResolver: ~p", [Res]),
io:format(" Registered on partition 1: ~p", [Res1]),
io:format(" Registered on partition 2: ~p", [OtherNode]),
@@ -2252,11 +2399,20 @@ res({Res,Resolver}, [N1, A2, Z2], Cf) ->
%% expected monitors remain between registered processes and the
%% global_name_server.
- rpc_cast(OtherNode,
- ?MODULE,
- part_2_2,
- [Config, Part1, Part2, [{Name, Resolver}]]),
- ?UNTIL(is_ready_partition(Config)),
+ setup_partitions(PartCtrlr, [Part1, Part2]),
+
+ ok = rfcall(PartCtrlr,
+ fun () ->
+ rfcall(OtherNode,
+ fun () ->
+ {Pid2, yes} = start_resolver(Name,
+ Resolver),
+ trace_message({node(), part_2_2,
+ nodes(), {pid2,Pid2}}),
+ ok
+ end)
+ end),
+
{_Pid1, yes} =
rpc:call(Res1, ?MODULE, start_resolver, [Name, Resolver]),
@@ -2319,15 +2475,6 @@ monitored_by_node(Trace, Servers) ->
M <- ML,
lists:member(M, Servers)]).
-%% Runs on a node in Part2
-part_2_2(Config, Part1, Part2, NameResolvers) ->
- make_partition(Config, Part1, Part2),
- lists:foreach
- (fun({Name, Resolver}) ->
- {Pid2, yes} = start_resolver(Name, Resolver),
- trace_message({node(), part_2_2, nodes(), {pid2,Pid2}})
- end, NameResolvers).
-
resolve_first(name, Pid1, _Pid2) ->
Pid1.
@@ -2398,7 +2545,7 @@ mon_by_servers(Proc) ->
{monitored_by, ML} = process_info(Proc, monitored_by),
{monitors_2levels,
lists:append([ML |
- [begin
+ [begin
{monitored_by, MML} = rpc:call(node(M),
erlang,
process_info,
@@ -2433,13 +2580,23 @@ leftover_name(Config) when is_list(Config) ->
Part2 = [A2, Z2],
NoResolver = {no_module, resolve_none},
Resolver = fun contact_a_2/3,
- rpc_cast(A2,
- ?MODULE, part_2_2, [Config,
- Part1,
- Part2,
- [{Name, NoResolver},
- {ResName, Resolver}]]),
- ?UNTIL(is_ready_partition(Config)),
+
+ PartCtrlr = setup_partitions(Config, [Part1, Part2]),
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rfcall(
+ A2,
+ fun () ->
+ lists:foreach(
+ fun({TheName, TheResolver}) ->
+ {Pid2, yes} = start_resolver(TheName, TheResolver),
+ trace_message({node(), part_2_2, nodes(), {pid2,Pid2}})
+ end, [{Name, NoResolver}, {ResName, Resolver}]),
+ ok
+ end)
+ end),
%% resolved_name is resolved to run on a_2, an insert operation is
%% sent to n_1. The resolver function halts a_2, but the nodedown
@@ -2471,6 +2628,7 @@ leftover_name(Config) when is_list(Config) ->
?UNTIL(OrigNames =:= global:registered_names()),
write_high_level_trace(Config),
stop_nodes(Cps),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
ok.
@@ -2636,9 +2794,22 @@ external_nodes(Config) when is_list(Config) ->
%% Two partitions: [test_server] and [b, c].
%% c registers an external name on b
- rpc_cast(NodeB, ?MODULE, part_ext,
- [Config, node(), NodeC, Name]),
- ?UNTIL(is_ready_partition(Config)),
+
+ PartCtrlr = setup_partitions(Config, [[node()], [NodeB, NodeC]]),
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rfcall(
+ NodeB,
+ fun () ->
+ Pid = spawn(NodeC, fun() -> cnode_proc(NodeB) end),
+ Pid ! {register, self(), Name},
+ receive {Pid, Reply} -> yes = Reply end,
+ rpc:call(NodeC, erlang, register, [Name, Pid]),
+ ok
+ end)
+ end),
pong = net_adm:ping(NodeB),
?UNTIL([NodeB, NodeC] =:= lists:sort(nodes())),
@@ -2702,6 +2873,7 @@ external_nodes(Config) when is_list(Config) ->
?UNTIL(length(get_ext_names()) =:= 1),
stop_node(NodeC),
+ stop_partition_controller(PartCtrlr),
?UNTIL(length(get_ext_names()) =:= 0),
init_condition(Config),
@@ -2710,15 +2882,6 @@ external_nodes(Config) when is_list(Config) ->
get_ext_names() ->
gen_server:call(global_name_server, get_names_ext, infinity).
-%% Runs at B
-part_ext(Config, Main, C, Name) ->
- make_partition(Config, [Main], [node(), C]),
- ThisNode = node(),
- Pid = erlang:spawn(C, fun() -> cnode_proc(ThisNode) end),
- Pid ! {register, self(), Name},
- receive {Pid, Reply} -> yes = Reply end,
- rpc:call(C, erlang, register, [Name, Pid]).
-
cnode_links(Pid) ->
Pid ! {links, self()},
receive
@@ -2776,25 +2939,29 @@ many_nodes(Config) when is_list(Config) ->
Nodes = lists:sort(?NODES),
wait_for_ready_net(Nodes, Config),
- Dir = proplists:get_value(priv_dir, Config),
- GoFile = filename:join([Dir, "go.txt"]),
- file:delete(GoFile),
-
- CpsFiles = [{N, filename:join([Dir, atom_to_list(N)++".node"])} ||
- N <- Cps],
- IsoFun =
- fun({N, File}) ->
- file:delete(File),
- rpc_cast(N, ?MODULE, isolated_node, [File, GoFile, Cps, Config])
- end,
- lists:foreach(IsoFun, CpsFiles),
+    %% All nodes isolated, i.e., not connected to any other (visible) nodes...
+ Partitions = [[node()] | lists:map(fun (Node) -> [Node] end, Cps)],
+
+ PartCtrlr = setup_partitions(Config, Partitions),
- all_nodes_files(CpsFiles, "isolated", Config),
Time = msec(),
- sync_until(),
- erlang:display(ready_to_go),
- touch(GoFile, "go"),
- all_nodes_files(CpsFiles, "done", Config),
+
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ OkRes = {lists:map(fun (_) -> ok end, Cps), []},
+ OkRes = rfmulticall(
+ Cps,
+ fun () ->
+ lists:foreach(fun(N) ->
+ _ = net_adm:ping(N)
+ end, shuffle(Cps)),
+ ?UNTIL((Cps -- get_known(node())) =:= []),
+ ok
+ end),
+ ok
+ end),
+
Time2 = msec(),
lists:foreach(fun(N) -> pong = net_adm:ping(N) end, Cps),
@@ -2803,12 +2970,10 @@ many_nodes(Config) when is_list(Config) ->
write_high_level_trace(Config), % The test succeeded, but was it slow?
- lists:foreach(fun({_N, File}) -> file:delete(File) end, CpsFiles),
- file:delete(GoFile),
-
?UNTIL(OrigNames =:= global:registered_names()),
write_high_level_trace(Config),
stop_nodes(Cps),
+ stop_partition_controller(PartCtrlr),
init_condition(Config),
Diff = Time2 - Time,
Return = lists:flatten(io_lib:format("~w nodes took ~w ms",
@@ -2831,29 +2996,6 @@ node_rel(From, To) ->
1 -> Last
end} || N <- NodeNumbers].
-isolated_node(File, GoFile, Nodes, Config) ->
- Ns = lists:sort(Nodes),
- exit(erlang:whereis(user), kill),
- touch(File, "start_isolated"),
- NodesList = nodes(),
- append_to_file(File, [{nodes,Nodes},{nodes_list,NodesList}]),
- Replies =
- lists:map(fun(N) -> _ = erlang:disconnect_node(N) end, NodesList),
- append_to_file(File, {replies,Replies}),
- ?UNTIL(begin
- Known = get_known(node()),
- append_to_file(File, {known,Known}),
- Known =:= [node()]
- end),
- touch(File, "isolated"),
- sync_until(File),
- file_contents(GoFile, "go", Config, File),
- touch(File, "got_go"),
- lists:foreach(fun(N) -> _ = net_adm:ping(N) end, shuffle(Nodes)),
- touch(File, "pinged"),
- ?UNTIL((Ns -- get_known(node())) =:= []),
- touch(File, "done").
-
touch(File, List) ->
ok = file:write_file(File, list_to_binary(List)).
@@ -2862,11 +3004,6 @@ append_to_file(File, Term) ->
ok = file:write(Fd, io_lib:format("~p.~n", [Term])),
ok = file:close(Fd).
-all_nodes_files(CpsFiles, ContentsList, Config) ->
- lists:all(fun({_N,File}) ->
- file_contents(File, ContentsList, Config)
- end, CpsFiles).
-
file_contents(File, ContentsList, Config) ->
file_contents(File, ContentsList, Config, no_log_file).
@@ -2890,14 +3027,6 @@ file_contents(File, ContentsList, Config, LogFile) ->
end
end).
-sync_until() ->
- sync_until(no_log_file).
-
-sync_until(LogFile) ->
- Time = ?UNTIL_LOOP - (msec(now()) rem ?UNTIL_LOOP),
- catch append_to_file(LogFile, {sync_until, Time}),
- timer:sleep(Time).
-
shuffle(L) ->
[E || {_, E} <- lists:keysort(1, [{rand:uniform(), E} || E <- L])].
@@ -3378,109 +3507,11 @@ from(_H, []) -> [].
other(A, [A, _B]) -> A;
other(_, [_A, B]) -> B.
-
-%% this one runs at cp2
-part1(Config, Main, Cp1, Cp3) ->
- case catch begin
- make_partition(Config, [Main, Cp1], [node(), Cp3]),
- {_Pid, yes} = start_proc(test2),
- {_Pid2, yes} = start_proc(test4)
- end of
- {_, yes} -> ok; % w("ok", []);
- {'EXIT', _R} ->
- ok
- %% w("global_SUITE line:~w: ~p", [?LINE, _R])
- end.
-
-%% Runs at Cp2
-part1_5(Config, Main, Cp1, Cp3) ->
- case catch begin
- make_partition(Config, [Main, Cp1], [node(), Cp3]),
- {_Pid1, yes} = start_proc_basic(name12),
- {_Pid2, yes} =
- rpc:call(Cp3, ?MODULE, start_proc_basic, [name03])
- end of
- {_, yes} -> ok; % w("ok", []);
- {'EXIT', _R} ->
- ok
- %% w("global_SUITE line:~w: ~p", [?LINE, _R])
- end.
-
w(X,Y) ->
{ok, F} = file:open("cp2.log", [write]),
io:format(F, X, Y),
file:close(F).
-%% this one runs on one node in Part2
-%% The partition is ready when is_ready_partition(Config) returns (true).
-make_partition(Config, Part1, Part2) ->
- Dir = proplists:get_value(priv_dir, Config),
- Ns = [begin
- Name = lists:concat([atom_to_list(N),"_",msec(),".part"]),
- File = filename:join([Dir, Name]),
- file:delete(File),
- rpc_cast(N, ?MODULE, mk_part_node, [File, Part, Config], File),
- {N, File}
- end || Part <- [Part1, Part2], N <- Part],
- all_nodes_files(Ns, "done", Config),
- lists:foreach(fun({_N,File}) -> file:delete(File) end, Ns),
- PartFile = make_partition_file(Config),
- touch(PartFile, "done").
-
-%% The node signals its success by touching a file.
-mk_part_node(File, MyPart0, Config) ->
- touch(File, "start"), % debug
- MyPart = lists:sort(MyPart0),
- ?UNTIL(is_node_in_part(File, MyPart)),
- touch(File, "done").
-
-%% The calls to append_to_file are for debugging.
-is_node_in_part(File, MyPart) ->
- lists:foreach(fun(N) ->
- _ = erlang:disconnect_node(N)
- end, nodes() -- MyPart),
- case {(Known = get_known(node())) =:= MyPart,
- (Nodes = lists:sort([node() | nodes()])) =:= MyPart} of
- {true, true} ->
- %% Make sure the resolvers have been terminated,
- %% otherwise they may pop up and send some message.
- %% (This check is probably unnecessary.)
- case element(5, global:info()) of
- [] ->
- true;
- Rs ->
- erlang:display({is_node_in_part, resolvers, Rs}),
- trace_message({node(), is_node_in_part, Rs}),
- append_to_file(File, {now(), Known, Nodes, Rs}),
- false
- end;
- _ ->
- append_to_file(File, {now(), Known, Nodes}),
- false
- end.
-
-is_ready_partition(Config) ->
- File = make_partition_file(Config),
- file_contents(File, "done", Config),
- file:delete(File),
- true.
-
-make_partition_file(Config) ->
- Dir = proplists:get_value(priv_dir, Config),
- filename:join([Dir, atom_to_list(make_partition_done)]).
-
-%% this one runs at cp3
-part2(Config, Parent, Main, Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6) ->
- make_partition(Config, [Main, Cp0, Cp1, Cp2], [Cp3, Cp4, Cp5, Cp6]),
- start_procs(Parent, Cp4, Cp5, Cp6, Config).
-
-part3(Config, Parent, Main, Cp0, Cp1, Cp2, Cp3, Cp4, Cp5, Cp6) ->
- make_partition(Config, [Main, Cp0, Cp1, Cp2], [Cp3, Cp4, Cp5, Cp6]),
- start_procs(Parent, Cp4, Cp5, Cp6, Config),
- %% Make Cp6 alone
- rpc_cast(Cp5, ?MODULE, crash, [12000]),
- rpc_cast(Cp6, ?MODULE, alone, [Cp0, Cp3]).
-
start_procs(Parent, N1, N2, N3, Config) ->
S1 = lists:sort([N1, N2, N3]),
?UNTIL(begin
@@ -3575,13 +3606,11 @@ init_proc_basic(Parent, Name) ->
Parent ! {self(),X},
loop().
-single_node(Time, Node, Config) ->
- exit(erlang:whereis(user), kill),
- lists:foreach(fun(N) -> _ = erlang:disconnect_node(N) end, nodes()),
- ?UNTIL(get_known(node()) =:= [node()]),
+single_node(Time, Node) ->
spawn(?MODULE, init_2, []),
- sleep(Time - msec()),
- net_adm:ping(Node).
+ timer:sleep(Time - msec()),
+ _ = net_adm:ping(Node),
+ ok.
init_2() ->
register(single_name, self()),
@@ -3640,8 +3669,6 @@ sreq(Pid, Msg) ->
receive {Ref, X} -> X end.
alone(N1, N2) ->
- lists:foreach(fun(Node) -> true = erlang:disconnect_node(Node) end,
- nodes()),
ct:sleep(12000),
net_adm:ping(N1),
net_adm:ping(N2),
@@ -4173,16 +4200,150 @@ rpc_cast(Node, Module, Function, Args) ->
{_,pong,Node}= {node(),net_adm:ping(Node),Node},
rpc:cast(Node, Module, Function, Args).
-rpc_cast(Node, Module, Function, Args, File) ->
- case net_adm:ping(Node) of
- pong ->
- rpc:cast(Node, Module, Function, Args);
- Else ->
- append_to_file(File, {now(), {rpc_cast, Node, Module, Function,
- Args, Else}})
- %% Maybe we should crash, but it probably doesn't matter.
+global_known(Node) ->
+ gen_server:call({global_name_server, Node}, get_known, infinity).
+
+global_known() ->
+ global_known(node()).
+
+disconnect(Node) ->
+ erlang:disconnect_node(Node),
+ wait_until(fun () -> not lists:member(Node, global_known()) end),
+ ok.
+
+disconnect_nodes(HiddenCtrlNode, NodeA, NodeB) ->
+ Nodes = [node()|nodes()],
+ ok = rfcall(HiddenCtrlNode,
+ fun () ->
+ ok = rfcall(NodeA,
+ fun () ->
+ disconnect(NodeB)
+ end),
+ ok = rfcall(NodeB,
+ fun () ->
+ disconnect(NodeA)
+ end),
+ %% Try to ensure 'lost_connection' messages
+ %% have been handled (see comment in
+ %% create_partitions)...
+ lists:foreach(fun (N) ->
+ _ = global_known(N)
+ end, Nodes),
+ ok
+ end).
+
+create_partitions(PartitionsList) ->
+ AllNodes = lists:sort(lists:flatten(PartitionsList)),
+
+ %% Take down all connections on all nodes...
+ AllOk = {lists:map(fun (_) -> ok end, AllNodes), []},
+ io:format("Disconnecting all nodes from eachother...", []),
+ AllOk = rfmulticall(
+ AllNodes,
+ fun () ->
+ lists:foreach(fun (N) ->
+ erlang:disconnect_node(N)
+ end, nodes()),
+ wait_until(fun () -> [] == global_known() end),
+ ok
+ end, 5000),
+ %% Here we know that all 'lost_connection' messages that will be
+ %% sent by global name servers due to these disconnects have been
+ %% sent, but we don't know that all of them have been received and
+ %% handled. By communicating with all global name servers one more
+ %% time it is very likely that all of them have been received and
+ %% handled (however, not guaranteed). If 'lost_connection' messages
+ %% are received after we begin to set up the partitions, the
+    %% partitions may lose connection with some of their nodes.
+ lists:foreach(fun (N) -> [] = global_known(N) end, AllNodes),
+
+ %% Set up fully connected partitions...
+ io:format("Connecting partitions...", []),
+ lists:foreach(
+ fun (Partition) ->
+ Part = lists:sort(Partition),
+ PartOk = {lists:map(fun (_) -> ok end, Part), []},
+ PartOk = rfmulticall(
+ Part,
+ fun () ->
+ wait_until(
+ fun () ->
+ ConnNodes = Part -- [node()|nodes()],
+ if ConnNodes == [] ->
+ true;
+ true ->
+ lists:foreach(
+ fun (N) ->
+ net_kernel:connect_node(N)
+ end, ConnNodes),
+ false
+ end
+ end),
+ ok
+
+ end, 5000)
+ end, PartitionsList),
+ ok.
+
+setup_partitions(PartCtrlr, PartList) when is_atom(PartCtrlr) ->
+ ok = rfcall(PartCtrlr, fun () -> create_partitions(PartList) end),
+ io:format("Partitions successfully setup:~n", []),
+ lists:foreach(fun (Part) ->
+ io:format("~p~n", [Part])
+ end,
+ PartList),
+ ok;
+setup_partitions(Config, PartList) when is_list(Config) ->
+ PartCtrlr = start_partition_controller(Config),
+ setup_partitions(PartCtrlr, PartList),
+ PartCtrlr.
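%% A minimal usage sketch of the partition helpers above (Cp1, Cp2 and Cp3
%% are hypothetical node names, assumed to have been started earlier in a
%% testcase with start_nodes/3):
%%
%%   partition_example(Config, Cp1, Cp2, Cp3) ->
%%       PartCtrlr = setup_partitions(Config, [[node(), Cp1], [Cp2, Cp3]]),
%%       %% The hidden controller can reach both partitions, so test code
%%       %% that must run "on the other side" is funneled through it...
%%       ok = rfcall(PartCtrlr,
%%                   fun () ->
%%                           ok = rfcall(Cp2, fun () -> ok end)
%%                   end),
%%       stop_partition_controller(PartCtrlr).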
+
+start_partition_controller(Config) when is_list(Config) ->
+ {ok, PartCtrlr} = start_hidden_node(part_ctrlr, Config),
+ PartCtrlr.
+
+stop_partition_controller(PartCtrlr) ->
+ stop_node(PartCtrlr).
+
+prevent_overlapping_partitions() ->
+ case application:get_env(kernel, prevent_overlapping_partitions) of
+ {ok, true} ->
+ true;
+ _ ->
+ false
+ end.
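%% For reference, the kernel parameter checked above can, for example, be
%% set when starting a node (a sketch; the exact invocation depends on the
%% test setup):
%%
%%   erl -kernel prevent_overlapping_partitions true
%%
%% and is read here via the application environment:
%%
%%   application:get_env(kernel, prevent_overlapping_partitions).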
+
+cast_line([]) ->
+ ok;
+cast_line([N|Ns]) when N == node() ->
+ cast_line(Ns);
+cast_line([N|Ns]) ->
+ rfcast(N, fun () -> cast_line(Ns) end).
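%% A small sketch of what cast_line/1 is used for after the crashes in
%% stress_partition/1: each node casts further down a (hypothetical) node
%% list, so consecutive nodes reconnect and the net eventually becomes
%% fully connected again:
%%
%%   reconnect_example(N1, N2, N3) ->
%%       cast_line([node(), N1, N2, N3]).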
+
+wait_until(F) ->
+ case catch F() of
+ true ->
+ ok;
+ _ ->
+ receive after 10 -> ok end,
+ wait_until(F)
end.
+rfcall(Node, Fun) ->
+ rfcall(Node, Fun, infinity).
+
+rfcall(Node, Fun, Timeout) ->
+ rpc:call(Node, erlang, apply, [Fun, []], Timeout).
+
+rfcast(Node, Fun) ->
+ rpc:cast(Node, erlang, apply, [Fun, []]).
+
+rfmulticall(Nodes, Fun) ->
+ rfmulticall(Nodes, Fun, infinity).
+
+rfmulticall(Nodes, Fun, Timeout) ->
+ rpc:multicall(Nodes, erlang, apply, [Fun, []], Timeout).
+
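%% Minimal usage sketches for the rf* wrappers above (node names are
%% hypothetical):
%%
%%   ok             = rfcall(Node, fun () -> ok end),
%%   true           = rfcast(Node, fun () -> ok end),
%%   {[ok, ok], []} = rfmulticall([NodeA, NodeB], fun () -> ok end, 5000).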
%% The emulator now ensures that the node has been removed from
%% nodes().
rpc_disconnect_node(Node, DisconnectedNode, Config) ->
diff --git a/lib/kernel/test/pg2_SUITE.erl b/lib/kernel/test/pg2_SUITE.erl
index acecd34ead..1241fe586b 100644
--- a/lib/kernel/test/pg2_SUITE.erl
+++ b/lib/kernel/test/pg2_SUITE.erl
@@ -37,8 +37,7 @@
-define(testcase, proplists:get_value(?TESTCASE, Config)).
%% Internal export.
--export([mk_part_node_and_group/3, part2/4,
- mk_part_node/3, part1/5, p_init/3, start_proc/1, sane/0]).
+-export([p_init/3, start_proc/1, sane/0]).
init_per_testcase(Case, Config) ->
[{?TESTCASE, Case}| Config].
@@ -101,9 +100,19 @@ otp_8653(Config) when is_list(Config) ->
wait_for_ready_net(Config),
+ G = pg2_otp_8653,
+
+ {[ok,ok,ok,ok], []}
+ = rfmulticall([node(), A, B, C],
+ fun () ->
+ Pid = spawn(forever()),
+ ok = pg2:create(G),
+ _ = [ok = pg2:join(G, Pid) || _ <- [1,1]],
+ ok
+ end),
+
%% make b and c connected, partitioned from node() and a
- rpc_cast(B, ?MODULE, part2, [Config, node(), A, C]),
- ?UNTIL(is_ready_partition(Config)),
+ PartCtrlr = setup_partitions(Config, [[B, C], [node(), A]]),
%% Connect to the other partition.
pong = net_adm:ping(B),
@@ -112,7 +121,6 @@ otp_8653(Config) when is_list(Config) ->
_ = global:sync(),
[A, B, C] = lists:sort(nodes()),
- G = pg2_otp_8653,
?UNTIL(begin
GA = lists:sort(rpc:call(A, pg2, get_members, [G])),
GB = lists:sort(rpc:call(B, pg2, get_members, [G])),
@@ -125,27 +133,9 @@ otp_8653(Config) when is_list(Config) ->
end),
ok = pg2:delete(G),
stop_nodes([A,B,C]),
+ stop_partition_controller(PartCtrlr),
ok.
-part2(Config, Main, A, C) ->
- Function = mk_part_node_and_group,
- case catch begin
- make_partition(Config, [Main, A], [node(), C], Function)
- end
- of
- ok -> ok
- end.
-
-mk_part_node_and_group(File, MyPart0, Config) ->
- touch(File, "start"), % debug
- MyPart = lists:sort(MyPart0),
- ?UNTIL(is_node_in_part(File, MyPart)),
- G = pg2_otp_8653,
- Pid = spawn(forever()),
- ok = pg2:create(G),
- _ = [ok = pg2:join(G, Pid) || _ <- [1,1]],
- touch(File, "done").
-
%% OTP-8259. Member was not removed after being killed.
otp_8259(Config) when is_list(Config) ->
[A, B, C] = start_nodes([a, b, c], peer, Config),
@@ -162,8 +152,17 @@ otp_8259(Config) when is_list(Config) ->
ok = pg2:join(G, Pid),
%% make b and c connected, partitioned from node() and a
- rpc_cast(B, ?MODULE, part1, [Config, node(), A, C, Name]),
- ?UNTIL(is_ready_partition(Config)),
+ PartCtrlr = setup_partitions(Config, [[B, C], [node(), A]]),
+ ok = rfcall(
+ PartCtrlr,
+ fun () ->
+ rfcall(
+ B,
+ fun () ->
+ {_, yes} = start_proc(Name),
+ ok
+ end)
+ end),
%% Connect to the other partition.
%% The resolver on node b will be called.
@@ -182,16 +181,9 @@ otp_8259(Config) when is_list(Config) ->
ok = pg2:delete(G),
stop_nodes([A,B,C]),
+ stop_partition_controller(PartCtrlr),
ok.
-part1(Config, Main, A, C, Name) ->
- case catch begin
- make_partition(Config, [Main, A], [node(), C]),
- {_Pid, yes} = start_proc(Name)
- end of
- {_, yes} -> ok
- end.
-
start_proc(Name) ->
Pid = spawn(?MODULE, p_init, [self(), Name, node()]),
receive
@@ -585,6 +577,9 @@ collect_nodes(N, Max) ->
[Node | collect_nodes(N+1, Max)]
end.
+start_hidden_node(Name, Config) ->
+ start_node(Name, slave, "-hidden", Config).
+
start_node(Name, How, Config) ->
start_node(Name, How, "", Config).
@@ -619,60 +614,79 @@ node_name(Name, Config) ->
L = lists:flatten(Date),
lists:concat([Name,U,?testcase,U,U,L]).
-%% This one runs on one node in Part2.
-%% The partition is ready when is_ready_partition(Config) returns (true).
-make_partition(Config, Part1, Part2) ->
- make_partition(Config, Part1, Part2, mk_part_node).
-
-make_partition(Config, Part1, Part2, Function) ->
- Dir = proplists:get_value(priv_dir, Config),
- Ns = [begin
- Name = lists:concat([atom_to_list(N),"_",msec(),".part"]),
- File = filename:join([Dir, Name]),
- file:delete(File),
- rpc_cast(N, ?MODULE, Function, [File, Part, Config], File),
- {N, File}
- end || Part <- [Part1, Part2], N <- Part],
- all_nodes_files(Ns, "done", Config),
- lists:foreach(fun({_N,File}) -> file:delete(File) end, Ns),
- PartFile = make_partition_file(Config),
- touch(PartFile, "done").
-
-%% The node signals its success by touching a file.
-mk_part_node(File, MyPart0, Config) ->
- touch(File, "start"), % debug
- MyPart = lists:sort(MyPart0),
- ?UNTIL(is_node_in_part(File, MyPart)),
- touch(File, "done").
-
-%% The calls to append_to_file are for debugging.
-is_node_in_part(File, MyPart) ->
- lists:foreach(fun(N) ->
- _ = erlang:disconnect_node(N)
- end, nodes() -- MyPart),
- case {(Known = get_known(node())) =:= MyPart,
- (Nodes = lists:sort([node() | nodes()])) =:= MyPart} of
- {true, true} ->
- %% Make sure the resolvers have been terminated,
- %% otherwise they may pop up and send some message.
- %% (This check is probably unnecessary.)
- case element(5, global:info()) of
- [] ->
- true;
- Rs ->
- append_to_file(File, {now(), Known, Nodes, Rs}),
- false
- end;
- _ ->
- append_to_file(File, {now(), Known, Nodes}),
- false
- end.
-is_ready_partition(Config) ->
- File = make_partition_file(Config),
- file_contents(File, "done", Config),
- file:delete(File),
- true.
+create_partitions(PartitionsList) ->
+ AllNodes = lists:sort(lists:flatten(PartitionsList)),
+
+ %% Take down all connections on all nodes...
+ AllOk = {lists:map(fun (_) -> ok end, AllNodes), []},
+ io:format("Disconnecting all nodes from eachother...", []),
+ AllOk = rfmulticall(
+ AllNodes,
+ fun () ->
+ lists:foreach(fun (N) ->
+ erlang:disconnect_node(N)
+ end, nodes()),
+ wait_until(fun () -> [] == global_known() end),
+ ok
+ end, 5000),
+ %% Here we know that all 'lost_connection' messages that will be
+ %% sent by global name servers due to these disconnects have been
+ %% sent, but we don't know that all of them have been received and
+ %% handled. By communicating with all global name servers one more
+ %% time it is very likely that all of them have been received and
+ %% handled (however, not guaranteed). If 'lost_connection' messages
+ %% are received after we begin to set up the partitions, the
+    %% partitions may lose connection with some of their nodes.
+ lists:foreach(fun (N) -> [] = global_known(N) end, AllNodes),
+
+ %% Set up fully connected partitions...
+ io:format("Connecting partitions...", []),
+ lists:foreach(
+ fun (Partition) ->
+ Part = lists:sort(Partition),
+ PartOk = {lists:map(fun (_) -> ok end, Part), []},
+ PartOk = rfmulticall(
+ Part,
+ fun () ->
+ wait_until(
+ fun () ->
+ ConnNodes = Part -- [node()|nodes()],
+ if ConnNodes == [] ->
+ true;
+ true ->
+ lists:foreach(
+ fun (N) ->
+ net_kernel:connect_node(N)
+ end, ConnNodes),
+ false
+ end
+ end),
+ ok
+
+ end, 5000)
+ end, PartitionsList),
+ ok.
+
+setup_partitions(PartCtrlr, PartList) when is_atom(PartCtrlr) ->
+ ok = rfcall(PartCtrlr, fun () -> create_partitions(PartList) end),
+ io:format("Partitions successfully setup:~n", []),
+ lists:foreach(fun (Part) ->
+ io:format("~p~n", [Part])
+ end,
+ PartList),
+ ok;
+setup_partitions(Config, PartList) when is_list(Config) ->
+ PartCtrlr = start_partition_controller(Config),
+ setup_partitions(PartCtrlr, PartList),
+ PartCtrlr.
+
+start_partition_controller(Config) when is_list(Config) ->
+ {ok, PartCtrlr} = start_hidden_node(part_ctrlr, Config),
+ PartCtrlr.
+
+stop_partition_controller(PartCtrlr) ->
+ stop_node(PartCtrlr).
wait_for_ready_net(Config) ->
wait_for_ready_net([node()|nodes()], Config).
@@ -688,64 +702,30 @@ wait_for_ready_net(Nodes0, Config) ->
end, Nodes)
end).
-%% To make it less probable that some low-level problem causes
-%% problems, the receiving node is ping:ed.
-rpc_cast(Node, Module, Function, Args) ->
- {_,pong,Node}= {node(),net_adm:ping(Node),Node},
- rpc:cast(Node, Module, Function, Args).
-
-rpc_cast(Node, Module, Function, Args, File) ->
- case net_adm:ping(Node) of
- pong ->
- rpc:cast(Node, Module, Function, Args);
- Else ->
- append_to_file(File, {now(), {rpc_cast, Node, Module, Function,
- Args, Else}})
- %% Maybe we should crash, but it probably doesn't matter.
- end.
-
-touch(File, List) ->
- ok = file:write_file(File, list_to_binary(List)).
+global_known(Node) ->
+ gen_server:call({global_name_server, Node}, get_known, infinity).
-append_to_file(File, Term) ->
- {ok, Fd} = file:open(File, [raw,binary,append]),
- ok = file:write(Fd, io_lib:format("~p.~n", [Term])),
- ok = file:close(Fd).
+global_known() ->
+ global_known(node()).
-all_nodes_files(Files, ContentsList, Config) ->
- lists:all(fun({_N,File}) ->
- file_contents(File, ContentsList, Config)
- end, Files).
+wait_until(F) ->
+ case catch F() of
+ true ->
+ ok;
+ _ ->
+ receive after 10 -> ok end,
+ wait_until(F)
+ end.
-file_contents(File, ContentsList, Config) ->
- file_contents(File, ContentsList, Config, no_log_file).
+rfcall(Node, Fun) ->
+ rfcall(Node, Fun, infinity).
-file_contents(File, ContentsList, Config, LogFile) ->
- Contents = list_to_binary(ContentsList),
- Sz = size(Contents),
- ?UNTIL(begin
- case file:read_file(File) of
- {ok, FileContents}=Reply ->
- case catch split_binary(FileContents, Sz) of
- {Contents,_} ->
- true;
- _ ->
- catch append_to_file(LogFile,
- {File,Contents,Reply}),
- false
- end;
- Reply ->
- catch append_to_file(LogFile, {File, Contents, Reply}),
- false
- end
- end).
+rfcall(Node, Fun, Timeout) ->
+ rpc:call(Node, erlang, apply, [Fun, []], Timeout).
-make_partition_file(Config) ->
- Dir = proplists:get_value(priv_dir, Config),
- filename:join([Dir, atom_to_list(make_partition_done)]).
+rfmulticall(Nodes, Fun) ->
+ rfmulticall(Nodes, Fun, infinity).
-msec() ->
- msec(now()).
+rfmulticall(Nodes, Fun, Timeout) ->
+ rpc:multicall(Nodes, erlang, apply, [Fun, []], Timeout).
-msec(T) ->
- element(1,T)*1000000000 + element(2,T)*1000 + element(3,T) div 1000.
diff --git a/lib/kernel/test/rpc_SUITE.erl b/lib/kernel/test/rpc_SUITE.erl
index a89a7600a2..aea53003cb 100644
--- a/lib/kernel/test/rpc_SUITE.erl
+++ b/lib/kernel/test/rpc_SUITE.erl
@@ -70,13 +70,13 @@ call(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
%% Note. First part of nodename sets response delay in seconds
{ok, N1} = test_server:start_node('3_rpc_SUITE_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N2} = test_server:start_node('1_rcp_SUITE_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N3} = test_server:start_node('4_rcp_SUITE_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N4} = test_server:start_node('8_rcp_SUITE_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
ok = io:format("~p~n", [[N1, N2, N3]]),
{hej,_,N1} = rpc:call(N1, ?MODULE, f, []),
{hej,_,N2} = rpc:call(N2, ?MODULE, f, [], 2000),
@@ -95,13 +95,13 @@ block_call(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
%% Note. First part of nodename sets response delay in seconds
{ok, N1} = test_server:start_node('3_rpc_SUITE_block_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N2} = test_server:start_node('1_rcp_SUITE_block_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N3} = test_server:start_node('4_rcp_SUITE_block_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N4} = test_server:start_node('8_rcp_SUITE_block_call', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
ok = io:format("~p~n", [[N1, N2, N3]]),
{hej,_,N1} = rpc:block_call(N1, ?MODULE, f, []),
{hej,_,N2} = rpc:block_call(N2, ?MODULE, f, [], 2000),
@@ -121,9 +121,9 @@ multicall(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
%% Note. First part of nodename sets response delay in seconds
{ok, N1} = test_server:start_node('3_rpc_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N2} = test_server:start_node('1_rcp_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
ok = io:format("~p~n", [[N1, N2]]),
{[{hej,_,N1},{hej,_,N2}],[]} =
rpc:multicall([N1, N2], ?MODULE, f, []),
@@ -137,13 +137,13 @@ multicall_timeout(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
%% Note. First part of nodename sets response delay in seconds
{ok, N1} = test_server:start_node('11_rpc_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N2} = test_server:start_node('8_rpc_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N3} = test_server:start_node('5_rpc_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N4} = test_server:start_node('2_rcp_SUITE_multicall', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
ok = io:format("~p~n", [[N1, N2]]),
{[{hej,_,N3},{hej,_,N4}],[N1, N2]} =
rpc:multicall([N3, N1, N2, N4], ?MODULE, f, [], 6000),
@@ -159,9 +159,9 @@ multicall_timeout(Config) when is_list(Config) ->
multicall_dies(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
{ok, N1} = test_server:start_node('rpc_SUITE_multicall_dies_1', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
{ok, N2} = test_server:start_node('rcp_SUITE_multicall_dies_2', slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
Nodes = [N1, N2],
%%
{[{badrpc, {'EXIT', normal}}, {badrpc, {'EXIT', normal}}], []} =
@@ -213,10 +213,10 @@ multicall_node_dies(Config) when is_list(Config) ->
do_multicall_2_nodes_dies(Mod, Func, Args) ->
ok = io:format("~p:~p~p~n", [Mod, Func, Args]),
PA = filename:dirname(code:which(?MODULE)),
- {ok, N1} = test_server:start_node('rpc_SUITE_multicall_node_dies_1', slave,
- [{args, "-pa " ++ PA}]),
- {ok, N2} = test_server:start_node('rcp_SUITE_multicall_node_dies_2', slave,
- [{args, "-pa " ++ PA}]),
+ {ok, N1} = test_server:start_node('rpc_SUITE_multicall_node_dies_1', peer,
+ [{args, "-connect_all false -pa " ++ PA}]),
+ {ok, N2} = test_server:start_node('rcp_SUITE_multicall_node_dies_2', peer,
+ [{args, "-connect_all false -pa " ++ PA}]),
Nodes = [N1, N2],
{[], Nodes} = rpc:multicall(Nodes, Mod, Func, Args),
Msgs = flush([]),
@@ -229,7 +229,7 @@ do_multicall_2_nodes_dies(Mod, Func, Args) ->
called_dies(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
{ok, N} = test_server:start_node(rpc_SUITE_called_dies, slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
%%
rep(fun (Tag, Call, Args) ->
{Tag,{badrpc,{'EXIT',normal}}} =
@@ -395,7 +395,7 @@ node_rep(Fun, Name, PA, M, F, A) ->
node_rep_call(Tag, Call, Args, Fun, Name0, PA) ->
Name = list_to_atom(Name0 ++ "_" ++ atom_to_list(Tag)),
{ok, N} = test_server:start_node(Name, slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
Fun(Call, [N|Args]),
catch test_server:stop_node(N),
ok.
@@ -405,7 +405,7 @@ called_throws(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
%%
{ok, N} = test_server:start_node(rpc_SUITE_called_throws, slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
%%
rep(fun (Tag, Call, Args) ->
{Tag,up} =
@@ -424,7 +424,7 @@ called_throws(Config) when is_list(Config) ->
call_benchmark(Config) when is_list(Config) ->
PA = filename:dirname(code:which(?MODULE)),
{ok, Node} = test_server:start_node(rpc_SUITE_call_benchmark, slave,
- [{args, "-pa " ++ PA}]),
+ [{args, "-connect_all false -pa " ++ PA}]),
Iter = case erlang:system_info(modified_timing_level) of
undefined -> 10000;
_ -> 500 %Modified timing - spawn is slower
@@ -452,7 +452,7 @@ do_call_benchmark(Node, I, M) ->
async_call(Config) when is_list(Config) ->
%% Note: First part of nodename sets response delay in seconds.
PA = filename:dirname(code:which(?MODULE)),
- NodeArgs = [{args,"-pa "++ PA}],
+ NodeArgs = [{args,"-connect_all false -pa "++ PA}],
{ok,Node1} = test_server:start_node('1_rpc_SUITE_call', slave, NodeArgs),
{ok,Node2} = test_server:start_node('10_rpc_SUITE_call', slave, NodeArgs),
{ok,Node3} = test_server:start_node('20_rpc_SUITE_call', slave, NodeArgs),