summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorRickard Green <rickard@erlang.org>2019-11-27 18:23:10 +0100
committerRickard Green <rickard@erlang.org>2019-11-27 18:23:10 +0100
commit6901bf27ade015c3bb6ff5766d7f71fbec90dd8f (patch)
treee2e879b262a3d82981fba4a6abd69886817e8f9a /lib
parenta09985fc298ead645210359f1fc2626da4ffe780 (diff)
parent46f4421c754c30b35dfd79fcb3c5c5fbb584e5e9 (diff)
downloaderlang-6901bf27ade015c3bb6ff5766d7f71fbec90dd8f.tar.gz
Merge branch 'rickard/nodedown-reason/OTP-16216' into maint
* rickard/nodedown-reason/OTP-16216: net_kernel: save connection pending owners in map net_kernel: save connection owners in map Fix race causing nodedown reason to be lost
Diffstat (limited to 'lib')
-rw-r--r--lib/kernel/src/kernel.app.src2
-rw-r--r--lib/kernel/src/net_kernel.erl286
2 files changed, 183 insertions, 105 deletions
diff --git a/lib/kernel/src/kernel.app.src b/lib/kernel/src/kernel.app.src
index 4f5e6d782f..1fdb390a32 100644
--- a/lib/kernel/src/kernel.app.src
+++ b/lib/kernel/src/kernel.app.src
@@ -149,6 +149,6 @@
{logger_sasl_compatible, false}
]},
{mod, {kernel, []}},
- {runtime_dependencies, ["erts-10.2.5", "stdlib-3.5", "sasl-3.0"]}
+ {runtime_dependencies, ["erts-@OTP-16216@", "stdlib-3.5", "sasl-3.0"]}
]
}.
diff --git a/lib/kernel/src/net_kernel.erl b/lib/kernel/src/net_kernel.erl
index e333bcd307..4a68e6676d 100644
--- a/lib/kernel/src/net_kernel.erl
+++ b/lib/kernel/src/net_kernel.erl
@@ -102,8 +102,9 @@
tick, %% tick information
connecttime, %% the connection setuptime.
connections, %% table of connections
- conn_owners = [], %% List of connection owner pids,
- pend_owners = [], %% List of potential owners
+ conn_owners = #{}, %% Map of connection owner pids,
+ dist_ctrlrs = #{}, %% Map of dist controllers (local ports or pids),
+ pend_owners = #{}, %% Map of potential owners
listen, %% list of #listen
allowed, %% list of allowed nodes in a restricted system
verbose = 0, %% level of verboseness
@@ -125,6 +126,7 @@
conn_id, %% Connection identity
state, %% pending | up | up_pending
owner, %% owner pid
+ ctrlr, %% Controller port or pid
pending_owner, %% possible new owner
address, %% #net_address
waiting = [], %% queued processes
@@ -356,7 +358,7 @@ do_auto_connect_1(Node, ConnId, From, State) ->
spawn(?MODULE,passive_connect_monitor,[From,Node]),
{noreply, State};
_ ->
- erts_internal:abort_connection(Node, ConnId),
+ erts_internal:abort_pending_connection(Node, ConnId),
{reply, false, State}
end;
@@ -389,7 +391,7 @@ do_auto_connect_2(Node, ConnId, From, State, ConnLookup) ->
case application:get_env(kernel, dist_auto_connect) of
{ok, never} ->
?connect_failure(Node,{dist_auto_connect,never}),
- erts_internal:abort_connection(Node, ConnId),
+ erts_internal:abort_pending_connection(Node, ConnId),
{reply, false, State};
%% This might happen due to connection close
@@ -399,16 +401,15 @@ do_auto_connect_2(Node, ConnId, From, State, ConnLookup) ->
(hd(ConnLookup))#connection.state =:= up ->
?connect_failure(Node,{barred_connection,
ets:lookup(sys_dist, Node)}),
- erts_internal:abort_connection(Node, ConnId),
{reply, false, State};
_ ->
case setup(Node, ConnId, normal, From, State) of
{ok, SetupPid} ->
- Owners = [{SetupPid, Node} | State#state.conn_owners],
- {noreply,State#state{conn_owners=Owners}};
+ Owners = State#state.conn_owners,
+ {noreply,State#state{conn_owners=Owners#{SetupPid => Node}}};
_Error ->
?connect_failure(Node, {setup_call, failed, _Error}),
- erts_internal:abort_connection(Node, ConnId),
+ erts_internal:abort_pending_connection(Node, ConnId),
{reply, false, State}
end
end
@@ -428,8 +429,8 @@ do_explicit_connect([#barred_connection{}], Type, Node, ConnId, From , State) ->
do_explicit_connect(_ConnLookup, Type, Node, ConnId, From , State) ->
case setup(Node,ConnId,Type,From,State) of
{ok, SetupPid} ->
- Owners = [{SetupPid, Node} | State#state.conn_owners],
- {noreply,State#state{conn_owners=Owners}};
+ Owners = State#state.conn_owners,
+ {noreply,State#state{conn_owners=Owners#{SetupPid => Node}}};
_Error ->
?connect_failure(Node, {setup_call, failed, _Error}),
{reply, false, State}
@@ -468,7 +469,7 @@ handle_call({connect, Type, Node}, From, State) ->
{noreply, _S} -> %% connection pending
ok;
{reply, false, _S} -> %% connection refused
- erts_internal:abort_connection(Node, ConnId)
+ erts_internal:abort_pending_connection(Node, ConnId)
end,
R1
catch
@@ -715,20 +716,39 @@ handle_info({accept,AcceptPid,Socket,Family,Proto}, State) ->
end;
%%
+%% New dist controller has been registered
+%%
+handle_info({dist_ctrlr, Ctrlr, Node, SetupPid} = Msg,
+ #state{dist_ctrlrs = DistCtrlrs} = State) ->
+ case ets:lookup(sys_dist, Node) of
+ [Conn] when (Conn#connection.state =:= pending)
+ andalso (Conn#connection.owner =:= SetupPid)
+ andalso (Conn#connection.ctrlr =:= undefined)
+ andalso (is_port(Ctrlr) orelse is_pid(Ctrlr))
+ andalso (node(Ctrlr) == node()) ->
+ link(Ctrlr),
+ ets:insert(sys_dist, Conn#connection{ctrlr = Ctrlr}),
+ {noreply, State#state{dist_ctrlrs = DistCtrlrs#{Ctrlr => Node}}};
+ _ ->
+ error_msg("Net kernel got ~tw~n",[Msg]),
+ {noreply, State}
+ end;
+
+%%
%% A node has successfully been connected.
%%
-handle_info({SetupPid, {nodeup,Node,Address,Type,Immediate}},
- State) ->
+handle_info({SetupPid, {nodeup,Node,Address,Type,Immediate}}, State) ->
case {Immediate, ets:lookup(sys_dist, Node)} of
- {true, [Conn]} when Conn#connection.state =:= pending,
- Conn#connection.owner =:= SetupPid ->
- ets:insert(sys_dist, Conn#connection{state = up,
- address = Address,
- waiting = [],
- type = Type}),
- SetupPid ! {self(), inserted},
- reply_waiting(Node,Conn#connection.waiting, true),
- {noreply, State};
+ {true, [Conn]} when (Conn#connection.state =:= pending)
+ andalso (Conn#connection.owner =:= SetupPid)
+ andalso (Conn#connection.ctrlr /= undefined) ->
+ ets:insert(sys_dist, Conn#connection{state = up,
+ address = Address,
+ waiting = [],
+ type = Type}),
+ SetupPid ! {self(), inserted},
+ reply_waiting(Node,Conn#connection.waiting, true),
+ {noreply, State};
_ ->
SetupPid ! {self(), bad_request},
{noreply, State}
@@ -756,21 +776,17 @@ handle_info({AcceptPid, {accept_pending,MyNode,Node,Address,Type}}, State) ->
{'EXIT', OldOwner, _} ->
true
end,
- Owners = lists:keyreplace(OldOwner,
- 1,
- State#state.conn_owners,
- {AcceptPid, Node}),
ets:insert(sys_dist, Conn#connection{owner = AcceptPid}),
AcceptPid ! {self(),{accept_pending,ok_pending}},
- State1 = State#state{conn_owners=Owners},
- {noreply,State1}
+ Owners = maps:remove(OldOwner, State#state.conn_owners),
+ {noreply, State#state{conn_owners=Owners#{AcceptPid => Node}}}
end;
[#connection{state=up}=Conn] ->
AcceptPid ! {self(), {accept_pending, up_pending}},
ets:insert(sys_dist, Conn#connection { pending_owner = AcceptPid,
state = up_pending }),
- Pend = [{AcceptPid, Node} | State#state.pend_owners ],
- {noreply, State#state { pend_owners = Pend }};
+ Pend = State#state.pend_owners,
+ {noreply, State#state { pend_owners = Pend#{AcceptPid => Node} }};
[#connection{state=up_pending}] ->
AcceptPid ! {self(), {accept_pending, already_pending}},
{noreply, State};
@@ -784,8 +800,8 @@ handle_info({AcceptPid, {accept_pending,MyNode,Node,Address,Type}}, State) ->
address = Address,
type = Type}),
AcceptPid ! {self(),{accept_pending,ok}},
- Owners = [{AcceptPid,Node} | State#state.conn_owners],
- {noreply, State#state{conn_owners = Owners}}
+ Owners = State#state.conn_owners,
+ {noreply, State#state{conn_owners = Owners#{AcceptPid => Node}}}
catch
_:_ ->
error_logger:error_msg("~n** Cannot get connection id for node ~w~n",
@@ -796,14 +812,17 @@ handle_info({AcceptPid, {accept_pending,MyNode,Node,Address,Type}}, State) ->
end;
handle_info({SetupPid, {is_pending, Node}}, State) ->
- Reply = lists:member({SetupPid,Node},State#state.conn_owners),
+ Reply = case maps:get(SetupPid, State#state.conn_owners, undefined) of
+ Node -> true;
+ _ -> false
+ end,
SetupPid ! {self(), {is_pending, Reply}},
{noreply, State};
%%
%% Handle different types of process terminations.
%%
-handle_info({'EXIT', From, Reason}, State) when is_pid(From) ->
+handle_info({'EXIT', From, Reason}, State) ->
verbose({'EXIT', From, Reason}, 1, State),
handle_exit(From, Reason, State);
@@ -827,14 +846,22 @@ handle_info({From,badcookie,_To,_Mess}, State) ->
%%
handle_info(tick, State) ->
?tckr_dbg(tick),
- lists:foreach(fun({Pid,_Node}) -> Pid ! {self(), tick} end,
- State#state.conn_owners),
+ ok = maps:fold(fun (Pid, _Node, ok) ->
+ Pid ! {self(), tick},
+ ok
+ end,
+ ok,
+ State#state.conn_owners),
{noreply,State};
handle_info(aux_tick, State) ->
?tckr_dbg(aux_tick),
- lists:foreach(fun({Pid,_Node}) -> Pid ! {self(), aux_tick} end,
- State#state.conn_owners),
+ ok = maps:fold(fun (Pid, _Node, ok) ->
+ Pid ! {self(), aux_tick},
+ ok
+ end,
+ ok,
+ State#state.conn_owners),
{noreply,State};
handle_info(transition_period_end,
@@ -873,6 +900,7 @@ do_handle_exit(Pid, Reason, State) ->
listen_exit(Pid, State),
accept_exit(Pid, State),
conn_own_exit(Pid, Reason, State),
+ dist_ctrlr_exit(Pid, Reason, State),
pending_own_exit(Pid, State),
ticker_exit(Pid, State),
{noreply,State}.
@@ -900,21 +928,24 @@ accept_exit(Pid, State) ->
false
end.
-conn_own_exit(Pid, Reason, State) ->
- Owners = State#state.conn_owners,
- case lists:keysearch(Pid, 1, Owners) of
- {value, {Pid, Node}} ->
- throw({noreply, nodedown(Pid, Node, Reason, State)});
- _ ->
- false
+conn_own_exit(Pid, Reason, #state{conn_owners = Owners} = State) ->
+ case maps:get(Pid, Owners, undefined) of
+ undefined -> false;
+ Node -> throw({noreply, nodedown(Pid, Node, Reason, State)})
+ end.
+
+dist_ctrlr_exit(Pid, Reason, #state{dist_ctrlrs = DCs} = State) ->
+ case maps:get(Pid, DCs, undefined) of
+ undefined -> false;
+ Node -> throw({noreply, nodedown(Pid, Node, Reason, State)})
end.
-pending_own_exit(Pid, State) ->
- Pend = State#state.pend_owners,
- case lists:keysearch(Pid, 1, Pend) of
- {value, {Pid, Node}} ->
- NewPend = lists:keydelete(Pid, 1, Pend),
- State1 = State#state { pend_owners = NewPend },
+pending_own_exit(Pid, #state{pend_owners = Pend} = State) ->
+ case maps:get(Pid, Pend, undefined) of
+ undefined ->
+ false;
+ Node ->
+ State1 = State#state { pend_owners = maps:remove(Pid, Pend)},
case get_conn(Node) of
{ok, Conn} when Conn#connection.state =:= up_pending ->
reply_waiting(Node,Conn#connection.waiting, true),
@@ -925,9 +956,7 @@ pending_own_exit(Pid, State) ->
_ ->
ok
end,
- throw({noreply, State1});
- _ ->
- false
+ throw({noreply, State1})
end.
ticker_exit(Pid, #state{tick = #tick{ticker = Pid, time = T} = Tck} = State) ->
@@ -945,10 +974,10 @@ ticker_exit(_, _) ->
%% nodedown(Owner, Node, Reason, State) -> State'
%% -----------------------------------------------------------
-nodedown(Owner, Node, Reason, State) ->
+nodedown(Exited, Node, Reason, State) ->
case get_conn(Node) of
{ok, Conn} ->
- nodedown(Conn, Owner, Node, Reason, Conn#connection.type, State);
+ nodedown(Conn, Exited, Node, Reason, Conn#connection.type, State);
_ ->
State
end.
@@ -959,58 +988,117 @@ get_conn(Node) ->
_ -> error
end.
-nodedown(Conn, Owner, Node, Reason, Type, OldState) ->
- Owners = lists:keydelete(Owner, 1, OldState#state.conn_owners),
- State = OldState#state{conn_owners = Owners},
+delete_owner(Owner, #state{conn_owners = Owners} = State) ->
+ State#state{conn_owners = maps:remove(Owner, Owners)}.
+
+delete_ctrlr(Ctrlr, #state{dist_ctrlrs = DCs} = State) ->
+ State#state{dist_ctrlrs = maps:remove(Ctrlr, DCs)}.
+
+nodedown(Conn, Exited, Node, Reason, Type, State) ->
case Conn#connection.state of
- pending when Conn#connection.owner =:= Owner ->
- pending_nodedown(Conn, Node, Type, State);
- up when Conn#connection.owner =:= Owner ->
- up_nodedown(Conn, Node, Reason, Type, State);
- up_pending when Conn#connection.owner =:= Owner ->
- up_pending_nodedown(Conn, Node, Reason, Type, State);
+ pending ->
+ pending_nodedown(Conn, Exited, Node, Type, State);
+ up ->
+ up_nodedown(Conn, Exited, Node, Reason, Type, State);
+ up_pending ->
+ up_pending_nodedown(Conn, Exited, Node, Reason, Type, State);
_ ->
- OldState
+ State
end.
-pending_nodedown(Conn, Node, Type, State) ->
- % Don't bar connections that have never been alive
- %mark_sys_dist_nodedown(Node),
- % - instead just delete the node:
- erts_internal:abort_connection(Node, Conn#connection.conn_id),
- ets:delete(sys_dist, Node),
- reply_waiting(Node,Conn#connection.waiting, false),
+pending_nodedown(#connection{owner = Owner,
+ waiting = Waiting,
+ conn_id = CID},
+ Exited, Node, Type, State) when Owner =:= Exited ->
+ %% Owner exited!
+ case erts_internal:abort_pending_connection(Node, CID) of
+ false ->
+ %% Just got connected but that message has not
+ %% reached us yet. Wait for controller to exit and
+ %% handle this then...
+ ok;
+ true ->
+ %% Don't bar connections that have never been alive, i.e.
+ %% no 'mark_sys_dist_nodedown(Node)'; instead just delete
+ %% the node:
+ ets:delete(sys_dist, Node),
+ reply_waiting(Node, Waiting, false),
+ case Type of
+ normal ->
+ ?nodedown(Node, State);
+ _ ->
+ ok
+ end
+ end,
+ delete_owner(Owner, State);
+pending_nodedown(#connection{owner = Owner,
+ ctrlr = Ctrlr,
+ waiting = Waiting},
+ Exited, Node, Type, State) when Ctrlr =:= Exited ->
+ %% Controller exited!
+ %%
+ %% Controller has been registered but crashed
+ %% before sending mark up message...
+ %%
+ %% 'nodeup' messages has been sent by the emulator,
+ %% so bar the connection...
+ mark_sys_dist_nodedown(Node),
+ reply_waiting(Node,Waiting, true),
case Type of
- normal ->
- ?nodedown(Node, State);
- _ ->
- ok
+ normal ->
+ ?nodedown(Node, State);
+ _ ->
+ ok
end,
+ delete_owner(Owner, delete_ctrlr(Ctrlr, State));
+pending_nodedown(_Conn, _Exited, _Node, _Type, State) ->
State.
-up_pending_nodedown(Conn, Node, _Reason, _Type, State) ->
- AcceptPid = Conn#connection.pending_owner,
- Owners = State#state.conn_owners,
- Pend = lists:keydelete(AcceptPid, 1, State#state.pend_owners),
- erts_internal:abort_connection(Node, Conn#connection.conn_id),
+up_pending_nodedown(#connection{owner = Owner,
+ ctrlr = Ctrlr,
+ pending_owner = AcceptPid} = Conn,
+ Exited, Node, _Reason,
+ _Type, State) when Ctrlr =:= Exited ->
+ %% Controller exited!
Conn1 = Conn#connection { owner = AcceptPid,
conn_id = erts_internal:new_connection(Node),
+ ctrlr = undefined,
pending_owner = undefined,
state = pending },
ets:insert(sys_dist, Conn1),
AcceptPid ! {self(), pending},
- State#state{conn_owners = [{AcceptPid,Node}|Owners], pend_owners = Pend}.
+ Pend = maps:remove(AcceptPid, State#state.pend_owners),
+ Owners = State#state.conn_owners,
+ State1 = State#state{conn_owners = Owners#{AcceptPid => Node},
+ pend_owners = Pend},
+ delete_owner(Owner, delete_ctrlr(Ctrlr, State1));
+up_pending_nodedown(#connection{owner = Owner},
+ Exited, _Node, _Reason,
+ _Type, State) when Owner =:= Exited ->
+ %% Owner exited!
+ delete_owner(Owner, State);
+up_pending_nodedown(_Conn, _Exited, _Node, _Reason, _Type, State) ->
+ State.
-up_nodedown(Conn, Node, _Reason, Type, State) ->
- mark_sys_dist_nodedown(Conn, Node),
+up_nodedown(#connection{owner = Owner,
+ ctrlr = Ctrlr},
+ Exited, Node, _Reason, Type, State) when Ctrlr =:= Exited ->
+ %% Controller exited!
+ mark_sys_dist_nodedown(Node),
case Type of
normal -> ?nodedown(Node, State);
_ -> ok
end,
+ delete_owner(Owner, delete_ctrlr(Ctrlr, State));
+up_nodedown(#connection{owner = Owner},
+ Exited, _Node, _Reason,
+ _Type, State) when Owner =:= Exited ->
+ %% Owner exited!
+ delete_owner(Owner, State);
+up_nodedown(_Conn, _Exited, _Node, _Reason, _Type, State) ->
State.
-mark_sys_dist_nodedown(Conn, Node) ->
- erts_internal:abort_connection(Node, Conn#connection.conn_id),
+mark_sys_dist_nodedown(Node) ->
case application:get_env(kernel, dist_auto_connect) of
{ok, once} ->
ets:insert(sys_dist, #barred_connection{node = Node});
@@ -1098,29 +1186,19 @@ mk_monitor_nodes_error(_Flag, Opts) ->
do_disconnect(Node, State) ->
case ets:lookup(sys_dist, Node) of
[Conn] when Conn#connection.state =:= up ->
- disconnect_pid(Conn#connection.owner, State);
+ disconnect_ctrlr(Conn#connection.ctrlr, State);
[Conn] when Conn#connection.state =:= up_pending ->
- disconnect_pid(Conn#connection.owner, State);
+ disconnect_ctrlr(Conn#connection.ctrlr, State);
_ ->
{false, State}
end.
-disconnect_pid(Pid, State) ->
- exit(Pid, disconnect),
-
- %% This code used to only use exit + recv 'EXIT' to sync,
- %% but since OTP-22 links are no longer broken atomically
- %% so the exit message below can arrive before any remaining
- %% exit messages have killed the distribution port
- Ref = erlang:monitor(process, Pid),
- %% Sync wait for connection to die!!!
+disconnect_ctrlr(Ctrlr, State) ->
+ exit(Ctrlr, disconnect),
receive
- {'DOWN',Ref,_,_,_} ->
- receive
- {'EXIT',Pid,Reason} ->
- {_,State1} = handle_exit(Pid, Reason, State),
- {true, State1}
- end
+ {'EXIT',Ctrlr,Reason} ->
+ {_,State1} = handle_exit(Ctrlr, Reason, State),
+ {true, State1}
end.
%%