summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>2023-05-12 17:11:57 +0200
committerJean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>2023-05-16 10:46:32 +0200
commit213eb20e1658a1cebb4c1a07606e28fc7b62ebca (patch)
treef7ae3a58697bb94120e5333e21e748b30a8be4cb
parentc7d0427d621493f00a3aa0d6e7720f709f3769a6 (diff)
downloadrabbitmq-server-git-fix-feature-flags-init-vs-code_server-deadlock-take2.tar.gz
rabbit_feature_flags: Fix possible deadlock when calling the Code server (take 2)fix-feature-flags-init-vs-code_server-deadlock-take2
[Why] The background reason for this fix is about the same as the one explained in the previous version of this fix; see commit e0a2f1027278bd0901bdaa9af6065abc676ff14f. This time, the order of events that led to a similar deadlock is the following: 0. No `rabbit_ff_registry` is loaded yet. 1. Process A, B and C call `rabbit_ff_registry:something()` indirectly which triggers two initializations in parallel. * Process A did it from an explicit call to `rabbit_ff_registry_factory:initialize_factory()` during RabbitMQ boot. * Process B and C indirectly called it because they checked if a feature flag was enabled. 2. Process B acquires the lock first and finishes the initialization. A new registry is loaded and the old `rabbit_ff_registry` module copy is marked as "old". At this point, process A and C still reference that old copy because `rabbit_ff_registry:something()` is up above in its call stack. 3. Process A acquires the lock, prepares the new registry and tries to soft-purge the old `rabbit_ff_registry` copy before loading the new one. This is where the deadlock happens: process A requests the Code server to purge the old copy, but the Code server waits for process C to stop using it. The difference between the steps described in the first bug fix attempt's commit and these ones is that the process which lingers on the deleted `rabbit_ff_registry` (process C above) isn't the one who acquired the lock; process A has it. That's why the first bug fix isn't effective in this case: it relied on the fact that the process which lingers on the deleted `rabbit_ff_registry` is the process which attempts to purge the module. [How] In this commit, we go with a more drastic change. This time, we put a wrapper in front of `rabbit_ff_registry` called `rabbit_ff_registry_wrapper`. This wrapper is responsible for doing the automatic initialization if the loaded registry is the stub module. The `rabbit_ff_registry` stub now always returns `init_required` instead of performing the initialization and calling itself recursively. This way, processes linger on `rabbit_ff_registry_wrapper`, not on `rabbit_ff_registry`. Thanks to this, the Code server can proceed with the purge. See #8112.
-rw-r--r--deps/rabbit/app.bzl3
-rw-r--r--deps/rabbit/src/rabbit_feature_flags.erl30
-rw-r--r--deps/rabbit/src/rabbit_ff_controller.erl2
-rw-r--r--deps/rabbit/src/rabbit_ff_registry.erl65
-rw-r--r--deps/rabbit/src/rabbit_ff_registry_factory.erl11
-rw-r--r--deps/rabbit/src/rabbit_ff_registry_wrapper.erl156
-rw-r--r--deps/rabbit/test/feature_flags_SUITE.erl324
-rw-r--r--deps/rabbit/test/feature_flags_v2_SUITE.erl1
8 files changed, 475 insertions, 117 deletions
diff --git a/deps/rabbit/app.bzl b/deps/rabbit/app.bzl
index f8ac64f82e..6a17514549 100644
--- a/deps/rabbit/app.bzl
+++ b/deps/rabbit/app.bzl
@@ -114,6 +114,7 @@ def all_beam_files(name = "all_beam_files"):
"src/rabbit_ff_extra.erl",
"src/rabbit_ff_registry.erl",
"src/rabbit_ff_registry_factory.erl",
+ "src/rabbit_ff_registry_wrapper.erl",
"src/rabbit_fhc_helpers.erl",
"src/rabbit_fifo.erl",
"src/rabbit_fifo_client.erl",
@@ -355,6 +356,7 @@ def all_test_beam_files(name = "all_test_beam_files"):
"src/rabbit_ff_extra.erl",
"src/rabbit_ff_registry.erl",
"src/rabbit_ff_registry_factory.erl",
+ "src/rabbit_ff_registry_wrapper.erl",
"src/rabbit_fhc_helpers.erl",
"src/rabbit_fifo.erl",
"src/rabbit_fifo_client.erl",
@@ -609,6 +611,7 @@ def all_srcs(name = "all_srcs"):
"src/rabbit_ff_extra.erl",
"src/rabbit_ff_registry.erl",
"src/rabbit_ff_registry_factory.erl",
+ "src/rabbit_ff_registry_wrapper.erl",
"src/rabbit_fhc_helpers.erl",
"src/rabbit_fifo.erl",
"src/rabbit_fifo_client.erl",
diff --git a/deps/rabbit/src/rabbit_feature_flags.erl b/deps/rabbit/src/rabbit_feature_flags.erl
index 3f35fc47a9..06e46cc03a 100644
--- a/deps/rabbit/src/rabbit_feature_flags.erl
+++ b/deps/rabbit/src/rabbit_feature_flags.erl
@@ -113,7 +113,7 @@
remote_nodes/0,
running_remote_nodes/0,
does_node_support/3,
- inject_test_feature_flags/1,
+ inject_test_feature_flags/1, inject_test_feature_flags/2,
query_supported_feature_flags/0,
does_enabled_feature_flags_list_file_exist/0,
read_enabled_feature_flags_list/0,
@@ -336,8 +336,8 @@ list() -> list(all).
%% `disabled'.
%% @returns A map of selected feature flags.
-list(all) -> rabbit_ff_registry:list(all);
-list(enabled) -> rabbit_ff_registry:list(enabled);
+list(all) -> rabbit_ff_registry_wrapper:list(all);
+list(enabled) -> rabbit_ff_registry_wrapper:list(enabled);
list(disabled) -> maps:filter(
fun(FeatureName, _) -> is_disabled(FeatureName) end,
list(all)).
@@ -381,7 +381,7 @@ enable(FeatureNames) when is_list(FeatureNames) ->
with_feature_flags(FeatureNames, fun enable/1).
uses_callbacks(FeatureName) when is_atom(FeatureName) ->
- case rabbit_ff_registry:get(FeatureName) of
+ case rabbit_ff_registry_wrapper:get(FeatureName) of
undefined -> false;
FeatureProps -> uses_callbacks(FeatureProps)
end;
@@ -504,9 +504,11 @@ is_supported(FeatureNames, Timeout) ->
%% `false' if one of them is not.
is_supported_locally(FeatureName) when is_atom(FeatureName) ->
- rabbit_ff_registry:is_supported(FeatureName);
+ rabbit_ff_registry_wrapper:is_supported(FeatureName);
is_supported_locally(FeatureNames) when is_list(FeatureNames) ->
- lists:all(fun(F) -> rabbit_ff_registry:is_supported(F) end, FeatureNames).
+ lists:all(
+ fun(F) -> rabbit_ff_registry_wrapper:is_supported(F) end,
+ FeatureNames).
-spec is_enabled(feature_name() | [feature_name()]) -> boolean().
%% @doc
@@ -563,19 +565,19 @@ is_enabled(FeatureNames, blocking) ->
end.
is_enabled_nb(FeatureName) when is_atom(FeatureName) ->
- rabbit_ff_registry:is_enabled(FeatureName);
+ rabbit_ff_registry_wrapper:is_enabled(FeatureName);
is_enabled_nb(FeatureNames) when is_list(FeatureNames) ->
lists:foldl(
fun
(_F, state_changing = Acc) ->
Acc;
(F, false = Acc) ->
- case rabbit_ff_registry:is_enabled(F) of
+ case rabbit_ff_registry_wrapper:is_enabled(F) of
state_changing -> state_changing;
_ -> Acc
end;
(F, _) ->
- rabbit_ff_registry:is_enabled(F)
+ rabbit_ff_registry_wrapper:is_enabled(F)
end,
true, FeatureNames).
@@ -710,7 +712,7 @@ get_state(FeatureName) when is_atom(FeatureName) ->
%% given feature flag name doesn't correspond to a known feature flag.
get_stability(FeatureName) when is_atom(FeatureName) ->
- case rabbit_ff_registry:get(FeatureName) of
+ case rabbit_ff_registry_wrapper:get(FeatureName) of
undefined -> undefined;
FeatureProps -> get_stability(FeatureProps)
end;
@@ -738,6 +740,9 @@ init() ->
-define(PT_TESTSUITE_ATTRS, {?MODULE, testsuite_feature_flags_attrs}).
inject_test_feature_flags(FeatureFlags) ->
+ inject_test_feature_flags(FeatureFlags, true).
+
+inject_test_feature_flags(FeatureFlags, InitReg) ->
ExistingAppAttrs = module_attributes_from_testsuite(),
FeatureFlagsPerApp0 = lists:foldl(
fun({Origin, Origin, FFlags}, Acc) ->
@@ -768,7 +773,10 @@ inject_test_feature_flags(FeatureFlags) ->
[FeatureFlags, AttributesFromTestsuite],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
ok = persistent_term:put(?PT_TESTSUITE_ATTRS, AttributesFromTestsuite),
- rabbit_ff_registry_factory:initialize_registry().
+ case InitReg of
+ true -> rabbit_ff_registry_factory:initialize_registry();
+ false -> ok
+ end.
module_attributes_from_testsuite() ->
persistent_term:get(?PT_TESTSUITE_ATTRS, []).
diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl
index cc46ad5d24..ca6dd84984 100644
--- a/deps/rabbit/src/rabbit_ff_controller.erl
+++ b/deps/rabbit/src/rabbit_ff_controller.erl
@@ -1339,7 +1339,7 @@ enable_dependencies1(
Rets :: #{node() => term()}.
run_callback(Nodes, FeatureName, Command, Extra, Timeout) ->
- FeatureProps = rabbit_ff_registry:get(FeatureName),
+ FeatureProps = rabbit_ff_registry_wrapper:get(FeatureName),
Callbacks = maps:get(callbacks, FeatureProps, #{}),
case Callbacks of
#{Command := {CallbackMod, CallbackFun}}
diff --git a/deps/rabbit/src/rabbit_ff_registry.erl b/deps/rabbit/src/rabbit_ff_registry.erl
index 26718e2cc7..0ec784c8a0 100644
--- a/deps/rabbit/src/rabbit_ff_registry.erl
+++ b/deps/rabbit/src/rabbit_ff_registry.erl
@@ -37,22 +37,26 @@
-on_load(on_load/0).
-endif.
-%% Initially, is_registry_initialized/0 always returns false and this `Call'
-%% is always called. The case statement is here to convince Dialyzer that the
-%% function could return values of type `__ReturnedIfUninitialized' or
+%% In this registry stub, most functions want to return `init_required' to let
+%% {@link rabbit_ff_registry_wrapper} do the first time initialization.
+%%
+%% The inner case statement is here to convince Dialyzer that the function
+%% could return values of type `__ReturnedIfUninitialized' or
%% `__NeverReturned'.
%%
%% If the function was only calling itself (`Call'), Dialyzer would consider
-%% that it would never return.
+%% that it would never return. With the outer case, Dialyzer would conclude
+%% that `__ReturnedIfUninitialized' is always returned and other values will
+%% never be returned and there is no point in expecting them.
%%
-%% With just `is_registry_initialized()' case, Dialyzer would conclude that
-%% `__ReturnedIfUninitialized' is always returned and other values will never
-%% be returned and there is no point in expecting them.
+%% In the end, `Call' is never executed because {@link
+%% rabbit_ff_registry_wrapper} is responsible for calling the registry
+%% function again after initialization.
%%
%% With both cases in place, it seems that we can convince Dialyzer that the
%% function returns values matching its spec.
-define(convince_dialyzer(__Call, __ReturnedIfUninitialized, __NeverReturned),
- case is_registry_initialized() of
+ case always_return_true() of
false ->
__Call;
true ->
@@ -62,8 +66,11 @@
end
end).
--spec get(rabbit_feature_flags:feature_name()) ->
- rabbit_feature_flags:feature_props_extended() | undefined.
+-spec get(FeatureName) -> Ret when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ Ret :: FeatureProps | init_required,
+ FeatureProps :: rabbit_feature_flags:feature_props_extended() |
+ undefined.
%% @doc
%% Returns the properties of a feature flag.
%%
@@ -74,13 +81,15 @@
%% @returns the properties of the specified feature flag.
get(FeatureName) ->
- _ = rabbit_ff_registry_factory:initialize_registry(),
?convince_dialyzer(
?MODULE:get(FeatureName),
- undefined,
+ init_required,
#{provided_by => rabbit}).
--spec list(all | enabled | disabled) -> rabbit_feature_flags:feature_flags().
+-spec list(Which) -> Ret when
+ Which :: all | enabled | disabled,
+ Ret :: FeatureFlags | init_required,
+ FeatureFlags :: rabbit_feature_flags:feature_flags().
%% @doc
%% Lists all, enabled or disabled feature flags, depending on the argument.
%%
@@ -92,10 +101,11 @@ get(FeatureName) ->
%% @returns A map of selected feature flags.
list(Which) ->
- _ = rabbit_ff_registry_factory:initialize_registry(),
- ?convince_dialyzer(?MODULE:list(Which), #{}, #{}).
+ ?convince_dialyzer(?MODULE:list(Which), init_required, #{}).
--spec states() -> rabbit_feature_flags:feature_states().
+-spec states() -> Ret when
+ Ret :: FeatureStates | init_required,
+ FeatureStates :: rabbit_feature_flags:feature_states().
%% @doc
%% Returns the states of supported feature flags.
%%
@@ -105,10 +115,12 @@ list(Which) ->
%% @returns A map of feature flag states.
states() ->
- _ = rabbit_ff_registry_factory:initialize_registry(),
- ?convince_dialyzer(?MODULE:states(), #{}, #{}).
+ ?convince_dialyzer(?MODULE:states(), init_required, #{}).
--spec is_supported(rabbit_feature_flags:feature_name()) -> boolean().
+-spec is_supported(FeatureName) -> Ret when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ Ret :: Supported | init_required,
+ Supported :: boolean().
%% @doc
%% Returns if a feature flag is supported.
%%
@@ -120,10 +132,12 @@ states() ->
%% otherwise.
is_supported(FeatureName) ->
- _ = rabbit_ff_registry_factory:initialize_registry(),
- ?convince_dialyzer(?MODULE:is_supported(FeatureName), false, true).
+ ?convince_dialyzer(?MODULE:is_supported(FeatureName), init_required, true).
--spec is_enabled(rabbit_feature_flags:feature_name()) -> boolean() | state_changing.
+-spec is_enabled(FeatureName) -> Ret when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ Ret :: Enabled | init_required,
+ Enabled :: boolean() | state_changing.
%% @doc
%% Returns if a feature flag is enabled or if its state is changing.
%%
@@ -135,8 +149,7 @@ is_supported(FeatureName) ->
%% its state is transient, or `false' otherwise.
is_enabled(FeatureName) ->
- _ = rabbit_ff_registry_factory:initialize_registry(),
- ?convince_dialyzer(?MODULE:is_enabled(FeatureName), false, true).
+ ?convince_dialyzer(?MODULE:is_enabled(FeatureName), init_required, true).
-spec is_registry_initialized() -> boolean().
%% @doc
@@ -169,7 +182,9 @@ is_registry_initialized() ->
is_registry_written_to_disk() ->
always_return_true().
--spec inventory() -> rabbit_feature_flags:inventory().
+-spec inventory() -> Ret when
+ Ret :: Inventory | init_required,
+ Inventory :: rabbit_feature_flags:inventory().
inventory() ->
_ = rabbit_ff_registry_factory:initialize_registry(),
diff --git a/deps/rabbit/src/rabbit_ff_registry_factory.erl b/deps/rabbit/src/rabbit_ff_registry_factory.erl
index cccf202a65..2a9a868654 100644
--- a/deps/rabbit/src/rabbit_ff_registry_factory.erl
+++ b/deps/rabbit/src/rabbit_ff_registry_factory.erl
@@ -735,14 +735,9 @@ registry_vsn() ->
proplists:get_value(vsn, Attrs, undefined).
purge_old_registry(Mod) ->
- case erlang:check_process_code(self(), rabbit_ff_registry) of
- false ->
- case code:is_loaded(Mod) of
- {file, _} -> do_purge_old_registry(Mod);
- false -> ok
- end;
- true ->
- ok
+ case code:is_loaded(Mod) of
+ {file, _} -> do_purge_old_registry(Mod);
+ false -> ok
end.
do_purge_old_registry(Mod) ->
diff --git a/deps/rabbit/src/rabbit_ff_registry_wrapper.erl b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl
new file mode 100644
index 0000000000..b98d3a7e8d
--- /dev/null
+++ b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl
@@ -0,0 +1,156 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2023 VMware, Inc. or its affiliates. All rights reserved.
+%%
+
+%% @author The RabbitMQ team
+%% @copyright 2023 VMware, Inc. or its affiliates.
+%%
+%% @doc
+%% This module sits in front of {@link rabbit_ff_registry}.
+%%
+%% If {@link rabbit_ff_registry} is the registry stub, calls to its functions
+%% will return `init_required'. In this case, this module is responsible for
+%% running the registry initialization and call {@link rabbit_ff_registry}
+%% again.
+%%
+%% It was introduced to fix a deadlock with the Code server when a process
+%% called the registry stub and that triggered the initialization of the
+%% registry. If the registry stub was already marked as deleted in the Code
+%% server, purging it while a process lingered on the deleted copy would cause
+%% that deadlock: the Code server would never return because that process
+%% would never make progress and thus would never stop using the deleted
+%% registry stub.
+
+-module(rabbit_ff_registry_wrapper).
+
+-compile({no_auto_import, [get/1]}).
+
+-export([get/1,
+ list/1,
+ states/0,
+ is_supported/1,
+ is_enabled/1,
+ inventory/0]).
+
+-spec get(FeatureName) -> FeatureProps when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ FeatureProps :: rabbit_feature_flags:feature_props_extended() |
+ undefined.
+%% @doc
+%% Returns the properties of a feature flag.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag.
+%% @returns the properties of the specified feature flag.
+
+get(FeatureName) ->
+ case rabbit_ff_registry:get(FeatureName) of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ get(FeatureName);
+ Ret ->
+ Ret
+ end.
+
+-spec list(Which) -> FeatureFlags when
+ Which :: all | enabled | disabled,
+ FeatureFlags :: rabbit_feature_flags:feature_flags().
+%% @doc
+%% Lists all, enabled or disabled feature flags, depending on the argument.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param Which The group of feature flags to return: `all', `enabled' or
+%% `disabled'.
+%% @returns A map of selected feature flags.
+
+list(Which) ->
+ case rabbit_ff_registry:list(Which) of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ list(Which);
+ Ret ->
+ Ret
+ end.
+
+-spec states() -> FeatureStates when
+ FeatureStates :: rabbit_feature_flags:feature_states().
+%% @doc
+%% Returns the states of supported feature flags.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @returns A map of feature flag states.
+
+states() ->
+ case rabbit_ff_registry:states() of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ states();
+ Ret ->
+ Ret
+ end.
+
+-spec is_supported(FeatureName) -> Supported when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ Supported :: boolean().
+%% @doc
+%% Returns if a feature flag is supported.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag to be checked.
+%% @returns `true' if the feature flag is supported, or `false'
+%% otherwise.
+
+is_supported(FeatureName) ->
+ case rabbit_ff_registry:is_supported(FeatureName) of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ is_supported(FeatureName);
+ Ret ->
+ Ret
+ end.
+
+-spec is_enabled(FeatureName) -> Enabled when
+ FeatureName :: rabbit_feature_flags:feature_name(),
+ Enabled :: boolean() | state_changing.
+%% @doc
+%% Returns if a feature flag is enabled or if its state is changing.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag to be checked.
+%% @returns `true' if the feature flag is enabled, `state_changing' if
+%% its state is transient, or `false' otherwise.
+
+is_enabled(FeatureName) ->
+ case rabbit_ff_registry:is_enabled(FeatureName) of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ is_enabled(FeatureName);
+ Ret ->
+ Ret
+ end.
+
+-spec inventory() -> Inventory when
+ Inventory :: rabbit_feature_flags:inventory().
+%% @doc Unused for now (FIXME)
+
+inventory() ->
+ case rabbit_ff_registry:inventory() of
+ init_required ->
+ _ = rabbit_ff_registry_factory:initialize_registry(),
+ inventory();
+ Ret ->
+ Ret
+ end.
diff --git a/deps/rabbit/test/feature_flags_SUITE.erl b/deps/rabbit/test/feature_flags_SUITE.erl
index ffe1f5c00f..7e3c213c2d 100644
--- a/deps/rabbit/test/feature_flags_SUITE.erl
+++ b/deps/rabbit/test/feature_flags_SUITE.erl
@@ -25,7 +25,8 @@
registry_general_usage/1,
registry_concurrent_reloads/1,
- try_to_deadlock_in_registry_reload/1,
+ try_to_deadlock_in_registry_reload_1/1,
+ try_to_deadlock_in_registry_reload_2/1,
enable_feature_flag_in_a_healthy_situation/1,
enable_unsupported_feature_flag_in_a_healthy_situation/1,
enable_feature_flag_when_ff_file_is_unwritable/1,
@@ -102,7 +103,8 @@ groups() ->
[
registry_general_usage,
registry_concurrent_reloads,
- try_to_deadlock_in_registry_reload
+ try_to_deadlock_in_registry_reload_1,
+ try_to_deadlock_in_registry_reload_2
]},
{feature_flags_v2, [], Groups}
].
@@ -120,6 +122,11 @@ init_per_suite(Config) ->
end_per_suite(Config) ->
rabbit_ct_helpers:run_teardown_steps(Config).
+init_per_group(registry, Config) ->
+ logger:set_primary_config(level, debug),
+ rabbit_ct_helpers:run_steps(
+ Config,
+ [fun rabbit_ct_helpers:redirect_logger_to_ct_logs/1]);
init_per_group(feature_flags_v2, Config) ->
%% `feature_flags_v2' is now required and won't work in mixed-version
%% clusters if the other version doesn't support it.
@@ -213,7 +220,6 @@ init_per_testcase(Testcase, Config) ->
end,
case ?config(tc_group_properties, Config1) of
[{name, registry} | _] ->
- logger:set_primary_config(level, debug),
FeatureFlagsFile = filename:join(?config(priv_dir, Config1),
rabbit_misc:format(
"feature_flags-~ts",
@@ -308,7 +314,7 @@ end_per_testcase(Testcase, Config) ->
%% -------------------------------------------------------------------
-define(list_ff(Which),
- lists:sort(maps:keys(rabbit_ff_registry:list(Which)))).
+ lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(Which)))).
registry_general_usage(_Config) ->
%% At first, the registry must be uninitialized.
@@ -330,20 +336,20 @@ registry_general_usage(_Config) ->
?assert(rabbit_ff_registry:is_registry_initialized()),
?assertMatch([ff_a, ff_b], ?list_ff(all)),
- ?assert(rabbit_ff_registry:is_supported(ff_a)),
- ?assert(rabbit_ff_registry:is_supported(ff_b)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_c)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
?assertEqual(#{ff_a => false,
- ff_b => false}, rabbit_ff_registry:states()),
+ ff_b => false}, rabbit_ff_registry_wrapper:states()),
?assertMatch([], ?list_ff(enabled)),
?assertMatch([], ?list_ff(state_changing)),
?assertMatch([ff_a, ff_b], ?list_ff(disabled)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
%% We can declare a new feature flag at runtime. All of them are
%% supported but still disabled.
@@ -354,23 +360,23 @@ registry_general_usage(_Config) ->
rabbit_feature_flags:inject_test_feature_flags(NewFeatureFlags),
rabbit_ff_registry_factory:initialize_registry(),
?assertMatch([ff_a, ff_b, ff_c],
- lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+ lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
- ?assert(rabbit_ff_registry:is_supported(ff_a)),
- ?assert(rabbit_ff_registry:is_supported(ff_b)),
- ?assert(rabbit_ff_registry:is_supported(ff_c)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
?assertEqual(#{ff_a => false,
ff_b => false,
- ff_c => false}, rabbit_ff_registry:states()),
+ ff_c => false}, rabbit_ff_registry_wrapper:states()),
?assertMatch([], ?list_ff(enabled)),
?assertMatch([], ?list_ff(state_changing)),
?assertMatch([ff_a, ff_b, ff_c], ?list_ff(disabled)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
%% After enabling `ff_a`, it is actually the case. Others are
%% supported but remain disabled.
@@ -378,23 +384,23 @@ registry_general_usage(_Config) ->
#{ff_a => true},
true),
?assertMatch([ff_a, ff_b, ff_c],
- lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+ lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
- ?assert(rabbit_ff_registry:is_supported(ff_a)),
- ?assert(rabbit_ff_registry:is_supported(ff_b)),
- ?assert(rabbit_ff_registry:is_supported(ff_c)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
?assertEqual(#{ff_a => true,
ff_b => false,
- ff_c => false}, rabbit_ff_registry:states()),
+ ff_c => false}, rabbit_ff_registry_wrapper:states()),
?assertMatch([ff_a], ?list_ff(enabled)),
?assertMatch([], ?list_ff(state_changing)),
?assertMatch([ff_b, ff_c], ?list_ff(disabled)),
- ?assert(rabbit_ff_registry:is_enabled(ff_a)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
%% This time, we mark the state of `ff_c` as `state_changing`. We
%% expect all other feature flag states to remain unchanged.
@@ -403,23 +409,25 @@ registry_general_usage(_Config) ->
ff_c => state_changing},
true),
?assertMatch([ff_a, ff_b, ff_c],
- lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+ lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
- ?assert(rabbit_ff_registry:is_supported(ff_a)),
- ?assert(rabbit_ff_registry:is_supported(ff_b)),
- ?assert(rabbit_ff_registry:is_supported(ff_c)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
- ?assertEqual(#{ff_a => false,
- ff_b => false,
- ff_c => state_changing}, rabbit_ff_registry:states()),
+ ?assertEqual(
+ #{ff_a => false,
+ ff_b => false,
+ ff_c => state_changing},
+ rabbit_ff_registry_wrapper:states()),
?assertMatch([], ?list_ff(enabled)),
?assertMatch([ff_c], ?list_ff(state_changing)),
?assertMatch([ff_a, ff_b], ?list_ff(disabled)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
- ?assertMatch(state_changing, rabbit_ff_registry:is_enabled(ff_c)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+ ?assertMatch(state_changing, rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
%% Finally, we disable `ff_c`. All of them are supported but
%% disabled.
@@ -428,23 +436,23 @@ registry_general_usage(_Config) ->
ff_c => false},
true),
?assertMatch([ff_a, ff_b, ff_c],
- lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+ lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
- ?assert(rabbit_ff_registry:is_supported(ff_a)),
- ?assert(rabbit_ff_registry:is_supported(ff_b)),
- ?assert(rabbit_ff_registry:is_supported(ff_c)),
- ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+ ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
?assertEqual(#{ff_a => false,
ff_b => false,
- ff_c => false}, rabbit_ff_registry:states()),
+ ff_c => false}, rabbit_ff_registry_wrapper:states()),
?assertMatch([], ?list_ff(enabled)),
?assertMatch([], ?list_ff(state_changing)),
?assertMatch([ff_a, ff_b, ff_c], ?list_ff(disabled)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
- ?assertNot(rabbit_ff_registry:is_enabled(ff_d)).
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+ ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)).
registry_concurrent_reloads(_Config) ->
case rabbit_ff_registry:is_registry_initialized() of
@@ -559,7 +567,7 @@ registry_spammer1(FeatureNames) ->
?assertEqual(FeatureNames, ?list_ff(all)),
registry_spammer1(FeatureNames).
-try_to_deadlock_in_registry_reload(_Config) ->
+try_to_deadlock_in_registry_reload_1(_Config) ->
rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
_ = code:delete(rabbit_ff_registry),
?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
@@ -568,37 +576,209 @@ try_to_deadlock_in_registry_reload(_Config) ->
FeatureProps = #{provided_by => rabbit,
stability => stable},
- Lock = rabbit_ff_registry_factory:registry_loading_lock(),
- global:set_lock(Lock, [node()]),
-
Parent = self(),
+
+ %% Deadlock steps:
+ %% * Process A acquires the lock first.
+ %% * Process B loads the registry stub and waits for the lock.
+ %% * Process A deletes the registry stub and loads the initialized
+ %% registry.
+ %% * Process B wants to purge the deleted registry stub by sending a
+ %% request to the Code server.
+ %%
+ %% => Process B waits forever the return from the Code server because the
+ %% Code server waits for process B to be runnable again to handle the
+ %% signal.
+
ProcessA = spawn_link(
fun() ->
- FF = rabbit_ff_registry:get(FeatureName),
+ %% Process A acquires the lock manually first to
+ %% ensure the ordering of events. It can be acquired
+ %% recursively, so the feature flag injection can
+ %% "acquire" it again and proceed.
+ ct:pal("Process A: Acquire registry loading lock"),
+ Lock =
+ rabbit_ff_registry_factory:registry_loading_lock(),
+ global:set_lock(Lock, [node()]),
+ receive proceed -> ok end,
+
+ ct:pal(
+ "Process A: "
+ "Inject arbitrary feature flag to reload "
+ "registry"),
+ rabbit_feature_flags:inject_test_feature_flags(
+ #{FeatureName => FeatureProps}),
+
+ ct:pal("Process A: Release registry loading lock"),
+ global:del_lock(Lock, [node()]),
+
+ ct:pal("Process A: Exiting..."),
+ erlang:unlink(Parent)
+ end),
+ timer:sleep(500),
+
+ ProcessB = spawn_link(
+ fun() ->
+ %% Process B is the one loading the registry stub and
+ %% wants to initialize the real registry.
+ ct:pal(
+ "Process B: "
+ "Trigger automatic initial registry load"),
+ FF = rabbit_ff_registry_wrapper:get(FeatureName),
+
+ ct:pal("Process B: Exiting..."),
erlang:unlink(Parent),
Parent ! {self(), FF}
end),
- timer:sleep(1000),
-
- ct:pal("Inject arbitrary feature flag to reload registry"),
- rabbit_feature_flags:inject_test_feature_flags(
- #{FeatureName => FeatureProps}),
+ timer:sleep(500),
+
+ begin
+ {_, StacktraceA} = erlang:process_info(ProcessA, current_stacktrace),
+ {_, StacktraceB} = erlang:process_info(ProcessB, current_stacktrace),
+ ct:pal(
+ "Process stacktraces:~n"
+ " Process A: ~p~n"
+ " Process B: ~p",
+ [StacktraceA, StacktraceB])
+ end,
- ct:pal("Release registry loading lock"),
- global:del_lock(Lock, [node()]),
+ %% Process A is resumed. Without a proper check, process B would try to
+ %% purge the copy of the registry it is currently using itself, causing a
+ %% deadlock because the Code server wants process A to handle a signal, but
+ %% process A is not runnable.
+ ProcessA ! proceed,
- ct:pal("Waiting for process A to exit"),
+ ct:pal("Waiting for process B to exit"),
receive
- {ProcessA, FF} ->
+ {ProcessB, FF} ->
?assertEqual(FeatureProps, FF),
ok
after 10000 ->
- {_, Stacktrace} = erlang:process_info(
- ProcessA, current_stacktrace),
- ct:pal("Process A stuck; stacktrace: ~p", [Stacktrace]),
+ {_, StacktraceB} = erlang:process_info(
+ ProcessB, current_stacktrace),
+ ct:pal("Process B stuck; stacktrace: ~p", [StacktraceB]),
error(registry_reload_deadlock)
end.
+try_to_deadlock_in_registry_reload_2(_Config) ->
+ rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
+ _ = code:delete(rabbit_ff_registry),
+ ?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
+
+ FeatureName = ?FUNCTION_NAME,
+ FeatureProps = #{provided_by => rabbit,
+ stability => stable},
+
+ ct:pal("Inject arbitrary feature flag to reload registry"),
+ rabbit_feature_flags:inject_test_feature_flags(
+ #{FeatureName => FeatureProps},
+ false),
+
+ _ = erlang:process_flag(trap_exit, true),
+
+ ct:pal("Parent ~p: Acquire registry loading lock", [self()]),
+ Lock = rabbit_ff_registry_factory:registry_loading_lock(),
+ global:set_lock(Lock, [node()]),
+
+ Parent = self(),
+
+ %% Deadlock steps:
+ %% * Processes A, B1 and B2 wait for the lock. The registry stub is loaded.
+ %% * Process B1 acquires the lock.
+ %% * Process B1 deletes the registry stub and loads the initialized
+ %% registry.
+ %% * Process A acquires the lock.
+ %% * Process A wants to purge the deleted registry stub by sending a
+ %% request to the Code server.
+ %%
+ %% => Process A waits forever the return from the Code server because the
+ %% Code server waits for process B2 to stop lingering on the deleted
+ %% registry stub, but process B2 waits for the lock.
+
+ ProcessA = spawn_link(
+ fun() ->
+ %% Process A acquires the lock automatically as part
+ %% of requesting an explicit initialization of the
+ %% registry. Process A doesn't linger on the registry
+ %% stub.
+ ct:pal(
+ "Process A ~p: "
+ "Trigger manual initial registry load",
+ [self()]),
+ rabbit_ff_registry_factory:initialize_registry(),
+
+ ct:pal("Process A ~p: Exiting...", [self()]),
+ erlang:unlink(Parent),
+ Parent ! {self(), done}
+ end),
+
+ FunB = fun() ->
+ %% Processes B1 and B2 acquire the lock automatically as
+ %% part of trying to load the registry as part of they
+ %% querying a feature flag.
+ ct:pal(
+ "Process B ~p: "
+ "Trigger automatic initial registry load",
+ [self()]),
+ _ = rabbit_ff_registry_wrapper:get(FeatureName),
+
+ ct:pal(
+ "Process B ~p: Exiting...",
+ [self()]),
+ erlang:unlink(Parent),
+ Parent ! {self(), done}
+ end,
+ ProcessB1 = spawn_link(FunB),
+ ProcessB2 = spawn_link(FunB),
+ timer:sleep(500),
+
+ %% We use `erlang:suspend_process/1' and `erlang:resume_process/1' to
+ %% ensure the order in which processes acquire the lock.
+ erlang:suspend_process(ProcessA),
+ erlang:suspend_process(ProcessB1),
+ erlang:suspend_process(ProcessB2),
+ timer:sleep(500),
+
+ ct:pal("Parent ~p: Release registry loading lock", [self()]),
+ global:del_lock(Lock, [node()]),
+
+ erlang:resume_process(ProcessB1),
+ timer:sleep(500),
+ erlang:resume_process(ProcessA),
+ timer:sleep(500),
+ erlang:resume_process(ProcessB2),
+
+ ct:pal("Waiting for processes to exit"),
+ Procs = [ProcessA, ProcessB1, ProcessB2],
+ lists:foreach(
+ fun(Pid) ->
+ receive
+ {Pid, done} ->
+ ok;
+ {'EXIT', Pid, Reason} ->
+ ct:pal("Process ~p exited; reason: ~p", [Pid, Reason]),
+ error(test_process_killed)
+ after 10000 ->
+ lists:foreach(
+ fun(Pid1) ->
+ PI = erlang:process_info(
+ Pid1, current_stacktrace),
+ case PI of
+ undefined ->
+ ok;
+ {_, Stacktrace} ->
+ ct:pal(
+ "Process ~p stuck; "
+ "stacktrace: ~p",
+ [Pid1, Stacktrace])
+ end
+ end, Procs),
+ error(registry_reload_deadlock)
+ end
+ end, Procs),
+
+ ok.
+
enable_feature_flag_in_a_healthy_situation(Config) ->
FeatureName = ff_from_testsuite,
ClusterSize = ?config(rmq_nodes_count, Config),
diff --git a/deps/rabbit/test/feature_flags_v2_SUITE.erl b/deps/rabbit/test/feature_flags_v2_SUITE.erl
index dc1c6636b3..ee3d95f259 100644
--- a/deps/rabbit/test/feature_flags_v2_SUITE.erl
+++ b/deps/rabbit/test/feature_flags_v2_SUITE.erl
@@ -175,6 +175,7 @@ setup_slave_node(Config) ->
ok = setup_logger(),
ok = setup_data_dir(Config),
ok = setup_feature_flags_file(Config),
+ _ = rabbit_ff_registry_factory:initialize_registry(),
ok = start_controller(),
ok = rabbit_feature_flags:enable(feature_flags_v2),
ok.