rabbit_feature_flags: Fix possible deadlock when calling the Code server (take 2)fix-feature-flags-init-vs-code_server-deadlock-take2

[Why] The background reason for this fix is about the same as the one explained in the previous version of this fix; see commit e0a2f1027278bd0901bdaa9af6065abc676ff14f. This time, the order of events that led to a similar deadlock is the following: 0. No `rabbit_ff_registry` is loaded yet. 1. Process A, B and C call `rabbit_ff_registry:something()` indirectly which triggers two initializations in parallel. * Process A did it from an explicit call to `rabbit_ff_registry_factory:initialize_factory()` during RabbitMQ boot. * Process B and C indirectly called it because they checked if a feature flag was enabled. 2. Process B acquires the lock first and finishes the initialization. A new registry is loaded and the old `rabbit_ff_registry` module copy is marked as "old". At this point, process A and C still reference that old copy because `rabbit_ff_registry:something()` is up above in its call stack. 3. Process A acquires the lock, prepares the new registry and tries to soft-purge the old `rabbit_ff_registry` copy before loading the new one. This is where the deadlock happens: process A requests the Code server to purge the old copy, but the Code server waits for process C to stop using it. The difference between the steps described in the first bug fix attempt's commit and these ones is that the process which lingers on the deleted `rabbit_ff_registry` (process C above) isn't the one who acquired the lock; process A has it. That's why the first bug fix isn't effective in this case: it relied on the fact that the process which lingers on the deleted `rabbit_ff_registry` is the process which attempts to purge the module. [How] In this commit, we go with a more drastic change. This time, we put a wrapper in front of `rabbit_ff_registry` called `rabbit_ff_registry_wrapper`. This wrapper is responsible for doing the automatic initialization if the loaded registry is the stub module. The `rabbit_ff_registry` stub now always returns `init_required` instead of performing the initialization and calling itself recursively. This way, processes linger on `rabbit_ff_registry_wrapper`, not on `rabbit_ff_registry`. Thanks to this, the Code server can proceed with the purge. See #8112.
author: Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr> 2023-05-12 17:11:57 +0200
committer: Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr> 2023-05-16 10:46:32 +0200
commit: 213eb20e1658a1cebb4c1a07606e28fc7b62ebca (patch)
tree: f7ae3a58697bb94120e5333e21e748b30a8be4cb
parent: c7d0427d621493f00a3aa0d6e7720f709f3769a6 (diff)
download: rabbitmq-server-git-fix-feature-flags-init-vs-code_server-deadlock-take2.tar.gz
8 files changed, 475 insertions, 117 deletions
diff --git a/deps/rabbit/app.bzl b/deps/rabbit/app.bzl
index f8ac64f82e..6a17514549 100644
--- a/deps/rabbit/app.bzl
+++ b/deps/rabbit/app.bzl
@@ -114,6 +114,7 @@ def all_beam_files(name = "all_beam_files"):
             "src/rabbit_ff_extra.erl",
             "src/rabbit_ff_registry.erl",
             "src/rabbit_ff_registry_factory.erl",
+            "src/rabbit_ff_registry_wrapper.erl",
             "src/rabbit_fhc_helpers.erl",
             "src/rabbit_fifo.erl",
             "src/rabbit_fifo_client.erl",
@@ -355,6 +356,7 @@ def all_test_beam_files(name = "all_test_beam_files"):
             "src/rabbit_ff_extra.erl",
             "src/rabbit_ff_registry.erl",
             "src/rabbit_ff_registry_factory.erl",
+            "src/rabbit_ff_registry_wrapper.erl",
             "src/rabbit_fhc_helpers.erl",
             "src/rabbit_fifo.erl",
             "src/rabbit_fifo_client.erl",
@@ -609,6 +611,7 @@ def all_srcs(name = "all_srcs"):
             "src/rabbit_ff_extra.erl",
             "src/rabbit_ff_registry.erl",
             "src/rabbit_ff_registry_factory.erl",
+            "src/rabbit_ff_registry_wrapper.erl",
             "src/rabbit_fhc_helpers.erl",
             "src/rabbit_fifo.erl",
             "src/rabbit_fifo_client.erl",
diff --git a/deps/rabbit/src/rabbit_feature_flags.erl b/deps/rabbit/src/rabbit_feature_flags.erl
index 3f35fc47a9..06e46cc03a 100644
--- a/deps/rabbit/src/rabbit_feature_flags.erl
+++ b/deps/rabbit/src/rabbit_feature_flags.erl
@@ -113,7 +113,7 @@
          remote_nodes/0,
          running_remote_nodes/0,
          does_node_support/3,
-         inject_test_feature_flags/1,
+         inject_test_feature_flags/1, inject_test_feature_flags/2,
          query_supported_feature_flags/0,
          does_enabled_feature_flags_list_file_exist/0,
          read_enabled_feature_flags_list/0,
@@ -336,8 +336,8 @@ list() -> list(all).
 %% `disabled'.
 %% @returns A map of selected feature flags.
 
-list(all)      -> rabbit_ff_registry:list(all);
-list(enabled)  -> rabbit_ff_registry:list(enabled);
+list(all)      -> rabbit_ff_registry_wrapper:list(all);
+list(enabled)  -> rabbit_ff_registry_wrapper:list(enabled);
 list(disabled) -> maps:filter(
                     fun(FeatureName, _) -> is_disabled(FeatureName) end,
                     list(all)).
@@ -381,7 +381,7 @@ enable(FeatureNames) when is_list(FeatureNames) ->
     with_feature_flags(FeatureNames, fun enable/1).
 
 uses_callbacks(FeatureName) when is_atom(FeatureName) ->
-    case rabbit_ff_registry:get(FeatureName) of
+    case rabbit_ff_registry_wrapper:get(FeatureName) of
         undefined    -> false;
         FeatureProps -> uses_callbacks(FeatureProps)
     end;
@@ -504,9 +504,11 @@ is_supported(FeatureNames, Timeout) ->
 %%   `false' if one of them is not.
 
 is_supported_locally(FeatureName) when is_atom(FeatureName) ->
-    rabbit_ff_registry:is_supported(FeatureName);
+    rabbit_ff_registry_wrapper:is_supported(FeatureName);
 is_supported_locally(FeatureNames) when is_list(FeatureNames) ->
-    lists:all(fun(F) -> rabbit_ff_registry:is_supported(F) end, FeatureNames).
+    lists:all(
+      fun(F) -> rabbit_ff_registry_wrapper:is_supported(F) end,
+      FeatureNames).
 
 -spec is_enabled(feature_name() | [feature_name()]) -> boolean().
 %% @doc
@@ -563,19 +565,19 @@ is_enabled(FeatureNames, blocking) ->
     end.
 
 is_enabled_nb(FeatureName) when is_atom(FeatureName) ->
-    rabbit_ff_registry:is_enabled(FeatureName);
+    rabbit_ff_registry_wrapper:is_enabled(FeatureName);
 is_enabled_nb(FeatureNames) when is_list(FeatureNames) ->
     lists:foldl(
       fun
           (_F, state_changing = Acc) ->
               Acc;
           (F, false = Acc) ->
-              case rabbit_ff_registry:is_enabled(F) of
+              case rabbit_ff_registry_wrapper:is_enabled(F) of
                   state_changing -> state_changing;
                   _              -> Acc
               end;
           (F, _) ->
-              rabbit_ff_registry:is_enabled(F)
+              rabbit_ff_registry_wrapper:is_enabled(F)
       end,
       true, FeatureNames).
 
@@ -710,7 +712,7 @@ get_state(FeatureName) when is_atom(FeatureName) ->
 %% given feature flag name doesn't correspond to a known feature flag.
 
 get_stability(FeatureName) when is_atom(FeatureName) ->
-    case rabbit_ff_registry:get(FeatureName) of
+    case rabbit_ff_registry_wrapper:get(FeatureName) of
         undefined    -> undefined;
         FeatureProps -> get_stability(FeatureProps)
     end;
@@ -738,6 +740,9 @@ init() ->
 -define(PT_TESTSUITE_ATTRS, {?MODULE, testsuite_feature_flags_attrs}).
 
 inject_test_feature_flags(FeatureFlags) ->
+    inject_test_feature_flags(FeatureFlags, true).
+
+inject_test_feature_flags(FeatureFlags, InitReg) ->
     ExistingAppAttrs = module_attributes_from_testsuite(),
     FeatureFlagsPerApp0 = lists:foldl(
                             fun({Origin, Origin, FFlags}, Acc) ->
@@ -768,7 +773,10 @@ inject_test_feature_flags(FeatureFlags) ->
       [FeatureFlags, AttributesFromTestsuite],
       #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
     ok = persistent_term:put(?PT_TESTSUITE_ATTRS, AttributesFromTestsuite),
-    rabbit_ff_registry_factory:initialize_registry().
+    case InitReg of
+        true  -> rabbit_ff_registry_factory:initialize_registry();
+        false -> ok
+    end.
 
 module_attributes_from_testsuite() ->
     persistent_term:get(?PT_TESTSUITE_ATTRS, []).
diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl
index cc46ad5d24..ca6dd84984 100644
--- a/deps/rabbit/src/rabbit_ff_controller.erl
+++ b/deps/rabbit/src/rabbit_ff_controller.erl
@@ -1339,7 +1339,7 @@ enable_dependencies1(
       Rets :: #{node() => term()}.
 
 run_callback(Nodes, FeatureName, Command, Extra, Timeout) ->
-    FeatureProps = rabbit_ff_registry:get(FeatureName),
+    FeatureProps = rabbit_ff_registry_wrapper:get(FeatureName),
     Callbacks = maps:get(callbacks, FeatureProps, #{}),
     case Callbacks of
         #{Command := {CallbackMod, CallbackFun}}
diff --git a/deps/rabbit/src/rabbit_ff_registry.erl b/deps/rabbit/src/rabbit_ff_registry.erl
index 26718e2cc7..0ec784c8a0 100644
--- a/deps/rabbit/src/rabbit_ff_registry.erl
+++ b/deps/rabbit/src/rabbit_ff_registry.erl
@@ -37,22 +37,26 @@
 -on_load(on_load/0).
 -endif.
 
-%% Initially, is_registry_initialized/0 always returns false and this `Call'
-%% is always called. The case statement is here to convince Dialyzer that the
-%% function could return values of type `__ReturnedIfUninitialized' or
+%% In this registry stub, most functions want to return `init_required' to let
+%% {@link rabbit_ff_registry_wrapper} do the first time initialization.
+%%
+%% The inner case statement is here to convince Dialyzer that the function
+%% could return values of type `__ReturnedIfUninitialized' or
 %% `__NeverReturned'.
 %%
 %% If the function was only calling itself (`Call'), Dialyzer would consider
-%% that it would never return.
+%% that it would never return. With the outer case, Dialyzer would conclude
+%% that `__ReturnedIfUninitialized' is always returned and other values will
+%% never be returned and there is no point in expecting them.
 %%
-%% With just `is_registry_initialized()' case, Dialyzer would conclude that
-%% `__ReturnedIfUninitialized' is always returned and other values will never
-%% be returned and there is no point in expecting them.
+%% In the end, `Call' is never executed because {@link
+%% rabbit_ff_registry_wrapper} is responsible for calling the registry
+%% function again after initialization.
 %%
 %% With both cases in place, it seems that we can convince Dialyzer that the
 %% function returns values matching its spec.
 -define(convince_dialyzer(__Call, __ReturnedIfUninitialized, __NeverReturned),
-        case is_registry_initialized() of
+        case always_return_true() of
             false ->
                 __Call;
             true ->
@@ -62,8 +66,11 @@
                 end
         end).
 
--spec get(rabbit_feature_flags:feature_name()) ->
-    rabbit_feature_flags:feature_props_extended() | undefined.
+-spec get(FeatureName) -> Ret when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      Ret :: FeatureProps | init_required,
+      FeatureProps :: rabbit_feature_flags:feature_props_extended() |
+                      undefined.
 %% @doc
 %% Returns the properties of a feature flag.
 %%
@@ -74,13 +81,15 @@
 %% @returns the properties of the specified feature flag.
 
 get(FeatureName) ->
-    _ = rabbit_ff_registry_factory:initialize_registry(),
     ?convince_dialyzer(
        ?MODULE:get(FeatureName),
-       undefined,
+       init_required,
        #{provided_by => rabbit}).
 
--spec list(all | enabled | disabled) -> rabbit_feature_flags:feature_flags().
+-spec list(Which) -> Ret when
+      Which :: all | enabled | disabled,
+      Ret :: FeatureFlags | init_required,
+      FeatureFlags :: rabbit_feature_flags:feature_flags().
 %% @doc
 %% Lists all, enabled or disabled feature flags, depending on the argument.
 %%
@@ -92,10 +101,11 @@ get(FeatureName) ->
 %% @returns A map of selected feature flags.
 
 list(Which) ->
-    _ = rabbit_ff_registry_factory:initialize_registry(),
-    ?convince_dialyzer(?MODULE:list(Which), #{}, #{}).
+    ?convince_dialyzer(?MODULE:list(Which), init_required, #{}).
 
--spec states() -> rabbit_feature_flags:feature_states().
+-spec states() -> Ret when
+      Ret :: FeatureStates | init_required,
+      FeatureStates :: rabbit_feature_flags:feature_states().
 %% @doc
 %% Returns the states of supported feature flags.
 %%
@@ -105,10 +115,12 @@ list(Which) ->
 %% @returns A map of feature flag states.
 
 states() ->
-    _ = rabbit_ff_registry_factory:initialize_registry(),
-    ?convince_dialyzer(?MODULE:states(), #{}, #{}).
+    ?convince_dialyzer(?MODULE:states(), init_required, #{}).
 
--spec is_supported(rabbit_feature_flags:feature_name()) -> boolean().
+-spec is_supported(FeatureName) -> Ret when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      Ret :: Supported | init_required,
+      Supported :: boolean().
 %% @doc
 %% Returns if a feature flag is supported.
 %%
@@ -120,10 +132,12 @@ states() ->
 %%   otherwise.
 
 is_supported(FeatureName) ->
-    _ = rabbit_ff_registry_factory:initialize_registry(),
-    ?convince_dialyzer(?MODULE:is_supported(FeatureName), false, true).
+    ?convince_dialyzer(?MODULE:is_supported(FeatureName), init_required, true).
 
--spec is_enabled(rabbit_feature_flags:feature_name()) -> boolean() | state_changing.
+-spec is_enabled(FeatureName) -> Ret when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      Ret :: Enabled | init_required,
+      Enabled :: boolean() | state_changing.
 %% @doc
 %% Returns if a feature flag is enabled or if its state is changing.
 %%
@@ -135,8 +149,7 @@ is_supported(FeatureName) ->
 %%   its state is transient, or `false' otherwise.
 
 is_enabled(FeatureName) ->
-    _ = rabbit_ff_registry_factory:initialize_registry(),
-    ?convince_dialyzer(?MODULE:is_enabled(FeatureName), false, true).
+    ?convince_dialyzer(?MODULE:is_enabled(FeatureName), init_required, true).
 
 -spec is_registry_initialized() -> boolean().
 %% @doc
@@ -169,7 +182,9 @@ is_registry_initialized() ->
 is_registry_written_to_disk() ->
     always_return_true().
 
--spec inventory() -> rabbit_feature_flags:inventory().
+-spec inventory() -> Ret when
+      Ret :: Inventory | init_required,
+      Inventory :: rabbit_feature_flags:inventory().
 
 inventory() ->
     _ = rabbit_ff_registry_factory:initialize_registry(),
diff --git a/deps/rabbit/src/rabbit_ff_registry_factory.erl b/deps/rabbit/src/rabbit_ff_registry_factory.erl
index cccf202a65..2a9a868654 100644
--- a/deps/rabbit/src/rabbit_ff_registry_factory.erl
+++ b/deps/rabbit/src/rabbit_ff_registry_factory.erl
@@ -735,14 +735,9 @@ registry_vsn() ->
     proplists:get_value(vsn, Attrs, undefined).
 
 purge_old_registry(Mod) ->
-    case erlang:check_process_code(self(), rabbit_ff_registry) of
-        false ->
-            case code:is_loaded(Mod) of
-                {file, _} -> do_purge_old_registry(Mod);
-                false     -> ok
-            end;
-        true ->
-            ok
+    case code:is_loaded(Mod) of
+        {file, _} -> do_purge_old_registry(Mod);
+        false     -> ok
     end.
 
 do_purge_old_registry(Mod) ->
diff --git a/deps/rabbit/src/rabbit_ff_registry_wrapper.erl b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl
new file mode 100644
index 0000000000..b98d3a7e8d
--- /dev/null
+++ b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl
@@ -0,0 +1,156 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2023 VMware, Inc. or its affiliates.  All rights reserved.
+%%
+
+%% @author The RabbitMQ team
+%% @copyright 2023 VMware, Inc. or its affiliates.
+%%
+%% @doc
+%% This module sits in front of {@link rabbit_ff_registry}.
+%%
+%% If {@link rabbit_ff_registry} is the registry stub, calls to its functions
+%% will return `init_required'. In this case, this module is responsible for
+%% running the registry initialization and call {@link rabbit_ff_registry}
+%% again.
+%%
+%% It was introduced to fix a deadlock with the Code server when a process
+%% called the registry stub and that triggered the initialization of the
+%% registry. If the registry stub was already marked as deleted in the Code
+%% server, purging it while a process lingered on the deleted copy would cause
+%% that deadlock: the Code server would never return because that process
+%% would never make progress and thus would never stop using the deleted
+%% registry stub.
+
+-module(rabbit_ff_registry_wrapper).
+
+-compile({no_auto_import, [get/1]}).
+
+-export([get/1,
+         list/1,
+         states/0,
+         is_supported/1,
+         is_enabled/1,
+         inventory/0]).
+
+-spec get(FeatureName) -> FeatureProps when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      FeatureProps :: rabbit_feature_flags:feature_props_extended() |
+                      undefined.
+%% @doc
+%% Returns the properties of a feature flag.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag.
+%% @returns the properties of the specified feature flag.
+
+get(FeatureName) ->
+    case rabbit_ff_registry:get(FeatureName) of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            get(FeatureName);
+        Ret ->
+            Ret
+    end.
+
+-spec list(Which) -> FeatureFlags when
+      Which :: all | enabled | disabled,
+      FeatureFlags :: rabbit_feature_flags:feature_flags().
+%% @doc
+%% Lists all, enabled or disabled feature flags, depending on the argument.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param Which The group of feature flags to return: `all', `enabled' or
+%% `disabled'.
+%% @returns A map of selected feature flags.
+
+list(Which) ->
+    case rabbit_ff_registry:list(Which) of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            list(Which);
+        Ret ->
+            Ret
+    end.
+
+-spec states() -> FeatureStates when
+      FeatureStates :: rabbit_feature_flags:feature_states().
+%% @doc
+%% Returns the states of supported feature flags.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @returns A map of feature flag states.
+
+states() ->
+    case rabbit_ff_registry:states() of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            states();
+        Ret ->
+            Ret
+    end.
+
+-spec is_supported(FeatureName) -> Supported when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      Supported :: boolean().
+%% @doc
+%% Returns if a feature flag is supported.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag to be checked.
+%% @returns `true' if the feature flag is supported, or `false'
+%%   otherwise.
+
+is_supported(FeatureName) ->
+    case rabbit_ff_registry:is_supported(FeatureName) of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            is_supported(FeatureName);
+        Ret ->
+            Ret
+    end.
+
+-spec is_enabled(FeatureName) -> Enabled when
+      FeatureName :: rabbit_feature_flags:feature_name(),
+      Enabled :: boolean() | state_changing.
+%% @doc
+%% Returns if a feature flag is enabled or if its state is changing.
+%%
+%% Only the informations stored in the local registry is used to answer
+%% this call.
+%%
+%% @param FeatureName The name of the feature flag to be checked.
+%% @returns `true' if the feature flag is enabled, `state_changing' if
+%%   its state is transient, or `false' otherwise.
+
+is_enabled(FeatureName) ->
+    case rabbit_ff_registry:is_enabled(FeatureName) of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            is_enabled(FeatureName);
+        Ret ->
+            Ret
+    end.
+
+-spec inventory() -> Inventory when
+      Inventory :: rabbit_feature_flags:inventory().
+%% @doc Unused for now (FIXME)
+
+inventory() ->
+    case rabbit_ff_registry:inventory() of
+        init_required ->
+            _ = rabbit_ff_registry_factory:initialize_registry(),
+            inventory();
+        Ret ->
+            Ret
+    end.
diff --git a/deps/rabbit/test/feature_flags_SUITE.erl b/deps/rabbit/test/feature_flags_SUITE.erl
index ffe1f5c00f..7e3c213c2d 100644
--- a/deps/rabbit/test/feature_flags_SUITE.erl
+++ b/deps/rabbit/test/feature_flags_SUITE.erl
@@ -25,7 +25,8 @@
 
          registry_general_usage/1,
          registry_concurrent_reloads/1,
-         try_to_deadlock_in_registry_reload/1,
+         try_to_deadlock_in_registry_reload_1/1,
+         try_to_deadlock_in_registry_reload_2/1,
          enable_feature_flag_in_a_healthy_situation/1,
          enable_unsupported_feature_flag_in_a_healthy_situation/1,
          enable_feature_flag_when_ff_file_is_unwritable/1,
@@ -102,7 +103,8 @@ groups() ->
       [
        registry_general_usage,
        registry_concurrent_reloads,
-       try_to_deadlock_in_registry_reload
+       try_to_deadlock_in_registry_reload_1,
+       try_to_deadlock_in_registry_reload_2
       ]},
      {feature_flags_v2, [], Groups}
     ].
@@ -120,6 +122,11 @@ init_per_suite(Config) ->
 end_per_suite(Config) ->
     rabbit_ct_helpers:run_teardown_steps(Config).
 
+init_per_group(registry, Config) ->
+    logger:set_primary_config(level, debug),
+    rabbit_ct_helpers:run_steps(
+      Config,
+      [fun rabbit_ct_helpers:redirect_logger_to_ct_logs/1]);
 init_per_group(feature_flags_v2, Config) ->
     %% `feature_flags_v2' is now required and won't work in mixed-version
     %% clusters if the other version doesn't support it.
@@ -213,7 +220,6 @@ init_per_testcase(Testcase, Config) ->
               end,
     case ?config(tc_group_properties, Config1) of
         [{name, registry} | _] ->
-            logger:set_primary_config(level, debug),
             FeatureFlagsFile = filename:join(?config(priv_dir, Config1),
                                              rabbit_misc:format(
                                                "feature_flags-~ts",
@@ -308,7 +314,7 @@ end_per_testcase(Testcase, Config) ->
 %% -------------------------------------------------------------------
 
 -define(list_ff(Which),
-        lists:sort(maps:keys(rabbit_ff_registry:list(Which)))).
+        lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(Which)))).
 
 registry_general_usage(_Config) ->
     %% At first, the registry must be uninitialized.
@@ -330,20 +336,20 @@ registry_general_usage(_Config) ->
     ?assert(rabbit_ff_registry:is_registry_initialized()),
     ?assertMatch([ff_a, ff_b], ?list_ff(all)),
 
-    ?assert(rabbit_ff_registry:is_supported(ff_a)),
-    ?assert(rabbit_ff_registry:is_supported(ff_b)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
 
     ?assertEqual(#{ff_a => false,
-                   ff_b => false}, rabbit_ff_registry:states()),
+                   ff_b => false}, rabbit_ff_registry_wrapper:states()),
     ?assertMatch([], ?list_ff(enabled)),
     ?assertMatch([], ?list_ff(state_changing)),
     ?assertMatch([ff_a, ff_b], ?list_ff(disabled)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
 
     %% We can declare a new feature flag at runtime. All of them are
     %% supported but still disabled.
@@ -354,23 +360,23 @@ registry_general_usage(_Config) ->
     rabbit_feature_flags:inject_test_feature_flags(NewFeatureFlags),
     rabbit_ff_registry_factory:initialize_registry(),
     ?assertMatch([ff_a, ff_b, ff_c],
-                 lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+                 lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
 
-    ?assert(rabbit_ff_registry:is_supported(ff_a)),
-    ?assert(rabbit_ff_registry:is_supported(ff_b)),
-    ?assert(rabbit_ff_registry:is_supported(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
 
     ?assertEqual(#{ff_a => false,
                    ff_b => false,
-                   ff_c => false}, rabbit_ff_registry:states()),
+                   ff_c => false}, rabbit_ff_registry_wrapper:states()),
     ?assertMatch([], ?list_ff(enabled)),
     ?assertMatch([], ?list_ff(state_changing)),
     ?assertMatch([ff_a, ff_b, ff_c], ?list_ff(disabled)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
 
     %% After enabling `ff_a`, it is actually the case. Others are
     %% supported but remain disabled.
@@ -378,23 +384,23 @@ registry_general_usage(_Config) ->
                                                    #{ff_a => true},
                                                    true),
     ?assertMatch([ff_a, ff_b, ff_c],
-                 lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+                 lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
 
-    ?assert(rabbit_ff_registry:is_supported(ff_a)),
-    ?assert(rabbit_ff_registry:is_supported(ff_b)),
-    ?assert(rabbit_ff_registry:is_supported(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
 
     ?assertEqual(#{ff_a => true,
                    ff_b => false,
-                   ff_c => false}, rabbit_ff_registry:states()),
+                   ff_c => false}, rabbit_ff_registry_wrapper:states()),
     ?assertMatch([ff_a], ?list_ff(enabled)),
     ?assertMatch([], ?list_ff(state_changing)),
     ?assertMatch([ff_b, ff_c], ?list_ff(disabled)),
-    ?assert(rabbit_ff_registry:is_enabled(ff_a)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
 
     %% This time, we mark the state of `ff_c` as `state_changing`. We
     %% expect all other feature flag states to remain unchanged.
@@ -403,23 +409,25 @@ registry_general_usage(_Config) ->
                                                      ff_c => state_changing},
                                                    true),
     ?assertMatch([ff_a, ff_b, ff_c],
-                 lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+                 lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
 
-    ?assert(rabbit_ff_registry:is_supported(ff_a)),
-    ?assert(rabbit_ff_registry:is_supported(ff_b)),
-    ?assert(rabbit_ff_registry:is_supported(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
 
-    ?assertEqual(#{ff_a => false,
-                   ff_b => false,
-                   ff_c => state_changing}, rabbit_ff_registry:states()),
+    ?assertEqual(
+       #{ff_a => false,
+         ff_b => false,
+         ff_c => state_changing},
+       rabbit_ff_registry_wrapper:states()),
     ?assertMatch([], ?list_ff(enabled)),
     ?assertMatch([ff_c], ?list_ff(state_changing)),
     ?assertMatch([ff_a, ff_b], ?list_ff(disabled)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
-    ?assertMatch(state_changing, rabbit_ff_registry:is_enabled(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_d)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+    ?assertMatch(state_changing, rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)),
 
     %% Finally, we disable `ff_c`. All of them are supported but
     %% disabled.
@@ -428,23 +436,23 @@ registry_general_usage(_Config) ->
                                                      ff_c => false},
                                                    true),
     ?assertMatch([ff_a, ff_b, ff_c],
-                 lists:sort(maps:keys(rabbit_ff_registry:list(all)))),
+                 lists:sort(maps:keys(rabbit_ff_registry_wrapper:list(all)))),
 
-    ?assert(rabbit_ff_registry:is_supported(ff_a)),
-    ?assert(rabbit_ff_registry:is_supported(ff_b)),
-    ?assert(rabbit_ff_registry:is_supported(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_supported(ff_d)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_a)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_b)),
+    ?assert(rabbit_ff_registry_wrapper:is_supported(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_supported(ff_d)),
 
     ?assertEqual(#{ff_a => false,
                    ff_b => false,
-                   ff_c => false}, rabbit_ff_registry:states()),
+                   ff_c => false}, rabbit_ff_registry_wrapper:states()),
     ?assertMatch([], ?list_ff(enabled)),
     ?assertMatch([], ?list_ff(state_changing)),
     ?assertMatch([ff_a, ff_b, ff_c], ?list_ff(disabled)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_a)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_b)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_c)),
-    ?assertNot(rabbit_ff_registry:is_enabled(ff_d)).
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_a)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_b)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_c)),
+    ?assertNot(rabbit_ff_registry_wrapper:is_enabled(ff_d)).
 
 registry_concurrent_reloads(_Config) ->
     case rabbit_ff_registry:is_registry_initialized() of
@@ -559,7 +567,7 @@ registry_spammer1(FeatureNames) ->
     ?assertEqual(FeatureNames, ?list_ff(all)),
     registry_spammer1(FeatureNames).
 
-try_to_deadlock_in_registry_reload(_Config) ->
+try_to_deadlock_in_registry_reload_1(_Config) ->
     rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
     _ = code:delete(rabbit_ff_registry),
     ?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
@@ -568,37 +576,209 @@ try_to_deadlock_in_registry_reload(_Config) ->
     FeatureProps = #{provided_by => rabbit,
                      stability => stable},
 
-    Lock = rabbit_ff_registry_factory:registry_loading_lock(),
-    global:set_lock(Lock, [node()]),
-
     Parent = self(),
+
+    %% Deadlock steps:
+    %% * Process A acquires the lock first.
+    %% * Process B loads the registry stub and waits for the lock.
+    %% * Process A deletes the registry stub and loads the initialized
+    %%   registry.
+    %% * Process B wants to purge the deleted registry stub by sending a
+    %%   request to the Code server.
+    %%
+    %% => Process B waits forever the return from the Code server because the
+    %% Code server waits for process B to be runnable again to handle the
+    %% signal.
+
     ProcessA = spawn_link(
                  fun() ->
-                         FF = rabbit_ff_registry:get(FeatureName),
+                         %% Process A acquires the lock manually first to
+                         %% ensure the ordering of events. It can be acquired
+                         %% recursively, so the feature flag injection can
+                         %% "acquire" it again and proceed.
+                         ct:pal("Process A: Acquire registry loading lock"),
+                         Lock =
+                         rabbit_ff_registry_factory:registry_loading_lock(),
+                         global:set_lock(Lock, [node()]),
+                         receive proceed -> ok end,
+
+                         ct:pal(
+                           "Process A: "
+                           "Inject arbitrary feature flag to reload "
+                           "registry"),
+                         rabbit_feature_flags:inject_test_feature_flags(
+                           #{FeatureName => FeatureProps}),
+
+                         ct:pal("Process A: Release registry loading lock"),
+                         global:del_lock(Lock, [node()]),
+
+                         ct:pal("Process A: Exiting..."),
+                         erlang:unlink(Parent)
+                 end),
+    timer:sleep(500),
+
+    ProcessB = spawn_link(
+                 fun() ->
+                         %% Process B is the one loading the registry stub and
+                         %% wants to initialize the real registry.
+                         ct:pal(
+                           "Process B: "
+                           "Trigger automatic initial registry load"),
+                         FF = rabbit_ff_registry_wrapper:get(FeatureName),
+
+                         ct:pal("Process B: Exiting..."),
                          erlang:unlink(Parent),
                          Parent ! {self(), FF}
                  end),
-    timer:sleep(1000),
-
-    ct:pal("Inject arbitrary feature flag to reload registry"),
-    rabbit_feature_flags:inject_test_feature_flags(
-      #{FeatureName => FeatureProps}),
+    timer:sleep(500),
+
+    begin
+        {_, StacktraceA} = erlang:process_info(ProcessA, current_stacktrace),
+        {_, StacktraceB} = erlang:process_info(ProcessB, current_stacktrace),
+        ct:pal(
+          "Process stacktraces:~n"
+          "  Process A: ~p~n"
+          "  Process B: ~p",
+          [StacktraceA, StacktraceB])
+    end,
 
-    ct:pal("Release registry loading lock"),
-    global:del_lock(Lock, [node()]),
+    %% Process A is resumed. Without a proper check, process B would try to
+    %% purge the copy of the registry it is currently using itself, causing a
+    %% deadlock because the Code server wants process A to handle a signal, but
+    %% process A is not runnable.
+    ProcessA ! proceed,
 
-    ct:pal("Waiting for process A to exit"),
+    ct:pal("Waiting for process B to exit"),
     receive
-        {ProcessA, FF} ->
+        {ProcessB, FF} ->
             ?assertEqual(FeatureProps, FF),
             ok
     after 10000 ->
-              {_, Stacktrace} = erlang:process_info(
-                                  ProcessA, current_stacktrace),
-              ct:pal("Process A stuck; stacktrace: ~p", [Stacktrace]),
+              {_, StacktraceB} = erlang:process_info(
+                                   ProcessB, current_stacktrace),
+              ct:pal("Process B stuck; stacktrace: ~p", [StacktraceB]),
               error(registry_reload_deadlock)
     end.
 
+try_to_deadlock_in_registry_reload_2(_Config) ->
+    rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
+    _ = code:delete(rabbit_ff_registry),
+    ?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
+
+    FeatureName = ?FUNCTION_NAME,
+    FeatureProps = #{provided_by => rabbit,
+                     stability => stable},
+
+    ct:pal("Inject arbitrary feature flag to reload registry"),
+    rabbit_feature_flags:inject_test_feature_flags(
+      #{FeatureName => FeatureProps},
+      false),
+
+    _ = erlang:process_flag(trap_exit, true),
+
+    ct:pal("Parent ~p: Acquire registry loading lock", [self()]),
+    Lock = rabbit_ff_registry_factory:registry_loading_lock(),
+    global:set_lock(Lock, [node()]),
+
+    Parent = self(),
+
+    %% Deadlock steps:
+    %% * Processes A, B1 and B2 wait for the lock. The registry stub is loaded.
+    %% * Process B1 acquires the lock.
+    %% * Process B1 deletes the registry stub and loads the initialized
+    %%   registry.
+    %% * Process A acquires the lock.
+    %% * Process A wants to purge the deleted registry stub by sending a
+    %%   request to the Code server.
+    %%
+    %% => Process A waits forever the return from the Code server because the
+    %% Code server waits for process B2 to stop lingering on the deleted
+    %% registry stub, but process B2 waits for the lock.
+
+    ProcessA = spawn_link(
+                 fun() ->
+                         %% Process A acquires the lock automatically as part
+                         %% of requesting an explicit initialization of the
+                         %% registry. Process A doesn't linger on the registry
+                         %% stub.
+                         ct:pal(
+                           "Process A ~p: "
+                           "Trigger manual initial registry load",
+                           [self()]),
+                         rabbit_ff_registry_factory:initialize_registry(),
+
+                         ct:pal("Process A ~p: Exiting...", [self()]),
+                         erlang:unlink(Parent),
+                         Parent ! {self(), done}
+                 end),
+
+    FunB = fun() ->
+                   %% Processes B1 and B2 acquire the lock automatically as
+                   %% part of trying to load the registry as part of they
+                   %% querying a feature flag.
+                   ct:pal(
+                     "Process B ~p: "
+                     "Trigger automatic initial registry load",
+                     [self()]),
+                   _ = rabbit_ff_registry_wrapper:get(FeatureName),
+
+                   ct:pal(
+                     "Process B ~p: Exiting...",
+                     [self()]),
+                   erlang:unlink(Parent),
+                   Parent ! {self(), done}
+           end,
+    ProcessB1 = spawn_link(FunB),
+    ProcessB2 = spawn_link(FunB),
+    timer:sleep(500),
+
+    %% We use `erlang:suspend_process/1' and `erlang:resume_process/1' to
+    %% ensure the order in which processes acquire the lock.
+    erlang:suspend_process(ProcessA),
+    erlang:suspend_process(ProcessB1),
+    erlang:suspend_process(ProcessB2),
+    timer:sleep(500),
+
+    ct:pal("Parent ~p: Release registry loading lock", [self()]),
+    global:del_lock(Lock, [node()]),
+
+    erlang:resume_process(ProcessB1),
+    timer:sleep(500),
+    erlang:resume_process(ProcessA),
+    timer:sleep(500),
+    erlang:resume_process(ProcessB2),
+
+    ct:pal("Waiting for processes to exit"),
+    Procs = [ProcessA, ProcessB1, ProcessB2],
+    lists:foreach(
+      fun(Pid) ->
+              receive
+                  {Pid, done} ->
+                      ok;
+                  {'EXIT', Pid, Reason} ->
+                      ct:pal("Process ~p exited; reason: ~p", [Pid, Reason]),
+                      error(test_process_killed)
+              after 10000 ->
+                        lists:foreach(
+                          fun(Pid1) ->
+                                  PI = erlang:process_info(
+                                         Pid1, current_stacktrace),
+                                  case PI of
+                                      undefined ->
+                                          ok;
+                                      {_, Stacktrace} ->
+                                          ct:pal(
+                                            "Process ~p stuck; "
+                                            "stacktrace: ~p",
+                                            [Pid1, Stacktrace])
+                                  end
+                          end, Procs),
+                        error(registry_reload_deadlock)
+              end
+      end, Procs),
+
+    ok.
+
 enable_feature_flag_in_a_healthy_situation(Config) ->
     FeatureName = ff_from_testsuite,
     ClusterSize = ?config(rmq_nodes_count, Config),
diff --git a/deps/rabbit/test/feature_flags_v2_SUITE.erl b/deps/rabbit/test/feature_flags_v2_SUITE.erl
index dc1c6636b3..ee3d95f259 100644
--- a/deps/rabbit/test/feature_flags_v2_SUITE.erl
+++ b/deps/rabbit/test/feature_flags_v2_SUITE.erl
@@ -175,6 +175,7 @@ setup_slave_node(Config) ->
     ok = setup_logger(),
     ok = setup_data_dir(Config),
     ok = setup_feature_flags_file(Config),
+    _ = rabbit_ff_registry_factory:initialize_registry(),
     ok = start_controller(),
     ok = rabbit_feature_flags:enable(feature_flags_v2),
     ok.
author	Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>	2023-05-12 17:11:57 +0200
committer	Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>	2023-05-16 10:46:32 +0200
commit	213eb20e1658a1cebb4c1a07606e28fc7b62ebca (patch)
tree	f7ae3a58697bb94120e5333e21e748b30a8be4cb
parent	c7d0427d621493f00a3aa0d6e7720f709f3769a6 (diff)
download	rabbitmq-server-git-fix-feature-flags-init-vs-code_server-deadlock-take2.tar.gz