summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>2023-05-16 14:13:22 +0200
committerJean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr>2023-05-16 14:26:42 +0200
commit208f23551d3bd6f04ec8029485206b10b653f4a5 (patch)
treeb388c5f35c169e8e8e66cbceb47feafb33baa147
parentc7d0427d621493f00a3aa0d6e7720f709f3769a6 (diff)
downloadrabbitmq-server-git-improve-feature-flags-compat-error-reporting.tar.gz
rabbit_feature_flags: Improve error reporting from compat. check (branch: improve-feature-flags-compat-error-reporting)
[Why]
So far, no matter what the error was, and no matter if it was an actual incompatibility or something unrelated like a timeout or an Erlang distribution failure, the `check_node_compatibility_task()` function always logged and reported the same "nodes are incompatible" message.

This makes it unclear what is wrong. Are my two RabbitMQ nodes really incompatible? Or was there a network issue?

[How]
Now, the function logs a more precise message explaining the source of the error. It will also return one of two different errors:
* `incompatible_feature_flags` for an actual incompatibility
* `aborted_feature_flags_compat_check`, plus the error term, for any error not coming from the Feature flags subsystem itself.

In the end, regardless of the error, the nodes will still be considered incompatible and possibly one of them will refuse to start. But now, the user should better understand why.

Reported-by: @dcorbacho
-rw-r--r--deps/rabbit/src/rabbit_ff_controller.erl122
1 files changed, 85 insertions, 37 deletions
diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl
index cc46ad5d24..6e9a40d8c5 100644
--- a/deps/rabbit/src/rabbit_ff_controller.erl
+++ b/deps/rabbit/src/rabbit_ff_controller.erl
@@ -342,51 +342,99 @@ check_node_compatibility_task(NodeA, NodeB) ->
[NodeA, NodeB],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
NodesA = list_nodes_clustered_with(NodeA),
- NodesB = list_nodes_clustered_with(NodeB),
- AreCompatible = case collect_inventory_on_nodes(NodesA) of
- {ok, InventoryA} ->
- ?LOG_DEBUG(
- "Feature flags: inventory of node `~ts`:~n~tp",
- [NodeA, InventoryA],
- #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
- case collect_inventory_on_nodes(NodesB) of
- {ok, InventoryB} ->
- ?LOG_DEBUG(
- "Feature flags: inventory of node "
- "`~ts`:~n~tp",
- [NodeB, InventoryB],
- #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
- are_compatible(InventoryA, InventoryB);
- _ ->
- false
- end;
- _ ->
- false
- end,
- case AreCompatible of
- true ->
- ?LOG_NOTICE(
- "Feature flags: nodes `~ts` and `~ts` are compatible",
- [NodeA, NodeB],
+ case NodesA of
+ _ when is_list(NodesA) ->
+ NodesB = list_nodes_clustered_with(NodeB),
+ case NodesB of
+ _ when is_list(NodesB) ->
+ check_node_compatibility_task1(
+ NodeA, NodesA,
+ NodeB, NodesB);
+ Error ->
+ ?LOG_WARNING(
+ "Feature flags: "
+ "error while querying cluster members from "
+ "node `~ts`:~n~tp",
+ [NodeB, Error],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ {error, {aborted_feature_flags_compat_check, Error}}
+ end;
+ Error ->
+ ?LOG_WARNING(
+ "Feature flags: "
+ "error while querying cluster members from node `~ts`:~n~tp",
+ [NodeA, Error],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
- ok;
- false ->
+ {error, {aborted_feature_flags_compat_check, Error}}
+ end.
+
+check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB)
+ when is_list(NodesA) andalso is_list(NodesB) ->
+ case collect_inventory_on_nodes(NodesA) of
+ {ok, InventoryA} ->
+ ?LOG_DEBUG(
+ "Feature flags: inventory of node `~ts`:~n~tp",
+ [NodeA, InventoryA],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ case collect_inventory_on_nodes(NodesB) of
+ {ok, InventoryB} ->
+ ?LOG_DEBUG(
+ "Feature flags: inventory of node "
+ "`~ts`:~n~tp",
+ [NodeB, InventoryB],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ case are_compatible(InventoryA, InventoryB) of
+ true ->
+ ?LOG_NOTICE(
+ "Feature flags: "
+ "nodes `~ts` and `~ts` are compatible",
+ [NodeA, NodeB],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ ok;
+ false ->
+ ?LOG_WARNING(
+ "Feature flags: "
+ "nodes `~ts` and `~ts` are incompatible",
+ [NodeA, NodeB],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ {error, incompatible_feature_flags}
+ end;
+ Error ->
+ ?LOG_WARNING(
+ "Feature flags: "
+ "error while collecting inventory from "
+ "nodes ~0tp:~n~tp",
+ [NodesB, Error],
+ #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
+ {error, {aborted_feature_flags_compat_check, Error}}
+ end;
+ Error ->
?LOG_WARNING(
- "Feature flags: nodes `~ts` and `~ts` are incompatible",
- [NodeA, NodeB],
+ "Feature flags: "
+ "error while collecting inventory from nodes ~0tp:~n~tp",
+ [NodesA, Error],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
- {error, incompatible_feature_flags}
+ {error, {aborted_feature_flags_compat_check, Error}}
end.
--spec list_nodes_clustered_with(Node) -> [Node] when
- Node :: node().
+-spec list_nodes_clustered_with(Node) -> Ret when
+ Node :: node(),
+ Ret :: Members | Error,
+ Members :: [node()],
+ Error :: {error, term()}.
list_nodes_clustered_with(Node) ->
- %% If Mnesia is stopped on the given node, it will return an empty list.
- %% In this case, only consider that stopped node.
+ %% If `running_nodes()' returns an empty list, it means the `rabbit'
+ %% application is not running on `Node'. In this case, we consider this
+ %% node alone for now.
+ %%
+ %% It could be that RabbitMQ is starting on that node for instance;
+ %% indeed, feature flags compatibility is checked as part of RabbitMQ
+ %% booting. If that's not the case, collecting the feature flags inventory
+ %% later will fail anyway.
case rpc_call(Node, ?MODULE, running_nodes, [], ?TIMEOUT) of
- [] -> [Node];
- List -> List
+ [] -> [Node];
+ ListOrError -> ListOrError
end.
-spec are_compatible(Inventory, Inventory) -> AreCompatible when