diff options
author | Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr> | 2023-05-16 14:13:22 +0200 |
---|---|---|
committer | Jean-Sébastien Pédron <jean-sebastien.pedron@dumbbell.fr> | 2023-05-16 14:26:42 +0200 |
commit | 208f23551d3bd6f04ec8029485206b10b653f4a5 (patch) | |
tree | b388c5f35c169e8e8e66cbceb47feafb33baa147 | |
parent | c7d0427d621493f00a3aa0d6e7720f709f3769a6 (diff) | |
download | rabbitmq-server-git-improve-feature-flags-compat-error-reporting.tar.gz |
rabbit_feature_flags: Improve error reporting from compat. check
improve-feature-flags-compat-error-reporting
[Why]
So far, no matter what the error was, and no matter if it was an actual
incompatibility or something unrelated like a timeout or an Erlang
distribution failure, the `check_node_compatibility_task()` function
always logged and reported the same "nodes are incompatible" message.
This makes it unclear what is wrong. Are my two RabbitMQ nodes really
incompatible? Or was there a network issue?
[How]
Now, the function logs a more precise message explaining the source of
the error. It also returns one of two distinct errors:
* `incompatible_feature_flags` for an actual incompatibility
* `aborted_feature_flags_compat_check`, plus the error term, for any
error not coming from the Feature flags subsystem itself.
In the end, regardless of the error, the nodes will still be considered
incompatible and possibly one of them will refuse to start. But now, the
user should better understand why.
Reported-by: @dcorbacho
-rw-r--r-- | deps/rabbit/src/rabbit_ff_controller.erl | 122 |
1 files changed, 85 insertions, 37 deletions
diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl index cc46ad5d24..6e9a40d8c5 100644 --- a/deps/rabbit/src/rabbit_ff_controller.erl +++ b/deps/rabbit/src/rabbit_ff_controller.erl @@ -342,51 +342,99 @@ check_node_compatibility_task(NodeA, NodeB) -> [NodeA, NodeB], #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), NodesA = list_nodes_clustered_with(NodeA), - NodesB = list_nodes_clustered_with(NodeB), - AreCompatible = case collect_inventory_on_nodes(NodesA) of - {ok, InventoryA} -> - ?LOG_DEBUG( - "Feature flags: inventory of node `~ts`:~n~tp", - [NodeA, InventoryA], - #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - case collect_inventory_on_nodes(NodesB) of - {ok, InventoryB} -> - ?LOG_DEBUG( - "Feature flags: inventory of node " - "`~ts`:~n~tp", - [NodeB, InventoryB], - #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - are_compatible(InventoryA, InventoryB); - _ -> - false - end; - _ -> - false - end, - case AreCompatible of - true -> - ?LOG_NOTICE( - "Feature flags: nodes `~ts` and `~ts` are compatible", - [NodeA, NodeB], + case NodesA of + _ when is_list(NodesA) -> + NodesB = list_nodes_clustered_with(NodeB), + case NodesB of + _ when is_list(NodesB) -> + check_node_compatibility_task1( + NodeA, NodesA, + NodeB, NodesB); + Error -> + ?LOG_WARNING( + "Feature flags: " + "error while querying cluster members from " + "node `~ts`:~n~tp", + [NodeB, Error], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + {error, {aborted_feature_flags_compat_check, Error}} + end; + Error -> + ?LOG_WARNING( + "Feature flags: " + "error while querying cluster members from node `~ts`:~n~tp", + [NodeA, Error], #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - ok; - false -> + {error, {aborted_feature_flags_compat_check, Error}} + end. 
+ +check_node_compatibility_task1(NodeA, NodesA, NodeB, NodesB) + when is_list(NodesA) andalso is_list(NodesB) -> + case collect_inventory_on_nodes(NodesA) of + {ok, InventoryA} -> + ?LOG_DEBUG( + "Feature flags: inventory of node `~ts`:~n~tp", + [NodeA, InventoryA], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + case collect_inventory_on_nodes(NodesB) of + {ok, InventoryB} -> + ?LOG_DEBUG( + "Feature flags: inventory of node " + "`~ts`:~n~tp", + [NodeB, InventoryB], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + case are_compatible(InventoryA, InventoryB) of + true -> + ?LOG_NOTICE( + "Feature flags: " + "nodes `~ts` and `~ts` are compatible", + [NodeA, NodeB], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + ok; + false -> + ?LOG_WARNING( + "Feature flags: " + "nodes `~ts` and `~ts` are incompatible", + [NodeA, NodeB], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + {error, incompatible_feature_flags} + end; + Error -> + ?LOG_WARNING( + "Feature flags: " + "error while collecting inventory from " + "nodes ~0tp:~n~tp", + [NodesB, Error], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + {error, {aborted_feature_flags_compat_check, Error}} + end; + Error -> ?LOG_WARNING( - "Feature flags: nodes `~ts` and `~ts` are incompatible", - [NodeA, NodeB], + "Feature flags: " + "error while collecting inventory from nodes ~0tp:~n~tp", + [NodesA, Error], #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - {error, incompatible_feature_flags} + {error, {aborted_feature_flags_compat_check, Error}} end. --spec list_nodes_clustered_with(Node) -> [Node] when - Node :: node(). +-spec list_nodes_clustered_with(Node) -> Ret when + Node :: node(), + Ret :: Members | Error, + Members :: [node()], + Error :: {error, term()}. list_nodes_clustered_with(Node) -> - %% If Mnesia is stopped on the given node, it will return an empty list. - %% In this case, only consider that stopped node. + %% If `running_nodes()' returns an empty list, it means the `rabbit' + %% application is not running on `Node'. 
In this case, we consider this + %% node alone for now. + %% + %% It could be that RabbitMQ is starting on that node for instance; + %% indeed, feature flags compatibility is checked as part of RabbitMQ + %% booting. If that's not the case, collecting the feature flags inventory + %% later will fail anyway. case rpc_call(Node, ?MODULE, running_nodes, [], ?TIMEOUT) of - [] -> [Node]; - List -> List + [] -> [Node]; + ListOrError -> ListOrError end. -spec are_compatible(Inventory, Inventory) -> AreCompatible when |