From 242775ae184582ce51a006e5797e7017133617f3 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Thu, 26 Mar 2020 09:14:30 -0700 Subject: Retry agent get_commands_status upon failures The agent command status code lacked any retry mechanism, which meant that any intermittent failure, such as a dropped packet or an overloaded firewall, could cause the entire deployment or cleaning process to derail and fail. This fix adds logic to ensure we retry upon such failures. Worth noting, the exact same logic has been used elsewhere in the agent client code for the exact same problem when issuing commands. Change-Id: I4f6581b7fb895ed2b1d505b9947e363665551b57 Story: 2007470 Task: 39158 --- ironic/drivers/modules/agent_client.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'ironic/drivers/modules/agent_client.py') diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 32427c6e7..fed684670 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -137,6 +137,10 @@ class AgentClient(object): return result @METRICS.timer('AgentClient.get_commands_status') + @retrying.retry( + retry_on_exception=( + lambda e: isinstance(e, exception.AgentConnectionFailed)), + stop_max_attempt_number=CONF.agent.max_command_attempts) def get_commands_status(self, node): """Get command status from agent. @@ -166,7 +170,16 @@ class AgentClient(object): """ url = self._get_command_url(node) LOG.debug('Fetching status of agent commands for node %s', node.uuid) - resp = self.session.get(url, timeout=CONF.agent.command_timeout) + try: + resp = self.session.get(url, timeout=CONF.agent.command_timeout) + except (requests.ConnectionError, requests.Timeout) as e: + msg = (_('Failed to connect to the agent running on node %(node)s ' + 'to collect commands status. 
' + 'Error: %(error)s') % + {'node': node.uuid, 'error': e}) + LOG.error(msg) + raise exception.AgentConnectionFailed(reason=msg) + result = resp.json()['commands'] status = '; '.join('%(cmd)s: result "%(res)s", error "%(err)s"' % {'cmd': r.get('command_name'), -- cgit v1.2.1