summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDmitry Tantsur <dtantsur@protonmail.com>2023-02-22 16:08:09 +0100
committerDmitry Tantsur <dtantsur@protonmail.com>2023-03-02 18:45:33 +0000
commit316be3675a3358860882ba0772dcf6ffa9fef230 (patch)
tree524f89049b8b3c420de28b85e7e9eae23d621754
parentb7bcb94ceb9a28cd912e07c09cfd332e1c88c7e2 (diff)
downloadironic-316be3675a3358860882ba0772dcf6ffa9fef230.tar.gz
Do not move nodes to CLEAN FAILED with empty last_error
When cleaning fails, we power off the node, unless it has been running a clean step already. This happens when aborting cleaning or on a boot failure. This change makes sure that the power action does not wipe the last_error field, resulting in a node with provision_state=CLEANFAIL and last_error=None for several seconds. I've hit this in Metal3. Also when aborting cleaning, make sure last_error is set during the transition to CLEANFAIL, not when the clean up thread starts running. While here, make sure to log the current step in all cases, not only when aborting a non-abortable step. Change-Id: Id21dd7eb44dad149661ebe2d75a9b030aa70526f Story: #2010603 Task: #47476 (cherry picked from commit 9a0fa631ca53b40f4dc1877a73e65ded8ac37616)
-rw-r--r--ironic/common/states.py3
-rw-r--r--ironic/conductor/cleaning.py26
-rw-r--r--ironic/conductor/manager.py3
-rw-r--r--ironic/conductor/task_manager.py11
-rw-r--r--ironic/conductor/utils.py8
-rw-r--r--ironic/tests/unit/conductor/test_cleaning.py21
-rw-r--r--ironic/tests/unit/conductor/test_manager.py3
-rw-r--r--ironic/tests/unit/conductor/test_utils.py24
-rw-r--r--releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml8
9 files changed, 80 insertions, 27 deletions
diff --git a/ironic/common/states.py b/ironic/common/states.py
index 89b710189..f2238b41b 100644
--- a/ironic/common/states.py
+++ b/ironic/common/states.py
@@ -269,6 +269,9 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE,
FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES)
"""States where API lookups are permitted with fast track enabled."""
+FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL,
+ RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL))
+
##############
# Power states
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index e59841a99..9e4edb809 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -248,12 +248,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
task.process_event(event)
+def get_last_error(node):
+ last_error = _('By request, the clean operation was aborted')
+ if node.clean_step:
+ last_error += (
+ _(' during or after the completion of step "%s"')
+ % conductor_steps.step_id(node.clean_step)
+ )
+ return last_error
+
+
@task_manager.require_exclusive_lock
-def do_node_clean_abort(task, step_name=None):
+def do_node_clean_abort(task):
"""Internal method to abort an ongoing operation.
:param task: a TaskManager instance with an exclusive lock
- :param step_name: The name of the clean step.
"""
node = task.node
try:
@@ -271,12 +280,13 @@ def do_node_clean_abort(task, step_name=None):
set_fail_state=False)
return
+ last_error = get_last_error(node)
info_message = _('Clean operation aborted for node %s') % node.uuid
- last_error = _('By request, the clean operation was aborted')
- if step_name:
- msg = _(' after the completion of step "%s"') % step_name
- last_error += msg
- info_message += msg
+ if node.clean_step:
+ info_message += (
+ _(' during or after the completion of step "%s"')
+ % node.clean_step
+ )
node.last_error = last_error
node.clean_step = None
@@ -318,7 +328,7 @@ def continue_node_clean(task):
target_state = None
task.process_event('fail', target_state=target_state)
- do_node_clean_abort(task, step_name)
+ do_node_clean_abort(task)
return
LOG.debug('The cleaning operation for node %(node)s was '
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index cf4988958..ef24581cc 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1348,7 +1348,8 @@ class ConductorManager(base_manager.BaseConductorManager):
callback=self._spawn_worker,
call_args=(cleaning.do_node_clean_abort, task),
err_handler=utils.provisioning_error_handler,
- target_state=target_state)
+ target_state=target_state,
+ last_error=cleaning.get_last_error(node))
return
if node.provision_state == states.RESCUEWAIT:
diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py
index 509c9ce92..922e74cf6 100644
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@@ -527,7 +527,8 @@ class TaskManager(object):
self.release_resources()
def process_event(self, event, callback=None, call_args=None,
- call_kwargs=None, err_handler=None, target_state=None):
+ call_kwargs=None, err_handler=None, target_state=None,
+ last_error=None):
"""Process the given event for the task's current state.
:param event: the name of the event to process
@@ -540,6 +541,8 @@ class TaskManager(object):
prev_target_state)
:param target_state: if specified, the target provision state for the
node. Otherwise, use the target state from the fsm
+ :param last_error: last error to set on the node together with
+ the state transition.
:raises: InvalidState if the event is not allowed by the associated
state machine
"""
@@ -572,13 +575,15 @@ class TaskManager(object):
# set up the async worker
if callback:
- # clear the error if we're going to start work in a callback
- self.node.last_error = None
+ # update the error if we're going to start work in a callback
+ self.node.last_error = last_error
if call_args is None:
call_args = ()
if call_kwargs is None:
call_kwargs = {}
self.spawn_after(callback, *call_args, **call_kwargs)
+ elif last_error is not None:
+ self.node.last_error = last_error
# publish the state transition by saving the Node
self.node.save()
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index c107f076f..cdf3a99ee 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -302,9 +302,11 @@ def node_power_action(task, new_state, timeout=None):
# Set the target_power_state and clear any last_error, if we're
# starting a new operation. This will expose to other processes
- # and clients that work is in progress.
- node['target_power_state'] = target_state
- node['last_error'] = None
+ # and clients that work is in progress. Keep the last_error intact
+ # if the power action happens as a result of a failure.
+ node.target_power_state = target_state
+ if node.provision_state not in states.FAILURE_STATES:
+ node.last_error = None
node.timestamp_driver_internal_info('last_power_state_change')
# NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in
# fast-track (or an accidentally booted agent) will cause subsequent
diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py
index a4c3d57b6..34e805deb 100644
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@@ -1138,12 +1138,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
class DoNodeCleanAbortTestCase(db_base.DbTestCase):
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
- def _test__do_node_clean_abort(self, step_name, tear_mock):
+ def _test_do_node_clean_abort(self, clean_step, tear_mock):
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
- provision_state=states.CLEANFAIL,
+ provision_state=states.CLEANWAIT,
target_provision_state=states.AVAILABLE,
- clean_step={'step': 'foo', 'abortable': True},
+ clean_step=clean_step,
driver_internal_info={
'agent_url': 'some url',
'agent_secret_token': 'token',
@@ -1153,11 +1153,11 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
'skip_current_clean_step': True})
with task_manager.acquire(self.context, node.uuid) as task:
- cleaning.do_node_clean_abort(task, step_name=step_name)
+ cleaning.do_node_clean_abort(task)
self.assertIsNotNone(task.node.last_error)
tear_mock.assert_called_once_with(task.driver.deploy, task)
- if step_name:
- self.assertIn(step_name, task.node.last_error)
+ if clean_step:
+ self.assertIn(clean_step['step'], task.node.last_error)
# assert node's clean_step and metadata was cleaned up
self.assertEqual({}, task.node.clean_step)
self.assertNotIn('clean_step_index',
@@ -1173,11 +1173,12 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
self.assertNotIn('agent_secret_token',
task.node.driver_internal_info)
- def test__do_node_clean_abort(self):
- self._test__do_node_clean_abort(None)
+ def test_do_node_clean_abort_early(self):
+ self._test_do_node_clean_abort(None)
- def test__do_node_clean_abort_with_step_name(self):
- self._test__do_node_clean_abort('foo')
+ def test_do_node_clean_abort_with_step(self):
+ self._test_do_node_clean_abort({'step': 'foo', 'interface': 'deploy',
+ 'abortable': True})
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
def test__do_node_clean_abort_tear_down_fail(self, tear_mock):
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index fd206e36d..cf71c4ef8 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -2733,7 +2733,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
# Node will be moved to tgt_prov_state after cleaning, not tested here
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
- self.assertIsNone(node.last_error)
+ self.assertEqual('By request, the clean operation was aborted',
+ node.last_error)
mock_spawn.assert_called_with(
self.service, cleaning.do_node_clean_abort, mock.ANY)
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index a424e5132..a29da21a7 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -196,7 +196,8 @@ class NodePowerActionTestCase(db_base.DbTestCase):
node = obj_utils.create_test_node(self.context,
uuid=uuidutils.generate_uuid(),
driver='fake-hardware',
- power_state=states.POWER_OFF)
+ power_state=states.POWER_OFF,
+ last_error='failed before')
task = task_manager.TaskManager(self.context, node.uuid)
get_power_mock.return_value = states.POWER_OFF
@@ -209,6 +210,27 @@ class NodePowerActionTestCase(db_base.DbTestCase):
self.assertIsNone(node['target_power_state'])
self.assertIsNone(node['last_error'])
+ @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
+ def test_node_power_action_keep_last_error(self, get_power_mock):
+ """Test node_power_action to keep last_error for failed states."""
+ node = obj_utils.create_test_node(self.context,
+ uuid=uuidutils.generate_uuid(),
+ driver='fake-hardware',
+ power_state=states.POWER_OFF,
+ provision_state=states.CLEANFAIL,
+ last_error='failed before')
+ task = task_manager.TaskManager(self.context, node.uuid)
+
+ get_power_mock.return_value = states.POWER_OFF
+
+ conductor_utils.node_power_action(task, states.POWER_ON)
+
+ node.refresh()
+ get_power_mock.assert_called_once_with(mock.ANY, mock.ANY)
+ self.assertEqual(states.POWER_ON, node['power_state'])
+ self.assertIsNone(node['target_power_state'])
+ self.assertEqual('failed before', node['last_error'])
+
@mock.patch('ironic.objects.node.NodeSetPowerStateNotification',
autospec=True)
@mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
diff --git a/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
new file mode 100644
index 000000000..270278f1b
--- /dev/null
+++ b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+ - |
+ When aborting cleaning, the ``last_error`` field is no longer initially
+ empty. It is now populated on the state transition to ``clean failed``.
+ - |
+ When cleaning or deployment fails, the ``last_error`` field is no longer
+ temporary set to ``None`` while the power off action is running.