diff options
author | Dmitry Tantsur <dtantsur@protonmail.com> | 2023-02-22 16:08:09 +0100 |
---|---|---|
committer | Dmitry Tantsur <dtantsur@protonmail.com> | 2023-03-02 18:46:04 +0000 |
commit | a4a3b31d441e7f99d3845c84bb16d9e78ba8ec81 (patch) | |
tree | 1d74899725e2310804ad47ed2c67c97400a1a82b /ironic/conductor/cleaning.py | |
parent | 33963b50c39931df1b5ee86e8ba9327e64441256 (diff) | |
download | ironic-a4a3b31d441e7f99d3845c84bb16d9e78ba8ec81.tar.gz |
Do not move nodes to CLEAN FAILED with empty last_error
When cleaning fails, we power off the node, unless it has been running
a clean step already. This happens when aborting cleaning or on a boot
failure. This change makes sure that the power action does not wipe
the last_error field; previously the wipe left the node with
provision_state=CLEANFAIL and last_error=None for several seconds.
I've hit this in Metal3.
Also when aborting cleaning, make sure last_error is set during
the transition to CLEANFAIL, not when the clean up thread starts
running.
While here, make sure to log the current step in all cases, not only
when aborting a non-abortable step.
Change-Id: Id21dd7eb44dad149661ebe2d75a9b030aa70526f
Story: #2010603
Task: #47476
(cherry picked from commit 9a0fa631ca53b40f4dc1877a73e65ded8ac37616)
Diffstat (limited to 'ironic/conductor/cleaning.py')
-rw-r--r-- | ironic/conductor/cleaning.py | 26 |
1 files changed, 18 insertions, 8 deletions
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index 53d66ddd8..5f69a4ab9 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -247,12 +247,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
     task.process_event(event)
 
 
+def get_last_error(node):
+    last_error = _('By request, the clean operation was aborted')
+    if node.clean_step:
+        last_error += (
+            _(' during or after the completion of step "%s"')
+            % conductor_steps.step_id(node.clean_step)
+        )
+    return last_error
+
+
 @task_manager.require_exclusive_lock
-def do_node_clean_abort(task, step_name=None):
+def do_node_clean_abort(task):
     """Internal method to abort an ongoing operation.
 
     :param task: a TaskManager instance with an exclusive lock
-    :param step_name: The name of the clean step.
     """
     node = task.node
     try:
@@ -270,12 +279,13 @@ def do_node_clean_abort(task, step_name=None):
                                      set_fail_state=False)
         return
 
+    last_error = get_last_error(node)
     info_message = _('Clean operation aborted for node %s') % node.uuid
-    last_error = _('By request, the clean operation was aborted')
-    if step_name:
-        msg = _(' after the completion of step "%s"') % step_name
-        last_error += msg
-        info_message += msg
+    if node.clean_step:
+        info_message += (
+            _(' during or after the completion of step "%s"')
+            % node.clean_step
+        )
 
     node.last_error = last_error
     node.clean_step = None
@@ -317,7 +327,7 @@ def continue_node_clean(task):
                 target_state = None
 
             task.process_event('fail', target_state=target_state)
-            do_node_clean_abort(task, step_name)
+            do_node_clean_abort(task)
             return
 
         LOG.debug('The cleaning operation for node %(node)s was '