summaryrefslogtreecommitdiff
path: root/ironic/conductor/cleaning.py
diff options
context:
space:
mode:
authorDmitry Tantsur <dtantsur@protonmail.com>2023-02-22 16:08:09 +0100
committerDmitry Tantsur <dtantsur@protonmail.com>2023-03-02 18:46:04 +0000
commita4a3b31d441e7f99d3845c84bb16d9e78ba8ec81 (patch)
tree1d74899725e2310804ad47ed2c67c97400a1a82b /ironic/conductor/cleaning.py
parent33963b50c39931df1b5ee86e8ba9327e64441256 (diff)
downloadironic-a4a3b31d441e7f99d3845c84bb16d9e78ba8ec81.tar.gz
Do not move nodes to CLEAN FAILED with empty last_error
When cleaning fails, we power off the node, unless it has been running a clean step already. This happens when aborting cleaning or on a boot failure. Previously, the power action wiped the last_error field, leaving the node with provision_state=CLEANFAIL and last_error=None for several seconds; this change makes sure that no longer happens. I've hit this in Metal3. Also, when aborting cleaning, make sure last_error is set during the transition to CLEANFAIL, not when the cleanup thread starts running. While here, make sure to log the current step in all cases, not only when aborting a non-abortable step. Change-Id: Id21dd7eb44dad149661ebe2d75a9b030aa70526f Story: #2010603 Task: #47476 (cherry picked from commit 9a0fa631ca53b40f4dc1877a73e65ded8ac37616)
Diffstat (limited to 'ironic/conductor/cleaning.py')
-rw-r--r--ironic/conductor/cleaning.py26
1 files changed, 18 insertions, 8 deletions
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index 53d66ddd8..5f69a4ab9 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -247,12 +247,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
task.process_event(event)
+def get_last_error(node):
+ last_error = _('By request, the clean operation was aborted')
+ if node.clean_step:
+ last_error += (
+ _(' during or after the completion of step "%s"')
+ % conductor_steps.step_id(node.clean_step)
+ )
+ return last_error
+
+
@task_manager.require_exclusive_lock
-def do_node_clean_abort(task, step_name=None):
+def do_node_clean_abort(task):
"""Internal method to abort an ongoing operation.
:param task: a TaskManager instance with an exclusive lock
- :param step_name: The name of the clean step.
"""
node = task.node
try:
@@ -270,12 +279,13 @@ def do_node_clean_abort(task, step_name=None):
set_fail_state=False)
return
+ last_error = get_last_error(node)
info_message = _('Clean operation aborted for node %s') % node.uuid
- last_error = _('By request, the clean operation was aborted')
- if step_name:
- msg = _(' after the completion of step "%s"') % step_name
- last_error += msg
- info_message += msg
+ if node.clean_step:
+ info_message += (
+ _(' during or after the completion of step "%s"')
+ % node.clean_step
+ )
node.last_error = last_error
node.clean_step = None
@@ -317,7 +327,7 @@ def continue_node_clean(task):
target_state = None
task.process_event('fail', target_state=target_state)
- do_node_clean_abort(task, step_name)
+ do_node_clean_abort(task)
return
LOG.debug('The cleaning operation for node %(node)s was '