ironic/conductor/cleaning.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338

#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

"""Functionality related to cleaning."""

from oslo_log import log

from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.drivers import utils as driver_utils

LOG = log.getLogger(__name__)


@task_manager.require_exclusive_lock
def do_node_clean(task, clean_steps=None, disable_ramdisk=False):
    """Start manual or automated cleaning of a node.

    Validates the power (and, unless the ramdisk is disabled, network)
    interfaces, records the requested steps for a manual clean, lets the
    deploy interface prepare for cleaning and finally hands control to
    :func:`do_next_clean_step`. If preparation reports
    ``states.CLEANWAIT``, a 'wait' event is processed instead and the
    function returns.

    :param task: a TaskManager instance with an exclusive lock on its node
    :param clean_steps: For a manual clean, the list of clean steps to
                        perform. None (the default) means automated
                        cleaning. For more information, see the
                        clean_steps parameter of
                        :func:`ConductorManager.do_node_clean`.
    :param disable_ramdisk: Whether to skip booting ramdisk for cleaning.
    """
    node = task.node
    is_manual = clean_steps is not None
    if is_manual:
        kind = 'manual'
    else:
        kind = 'automated'
    LOG.debug('Starting %(type)s cleaning for node %(node)s',
              {'type': kind, 'node': node.uuid})

    if not is_manual and utils.skip_automated_cleaning(node):
        # Nothing to clean: drop any stale step and finish right away.
        node.clean_step = None
        node.save()

        task.process_event('done')
        LOG.info('Automated cleaning is disabled, node %s has been '
                 'successfully moved to AVAILABLE state.', node.uuid)
        return

    # NOTE(dtantsur): only automated cleaning can get here with the node
    # in maintenance; manual cleaning checks maintenance mode earlier on.
    maintenance_blocked = (
        not CONF.conductor.allow_provisioning_in_maintenance
        and node.maintenance)
    if maintenance_blocked:
        error = _('Cleaning a node in maintenance mode is not allowed')
        return utils.cleaning_error_handler(task, error,
                                            tear_down_cleaning=False)

    # NOTE(ghe): cleaning requires valid power and network values.
    try:
        task.driver.power.validate(task)
        if not disable_ramdisk:
            task.driver.network.validate(task)
    except exception.InvalidParameterValue as exc:
        error = (
            _('Validation of node %(node)s for cleaning failed: %(msg)s')
            % {'node': node.uuid, 'msg': exc})
        return utils.cleaning_error_handler(task, error)

    # Start from a clean slate, then remember what a manual clean asked for.
    utils.wipe_cleaning_internal_info(task)
    if is_manual:
        node.set_driver_internal_info('clean_steps', clean_steps)
        node.set_driver_internal_info(
            'cleaning_disable_ramdisk', disable_ramdisk)
    task.node.save()

    # Retrieve BIOS config settings for this node
    utils.node_cache_bios_settings(task, node)

    # Give the deploy driver the chance to set up the ramdisk again
    # (necessary for IPA cleaning).
    try:
        if disable_ramdisk:
            LOG.info('Skipping preparing for in-band cleaning since '
                     'out-of-band only cleaning has been requested for node '
                     '%s', node.uuid)
            prepare_result = None
        else:
            prepare_result = task.driver.deploy.prepare_cleaning(task)
    except Exception as exc:
        error = (_('Failed to prepare node %(node)s for cleaning: %(e)s')
                 % {'node': node.uuid, 'e': exc})
        return utils.cleaning_error_handler(task, error, traceback=True)

    if prepare_result == states.CLEANWAIT:
        # Asynchronous prepare: the deploy driver is expected to set
        # node.driver_internal_info['clean_steps'] and node.clean_step,
        # then make an RPC call to continue_node_clean to start cleaning.
        #
        # Manual cleaning targets MANAGEABLE; automated cleaning targets
        # AVAILABLE, which is the default.
        if is_manual:
            wait_target = states.MANAGEABLE
        else:
            wait_target = None
        task.process_event('wait', target_state=wait_target)
        return

    try:
        conductor_steps.set_node_cleaning_steps(
            task, disable_ramdisk=disable_ramdisk)
    except (exception.InvalidParameterValue,
            exception.NodeCleaningFailure) as exc:
        error = (_('Cannot clean node %(node)s: %(msg)s')
                 % {'node': node.uuid, 'msg': exc})
        return utils.cleaning_error_handler(task, error)

    # Index 0 when there is something to run, None when the list is empty.
    pending = node.driver_internal_info.get('clean_steps', [])
    first_index = 0 if pending else None
    do_next_clean_step(task, first_index, disable_ramdisk=disable_ramdisk)


@utils.fail_on_error(utils.cleaning_error_handler,
                     _("Unexpected error when processing next clean step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def do_next_clean_step(task, step_index, disable_ramdisk=None):
    """Run the remaining clean steps, beginning at ``step_index``.

    Steps are executed one after another until one of them reports
    ``states.CLEANWAIT`` (asynchronous step), one of them fails, or the
    list is exhausted. Once all steps are done, cleaning is torn down
    (unless the ramdisk is disabled) and the final state event is
    processed.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    :param disable_ramdisk: Whether to skip booting ramdisk for cleaning.
        When None, the value is read from the node's
        driver_internal_info['cleaning_disable_ramdisk'] (default False).
    """
    node = task.node
    # Manual cleaning targets MANAGEABLE, automated cleaning targets
    # AVAILABLE; the same distinction selects the target of 'wait' events.
    manual_clean = node.target_provision_state == states.MANAGEABLE
    wait_target = states.MANAGEABLE if manual_clean else None

    if step_index is None:
        remaining = []
    else:
        assert node.driver_internal_info.get('clean_steps') is not None, \
            f"BUG: No clean steps for {node.uuid}, step index is {step_index}"
        remaining = node.driver_internal_info['clean_steps'][step_index:]

    if disable_ramdisk is None:
        disable_ramdisk = node.driver_internal_info.get(
            'cleaning_disable_ramdisk', False)

    LOG.info('Executing %(kind)s cleaning on node %(node)s, remaining steps: '
             '%(steps)s',
             {'node': node.uuid,
              'steps': remaining,
              'kind': 'manual' if manual_clean else 'automated'})

    # Run steps in order; stop at the first asynchronous step or failure.
    for offset, step in enumerate(remaining):
        # Persist the step about to run so that it can be restarted
        # if necessary.
        node.clean_step = step
        node.set_driver_internal_info('clean_step_index',
                                      step_index + offset)
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as exc:
            # The agent may simply not be back yet after a cleaning reboot.
            agent_rebooting = (
                isinstance(exc, exception.AgentConnectionFailed)
                and task.node.driver_internal_info.get('cleaning_reboot'))
            if agent_rebooting:
                LOG.info('Agent is not yet running on node %(node)s '
                         'after cleaning reboot, waiting for agent to '
                         'come up to run next clean step %(step)s.',
                         {'node': node.uuid, 'step': step})
                node.set_driver_internal_info('skip_current_clean_step',
                                              False)
                task.process_event('wait', target_state=wait_target)
                return

            # The agent is busy with another command: wait and retry later.
            if isinstance(exc, exception.AgentInProgress):
                LOG.info('Conductor attempted to process clean step for '
                         'node %(node)s. Agent indicated it is presently '
                         'executing a command. Error: %(error)s',
                         {'node': task.node.uuid,
                          'error': exc})
                node.set_driver_internal_info('skip_current_clean_step',
                                              False)
                task.process_event('wait', target_state=wait_target)
                return

            # Anything else is a genuine step failure.
            failure = (_('Node %(node)s failed step %(step)s: %(exc)s')
                       % {'node': node.uuid, 'exc': exc,
                          'step': node.clean_step})
            if not disable_ramdisk:
                driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, failure, traceback=True)
            return

        # A step returns states.CLEANWAIT while it is still running and
        # None once it is done; any other value is an error.
        if result == states.CLEANWAIT:
            # Stop this worker; the asynchronous step will make an RPC
            # call to continue_node_clean to resume cleaning.
            LOG.info('Clean step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            task.process_event('wait', target_state=wait_target)
            return
        if result is not None:
            invalid = (_('While executing step %(step)s on node '
                         '%(node)s, step returned invalid value: %(val)s')
                       % {'step': step, 'node': node.uuid, 'val': result})
            return utils.cleaning_error_handler(task, invalid)
        LOG.info('Node %(node)s finished clean step %(step)s',
                 {'node': node.uuid, 'step': step})

    if CONF.agent.deploy_logs_collect == 'always' and not disable_ramdisk:
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # All steps are done: forget the current step and cleaning bookkeeping.
    node.clean_step = None
    utils.wipe_cleaning_internal_info(task)
    node.save()
    if not disable_ramdisk:
        try:
            task.driver.deploy.tear_down_cleaning(task)
        except Exception as exc:
            failure = (_('Failed to tear down from cleaning for node '
                         '%(node)s, reason: %(err)s')
                       % {'node': node.uuid, 'err': exc})
            return utils.cleaning_error_handler(task, failure,
                                                traceback=True,
                                                tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    if manual_clean or node.retired:
        event = 'manage'
    else:
        event = 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)


def get_last_error(node):
    """Build the ``last_error`` text for an aborted clean operation.

    :param node: the node whose cleaning was aborted; its ``clean_step``
        is read to name the step the abort relates to.
    :returns: the abort message, extended with the identifier of the
        current clean step when the node has one.
    """
    message = _('By request, the clean operation was aborted')
    if not node.clean_step:
        return message

    # Name the step that was running (or had just finished) on abort.
    step_suffix = (_(' during or after the completion of step "%s"')
                   % conductor_steps.step_id(node.clean_step))
    return message + step_suffix


@task_manager.require_exclusive_lock
def do_node_clean_abort(task):
    """Internal method to abort an ongoing clean operation.

    Tears down cleaning through the deploy interface. On success the
    node's ``last_error`` is set to the abort message, its clean step and
    cleaning internal info are cleared, and the node is saved. If the
    tear down raises, the failure is passed to the cleaning error handler
    and nothing else is changed here.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as exc:
        log_template = _('Failed to tear down cleaning for node %(node)s '
                         'after aborting the operation. Error: %(err)s')
        detailed_msg = log_template % {'node': node.uuid, 'err': exc}
        short_msg = _('Failed to tear down cleaning after aborting '
                      'the operation')
        utils.cleaning_error_handler(task, detailed_msg,
                                     errmsg=short_msg,
                                     traceback=True,
                                     tear_down_cleaning=False,
                                     set_fail_state=False)
        return
    else:
        # Both messages must be built before clean_step is reset below.
        abort_error = get_last_error(node)
        summary = _('Clean operation aborted for node %s') % node.uuid
        if node.clean_step:
            summary = summary + (
                _(' during or after the completion of step "%s"')
                % node.clean_step
            )

        node.last_error = abort_error
        node.clean_step = None
        utils.wipe_cleaning_internal_info(task)
        node.save()
        LOG.info(summary)


@utils.fail_on_error(utils.cleaning_error_handler,
                     _("Unexpected error when processing next clean step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def continue_node_clean(task):
    """Continue cleaning after finishing an async clean step.

    This function calculates which step has to run next and passes control
    into do_next_clean_step. If the step that just finished is flagged with
    ``abort_after`` and further steps remain, a 'fail' event is processed
    and the clean operation is aborted instead.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node

    next_step_index = utils.update_next_step_index(task, 'clean')

    # If this isn't the final clean step in the cleaning operation
    # and it is flagged to abort after the clean step that just
    # finished, we abort the cleaning operation.
    if node.clean_step.get('abort_after'):
        step_name = node.clean_step['step']
        if next_step_index is not None:
            LOG.debug('The cleaning operation for node %(node)s was '
                      'marked to be aborted after step "%(step)s" '
                      'completed. Aborting now that it has completed.',
                      {'node': task.node.uuid, 'step': step_name})

            # Keep MANAGEABLE as the target for a manual clean; otherwise
            # pass None.
            manual_clean = node.target_provision_state == states.MANAGEABLE
            target_state = states.MANAGEABLE if manual_clean else None

            task.process_event('fail', target_state=target_state)
            do_node_clean_abort(task)
            return

        # Nothing is left to abort: the flagged step was the last one.
        LOG.debug('The cleaning operation for node %(node)s was '
                  'marked to be aborted after step "%(step)s" '
                  'completed. However, since there are no more '
                  'clean steps after this, the abort is not going '
                  'to be done.', {'node': node.uuid,
                                  'step': step_name})

    do_next_clean_step(task, next_step_index)