#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

"""Functionality related to deploying and undeploying."""

import tempfile

from ironic_lib import metrics_utils
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils

from ironic.common import exception
from ironic.common.glance_service import service_utils as glance_utils
from ironic.common.i18n import _
from ironic.common import states
from ironic.common import swift
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.objects import fields

LOG = log.getLogger(__name__)

METRICS = metrics_utils.get_metrics_logger(__name__)


def validate_node(task, event='deploy'):
    """Validate that a node is suitable for deployment/rebuilding.

    :param task: a TaskManager instance.
    :param event: event to process: deploy or rebuild.
    :raises: NodeInMaintenance, NodeProtected, InvalidStateRequested
    """
    if task.node.maintenance:
        raise exception.NodeInMaintenance(op=_('provisioning'),
                                          node=task.node.uuid)

    if event == 'rebuild' and task.node.protected:
        raise exception.NodeProtected(node=task.node.uuid)

    if not task.fsm.is_actionable_event(event):
        raise exception.InvalidStateRequested(
            action=event, node=task.node.uuid, state=task.node.provision_state)
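
# Typical caller flow (an illustrative sketch, not the only entry point): the
# conductor manager acquires the node with an exclusive lock, validates it and
# then kicks off the deployment, e.g.:
#
#     with task_manager.acquire(context, node_id, shared=False,
#                               purpose='node deployment') as task:
#         validate_node(task, event='deploy')
#         start_deploy(task, manager, configdrive=configdrive)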


@METRICS.timer('start_deploy')
@task_manager.require_exclusive_lock
def start_deploy(task, manager, configdrive=None, event='deploy',
                 deploy_steps=None):
    """Start deployment or rebuilding on a node.

    This function does not check the node's suitability for deployment; that
    is left up to the caller.

    :param task: a TaskManager instance.
    :param manager: a ConductorManager to run tasks on.
    :param configdrive: a configdrive, if requested.
    :param event: event to process: deploy or rebuild.
    :param deploy_steps: Optional deploy steps.
    """
    node = task.node

    if event == 'rebuild':
        # NOTE(gilliard): Clear these to force the driver to
        # check whether they have been changed in glance
        # NOTE(vdrok): If image_source is not from Glance we should
        # not clear kernel and ramdisk as they're input manually
        if glance_utils.is_glance_image(
                node.instance_info.get('image_source')):
            instance_info = node.instance_info
            instance_info.pop('kernel', None)
            instance_info.pop('ramdisk', None)
            node.instance_info = instance_info

    # Infer the image type to make sure the deploy driver
    # validates only the necessary variables for different
    # image types.
    if utils.update_image_type(task.context, task.node):
        node.save()

    try:
        task.driver.power.validate(task)
        task.driver.deploy.validate(task)
        utils.validate_instance_info_traits(task.node)
        conductor_steps.validate_user_deploy_steps_and_templates(
            task, deploy_steps, skip_missing=True)
    except exception.InvalidParameterValue as e:
        raise exception.InstanceDeployFailure(
            _("Failed to validate deploy or power info for node "
              "%(node_uuid)s. Error: %(msg)s") %
            {'node_uuid': node.uuid, 'msg': e}, code=e.code)

    try:
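        # process_event() transitions the provision state machine and hands
        # the actual work to a background conductor worker, which runs
        # do_node_deploy() asynchronously with the arguments below.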
        task.process_event(
            event,
            callback=manager._spawn_worker,
            call_args=(do_node_deploy, task,
                       manager.conductor.id, configdrive, deploy_steps),
            err_handler=utils.provisioning_error_handler)
    except exception.InvalidState:
        raise exception.InvalidStateRequested(
            action=event, node=task.node.uuid,
            state=task.node.provision_state)


@METRICS.timer('do_node_deploy')
@task_manager.require_exclusive_lock
def do_node_deploy(task, conductor_id=None, configdrive=None,
                   deploy_steps=None):
    """Prepare the environment and deploy a node."""
    node = task.node
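    # Start from a clean slate: clear any deploy-related driver_internal_info
    # left over from a previous deployment attempt.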
    utils.wipe_deploy_internal_info(task)
    try:
        if configdrive:
            _store_configdrive(node, configdrive)
    except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while uploading the configdrive for %(node)s '
                 'to Swift') % {'node': node.uuid},
                _('Failed to upload the configdrive to Swift. '
                  'Error: %s') % e,
                clean_up=False)
    except db_exception.DBDataError as e:
        with excutils.save_and_reraise_exception():
            # NOTE(hshiina): This error happens when the configdrive is
            #                too large. Remove the configdrive from the
            #                object to update DB successfully in handling
            #                the failure.
            node.obj_reset_changes()
            utils.deploying_error_handler(
                task,
                ('Error while storing the configdrive for %(node)s into '
                 'the database: %(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to store the configdrive in the database. "
                  "%s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing the configdrive for '
                 'node %(node)s') % {'node': node.uuid},
                _("Failed to prepare the configdrive. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        task.driver.deploy.prepare(task)
    except exception.IronicException as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while preparing to deploy to node %(node)s: '
                 '%(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to prepare to deploy: %s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing to deploy to node '
                 '%(node)s') % {'node': node.uuid},
                _("Failed to prepare to deploy. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        # If any deploy steps were provided by the user, save them to the
        # node. They will be validated and processed later together with the
        # driver and deploy template steps.
        if deploy_steps:
            node.set_driver_internal_info('user_deploy_steps', deploy_steps)
            node.save()
        # This gets the deploy steps (if any) from driver, deploy template and
        # deploy_steps argument and updates them in the node's
        # driver_internal_info['deploy_steps']. In-band steps are skipped since
        # we know that an agent is not running yet.
        conductor_steps.set_node_deployment_steps(task, skip_missing=True)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                'Error while getting deploy steps; cannot deploy to node '
                '%(node)s. Error: %(err)s' % {'node': node.uuid, 'err': e},
                _("Cannot get deploy steps; failed to deploy: %s") % e)

    if not node.driver_internal_info.get('deploy_steps'):
        msg = _('Error while getting deploy steps: no steps returned for '
                'node %s') % node.uuid
        utils.deploying_error_handler(
            task, msg,
            _("No deploy steps returned by the driver"))
        raise exception.InstanceDeployFailure(msg)

    if conductor_id is not None:
        # Update conductor_affinity to reference this conductor's ID
        # since there may be local persistent state
        node.conductor_affinity = conductor_id
        node.save()

    do_next_deploy_step(task, 0)


@utils.fail_on_error(utils.deploying_error_handler,
                     _("Unexpected error when processing next deploy step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def do_next_deploy_step(task, step_index):
    """Do deployment, starting from the specified deploy step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first deploy step in the list to execute. This
        is the index (from 0) into the list of deploy steps in the node's
        driver_internal_info['deploy_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node

    def _iter_steps():
        if step_index is None:
            return  # short-circuit to the end
        idx = step_index
        # The list can change in-flight, do not cache it!
        while idx < len(node.driver_internal_info['deploy_steps']):
            yield idx, node.driver_internal_info['deploy_steps'][idx]
            idx += 1

    # Execute each step until we hit an async step or run out of steps, keeping
    # in mind that the steps list can be modified in-flight.
    for idx, step in _iter_steps():
        LOG.info('Deploying on node %(node)s, remaining steps: '
                 '%(steps)s', {
                     'node': node.uuid,
                     'steps': node.driver_internal_info['deploy_steps'][idx:],
                 })
        # Save which step we're about to start so we can restart
        # if necessary
        node.deploy_step = step
        node.set_driver_internal_info('deploy_step_index', idx)
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_deploy_step(task, step)
        except exception.AgentInProgress as e:
            LOG.info('Conductor attempted to process deploy step for '
                     'node %(node)s. Agent indicated it is presently '
                     'executing a command. Error: %(error)s',
                     {'node': task.node.uuid,
                      'error': e})
            node.set_driver_internal_info('skip_current_deploy_step',
                                          False)
            task.process_event('wait')
            return
        except exception.IronicException as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('deployment_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s after '
                             'deployment reboot, waiting for agent to come up '
                             'to run next deploy step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    node.set_driver_internal_info('skip_current_deploy_step',
                                                  False)
                    task.process_event('wait')
                    return

            # Avoid double handling of failures. For example, set_failed_state
            # from deploy_utils already calls deploying_error_handler.
            if task.node.provision_state != states.DEPLOYFAIL:
                log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
                           '%(err)s' % {'node': node.uuid,
                                        'step': node.deploy_step, 'err': e})
                utils.deploying_error_handler(
                    task, log_msg,
                    _("Deploy step %(step)s failed: %(err)s.")
                    % {'step': conductor_steps.step_id(step), 'err': e})
            return
        except Exception as e:
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
            utils.deploying_error_handler(
                task, log_msg,
                _("Deploy step %(step)s failed with %(exc)s: %(err)s.")
                % {'step': conductor_steps.step_id(step), 'err': e,
                   'exc': e.__class__.__name__},
                traceback=True)
            return

        if task.node.provision_state == states.DEPLOYFAIL:
            # NOTE(dtantsur): some deploy steps do not raise but rather update
            # the node and return. Take them into account.
            LOG.debug('Node %s is in error state, not processing '
                      'the remaining deploy steps', task.node)
            return

        # Check if the step is done or not. The step should return
        # states.DEPLOYWAIT if the step is still being executed, or
        # None if the step is done.
        # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT
        #                eg. if they are waiting for a callback
        if result == states.DEPLOYWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_deploy() to continue deploying
            LOG.info('Deploy step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            if task.node.provision_state != states.DEPLOYWAIT:
                task.process_event('wait')
            return
        elif result is not None:
            # NOTE(rloo): This is an internal/dev error; shouldn't happen.
            log_msg = (_('While executing deploy step %(step)s on node '
                       '%(node)s, step returned unexpected state: %(val)s')
                       % {'step': step, 'node': node.uuid, 'val': result})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return

        LOG.info('Node %(node)s finished deploy step %(step)s',
                 {'node': node.uuid, 'step': step})

    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    utils.wipe_deploy_internal_info(task)
    node.save()

    _start_console_in_deploy(task)

    task.process_event('done')
    LOG.info('Successfully deployed node %(node)s with '
             'instance %(instance)s.',
             {'node': node.uuid, 'instance': node.instance_uuid})


@task_manager.require_exclusive_lock
def validate_deploy_steps(task):
    """Validate the deploy steps after the ramdisk learns about them."""
    conductor_steps.validate_user_deploy_steps_and_templates(task)
    conductor_steps.set_node_deployment_steps(
        task, reset_current=False)
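    # reset_current=False keeps the node's current deploy step and step index
    # intact, so recalculating the step list mid-deployment does not restart
    # the deployment from the first step.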

    task.node.set_driver_internal_info('steps_validated', True)
    task.node.save()


@utils.fail_on_error(utils.deploying_error_handler,
                     _("Unexpected error when processing next deploy step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def continue_node_deploy(task):
    """Continue deployment after finishing an async deploy step.

    This function calculates which step has to run next and passes control
    into do_next_deploy_step. On the first run, deploy steps and templates are
    also validated.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node

    # Agent is now running, we're ready to validate the remaining steps
    if not task.node.driver_internal_info.get('steps_validated'):
        try:
            validate_deploy_steps(task)
        except exception.IronicException as exc:
            msg = _('Failed to validate the final deploy steps list '
                    'for node %(node)s: %(exc)s') % {'node': node.uuid,
                                                     'exc': exc}
            return utils.deploying_error_handler(task, msg)

    next_step_index = utils.update_next_step_index(task, 'deploy')
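    # update_next_step_index() honors the 'skip_current_deploy_step' flag set
    # in do_next_deploy_step() above, so a step interrupted while waiting for
    # the agent is re-run rather than skipped.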

    do_next_deploy_step(task, next_step_index)


def _get_configdrive_obj_name(node):
    """Generate the object name for the config drive."""
    return 'configdrive-%s' % node.uuid
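
# Example (with a hypothetical node UUID): a node whose UUID is
# '1be26c0b-03f2-4d2e-ae87-c02d7f33c123' would get the object name
# 'configdrive-1be26c0b-03f2-4d2e-ae87-c02d7f33c123'.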


def _store_configdrive(node, configdrive):
    """Handle the storage of the config drive.

    If configured, the config drive data are uploaded to a swift endpoint.
    The Node's instance_info is updated to include either the temporary
    Swift URL from the upload, or if no upload, the actual config drive data.

    :param node: an Ironic node object.
    :param configdrive: A gzipped and base64 encoded configdrive.
    :raises: SwiftOperationError if an error occurs when uploading the
             config drive to the swift endpoint.
    :raises: ConfigInvalid if the keystone authorization credentials
             required for swift are missing.
    """
    if CONF.deploy.configdrive_use_object_store:
        # Don't store the JSON source in swift.
        if isinstance(configdrive, dict):
            configdrive = utils.build_configdrive(node, configdrive)

        # NOTE(lucasagomes): No reason to use a different timeout than
        # the one used for deploying the node
        timeout = (CONF.conductor.configdrive_swift_temp_url_duration
                   or CONF.conductor.deploy_callback_timeout
                   # The documented default in ironic.conf.conductor
                   or 1800)
        container = CONF.conductor.configdrive_swift_container
        object_name = _get_configdrive_obj_name(node)

        object_headers = {'X-Delete-After': str(timeout)}
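        # X-Delete-After makes Swift expire the object automatically after
        # 'timeout' seconds, so a stale config drive is not left behind in
        # the object store.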

        with tempfile.NamedTemporaryFile(dir=CONF.tempdir,
                                         mode="wt") as fileobj:
            fileobj.write(configdrive)
            fileobj.flush()

            swift_api = swift.SwiftAPI()
            swift_api.create_object(container, object_name, fileobj.name,
                                    object_headers=object_headers)
            configdrive = swift_api.get_temp_url(container, object_name,
                                                 timeout)

    i_info = node.instance_info
    i_info['configdrive'] = configdrive
    node.instance_info = i_info
    node.save()


def _start_console_in_deploy(task):
    """Start console at the end of deployment.

    Console is stopped at tearing down not to be exposed to an instance user.
    Then, restart at deployment.

    :param task: a TaskManager instance with an exclusive lock
    """

    if not task.node.console_enabled:
        return

    notify_utils.emit_console_notification(
        task, 'console_restore', fields.NotificationStatus.START)
    try:
        task.driver.console.start_console(task)
    except Exception as err:
        msg = (_('Failed to start console while deploying the '
                 'node %(node)s: %(err)s.') % {'node': task.node.uuid,
                                               'err': err})
        LOG.error(msg)
        task.node.last_error = msg
        task.node.console_enabled = False
        task.node.save()
        notify_utils.emit_console_notification(
            task, 'console_restore', fields.NotificationStatus.ERROR)
    else:
        notify_utils.emit_console_notification(
            task, 'console_restore', fields.NotificationStatus.END)