ironic/conductor/deployments.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404

#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

"""Functionality related to deploying and undeploying."""

import tempfile

from ironic_lib import metrics_utils
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils

from ironic.common import exception
from ironic.common.glance_service import service_utils as glance_utils
from ironic.common.i18n import _
from ironic.common import images
from ironic.common import states
from ironic.common import swift
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.objects import fields

LOG = log.getLogger(__name__)

METRICS = metrics_utils.get_metrics_logger(__name__)


def validate_node(task, event='deploy'):
    """Validate that a node is suitable for deployment/rebuilding.

    :param task: a TaskManager instance.
    :param event: event to process: deploy or rebuild.
    :raises: NodeInMaintenance, NodeProtected, InvalidStateRequested
    """
    if task.node.maintenance:
        raise exception.NodeInMaintenance(op=_('provisioning'),
                                          node=task.node.uuid)

    if event == 'rebuild' and task.node.protected:
        raise exception.NodeProtected(node=task.node.uuid)

    if not task.fsm.is_actionable_event(event):
        raise exception.InvalidStateRequested(
            action=event, node=task.node.uuid, state=task.node.provision_state)


@METRICS.timer('start_deploy')
@task_manager.require_exclusive_lock
def start_deploy(task, manager, configdrive=None, event='deploy'):
    """Start deployment or rebuilding on a node.

    This function does not check the node suitability for deployment, it's left
    up to the caller.

    :param task: a TaskManager instance.
    :param manager: a ConductorManager to run tasks on.
    :param configdrive: a configdrive, if requested.
    :param event: event to process: deploy or rebuild.
    """
    node = task.node

    if event == 'rebuild':
        # Note(gilliard) Clear these to force the driver to
        # check whether they have been changed in glance
        # NOTE(vdrok): If image_source is not from Glance we should
        # not clear kernel and ramdisk as they're input manually
        if glance_utils.is_glance_image(
                node.instance_info.get('image_source')):
            instance_info = node.instance_info
            instance_info.pop('kernel', None)
            instance_info.pop('ramdisk', None)
            node.instance_info = instance_info

    # Infer the image type to make sure the deploy driver
    # validates only the necessary variables for different
    # image types.
    # NOTE(sirushtim): The iwdi variable can be None. It's up to
    # the deploy driver to validate this.
    iwdi = images.is_whole_disk_image(task.context, node.instance_info)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['is_whole_disk_image'] = iwdi
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        task.driver.power.validate(task)
        task.driver.deploy.validate(task)
        utils.validate_instance_info_traits(task.node)
        conductor_steps.validate_deploy_templates(task, skip_missing=True)
    except exception.InvalidParameterValue as e:
        raise exception.InstanceDeployFailure(
            _("Failed to validate deploy or power info for node "
              "%(node_uuid)s. Error: %(msg)s") %
            {'node_uuid': node.uuid, 'msg': e}, code=e.code)

    try:
        task.process_event(
            event,
            callback=manager._spawn_worker,
            call_args=(do_node_deploy, task,
                       manager.conductor.id, configdrive),
            err_handler=utils.provisioning_error_handler)
    except exception.InvalidState:
        raise exception.InvalidStateRequested(
            action=event, node=task.node.uuid,
            state=task.node.provision_state)


@METRICS.timer('do_node_deploy')
@task_manager.require_exclusive_lock
def do_node_deploy(task, conductor_id=None, configdrive=None):
    """Prepare the environment and deploy a node."""
    node = task.node
    utils.wipe_deploy_internal_info(task)
    try:
        if configdrive:
            if isinstance(configdrive, dict):
                configdrive = utils.build_configdrive(node, configdrive)
            _store_configdrive(node, configdrive)
    except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while uploading the configdrive for %(node)s '
                 'to Swift') % {'node': node.uuid},
                _('Failed to upload the configdrive to Swift. '
                  'Error: %s') % e,
                clean_up=False)
    except db_exception.DBDataError as e:
        with excutils.save_and_reraise_exception():
            # NOTE(hshiina): This error happens when the configdrive is
            #                too large. Remove the configdrive from the
            #                object to update DB successfully in handling
            #                the failure.
            node.obj_reset_changes()
            utils.deploying_error_handler(
                task,
                ('Error while storing the configdrive for %(node)s into '
                 'the database: %(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to store the configdrive in the database. "
                  "%s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing the configdrive for '
                 'node %(node)s') % {'node': node.uuid},
                _("Failed to prepare the configdrive. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        task.driver.deploy.prepare(task)
    except exception.IronicException as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while preparing to deploy to node %(node)s: '
                 '%(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to prepare to deploy: %s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing to deploy to node '
                 '%(node)s') % {'node': node.uuid},
                _("Failed to prepare to deploy. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        # This gets the deploy steps (if any) and puts them in the node's
        # driver_internal_info['deploy_steps']. In-band steps are skipped since
        # we know that an agent is not running yet.
        conductor_steps.set_node_deployment_steps(task, skip_missing=True)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                'Error while getting deploy steps; cannot deploy to node '
                '%(node)s. Error: %(err)s' % {'node': node.uuid, 'err': e},
                _("Cannot get deploy steps; failed to deploy: %s") % e)

    if not node.driver_internal_info.get('deploy_steps'):
        msg = _('Error while getting deploy steps: no steps returned for '
                'node %s') % node.uuid
        utils.deploying_error_handler(
            task, msg,
            _("No deploy steps returned by the driver"))
        raise exception.InstanceDeployFailure(msg)

    do_next_deploy_step(task, 0, conductor_id)


@utils.fail_on_error(utils.deploying_error_handler,
                     _("Unexpected error when processing next deploy step"))
@task_manager.require_exclusive_lock
def do_next_deploy_step(task, step_index, conductor_id):
    """Do deployment, starting from the specified deploy step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first deploy step in the list to execute. This
        is the index (from 0) into the list of deploy steps in the node's
        driver_internal_info['deploy_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['deploy_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.deploy_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['deploy_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_deploy_step(task, step)
        except exception.IronicException as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('deployment_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s after '
                             'deployment reboot, waiting for agent to come up '
                             'to run next deploy step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_deploy_step'] = False
                    node.driver_internal_info = driver_internal_info
                    task.process_event('wait')
                    return
            log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
                       '%(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: Deploy step %(step)s, "
                  "error: %(err)s.") % {
                    'step': node.deploy_step,
                    'err': e})
            return
        except Exception as e:
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy. Exception: %s") % e, traceback=True)
            return

        if task.node.provision_state == states.DEPLOYFAIL:
            # NOTE(dtantsur): some deploy steps do not raise but rather update
            # the node and return. Take them into account.
            LOG.debug('Node %s is in error state, not processing '
                      'the remaining deploy steps', task.node)
            return

        if ind == 0:
            # We've done the very first deploy step.
            # Update conductor_affinity to reference this conductor's ID
            # since there may be local persistent state
            node.conductor_affinity = conductor_id
            node.save()

        # Check if the step is done or not. The step should return
        # states.DEPLOYWAIT if the step is still being executed, or
        # None if the step is done.
        # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT
        #                eg. if they are waiting for a callback
        if result == states.DEPLOYWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_deploy() to continue deploying
            LOG.info('Deploy step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            if task.node.provision_state != states.DEPLOYWAIT:
                task.process_event('wait')
            return
        elif result is not None:
            # NOTE(rloo): This is an internal/dev error; shouldn't happen.
            log_msg = (_('While executing deploy step %(step)s on node '
                       '%(node)s, step returned unexpected state: %(val)s')
                       % {'step': step, 'node': node.uuid, 'val': result})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return

        LOG.info('Node %(node)s finished deploy step %(step)s',
                 {'node': node.uuid, 'step': step})

    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    utils.wipe_deploy_internal_info(task)
    node.save()

    _start_console_in_deploy(task)

    task.process_event('done')
    LOG.info('Successfully deployed node %(node)s with '
             'instance %(instance)s.',
             {'node': node.uuid, 'instance': node.instance_uuid})


def _get_configdrive_obj_name(node):
    """Generate the object name for the config drive."""
    return 'configdrive-%s' % node.uuid


def _store_configdrive(node, configdrive):
    """Handle the storage of the config drive.

    If configured, the config drive data are uploaded to a swift endpoint.
    The Node's instance_info is updated to include either the temporary
    Swift URL from the upload, or if no upload, the actual config drive data.

    :param node: an Ironic node object.
    :param configdrive: A gzipped and base64 encoded configdrive.
    :raises: SwiftOperationError if an error occur when uploading the
             config drive to the swift endpoint.
    :raises: ConfigInvalid if required keystone authorization credentials
             with swift are missing.


    """
    if CONF.deploy.configdrive_use_object_store:
        # NOTE(lucasagomes): No reason to use a different timeout than
        # the one used for deploying the node
        timeout = (CONF.conductor.configdrive_swift_temp_url_duration
                   or CONF.conductor.deploy_callback_timeout
                   # The documented default in ironic.conf.conductor
                   or 1800)
        container = CONF.conductor.configdrive_swift_container
        object_name = _get_configdrive_obj_name(node)

        object_headers = {'X-Delete-After': str(timeout)}

        with tempfile.NamedTemporaryFile(dir=CONF.tempdir) as fileobj:
            fileobj.write(configdrive)
            fileobj.flush()

            swift_api = swift.SwiftAPI()
            swift_api.create_object(container, object_name, fileobj.name,
                                    object_headers=object_headers)
            configdrive = swift_api.get_temp_url(container, object_name,
                                                 timeout)

    i_info = node.instance_info
    i_info['configdrive'] = configdrive
    node.instance_info = i_info
    node.save()


def _start_console_in_deploy(task):
    """Start console at the end of deployment.

    Console is stopped at tearing down not to be exposed to an instance user.
    Then, restart at deployment.

    :param task: a TaskManager instance with an exclusive lock
    """

    if not task.node.console_enabled:
        return

    notify_utils.emit_console_notification(
        task, 'console_restore', fields.NotificationStatus.START)
    try:
        task.driver.console.start_console(task)
    except Exception as err:
        msg = (_('Failed to start console while deploying the '
                 'node %(node)s: %(err)s.') % {'node': task.node.uuid,
                                               'err': err})
        LOG.error(msg)
        task.node.last_error = msg
        task.node.console_enabled = False
        task.node.save()
        notify_utils.emit_console_notification(
            task, 'console_restore', fields.NotificationStatus.ERROR)
    else:
        notify_utils.emit_console_notification(
            task, 'console_restore', fields.NotificationStatus.END)