diff options
24 files changed, 392 insertions, 172 deletions
diff --git a/doc/source/admin/fast-track.rst b/doc/source/admin/fast-track.rst index 20ca6199f..e42942818 100644 --- a/doc/source/admin/fast-track.rst +++ b/doc/source/admin/fast-track.rst @@ -15,6 +15,19 @@ provisioned within a short period of time. the ``noop`` networking. The case where inspection, cleaning and provisioning networks are different is not supported. +.. note:: + Fast track mode is very sensitive to long-running processes on the conductor + side that may prevent agent heartbeats from being registered. + + For example, converting a large image to the raw format may take long enough + to reach the fast track timeout. In this case, you can either :ref:`use raw + images <stream_raw_images>` or move the conversion to the agent side with: + + .. code-block:: ini + + [DEFAULT] + force_raw_images = False + Enabling ======== diff --git a/doc/source/admin/interfaces/deploy.rst b/doc/source/admin/interfaces/deploy.rst index 7db5a24ff..79d004ad0 100644 --- a/doc/source/admin/interfaces/deploy.rst +++ b/doc/source/admin/interfaces/deploy.rst @@ -81,6 +81,46 @@ accessible from HTTP service. Please refer to configuration option ``FollowSymLinks`` if you are using Apache HTTP server, or ``disable_symlinks`` if Nginx HTTP server is in use. +.. _stream_raw_images: + +Streaming raw images +-------------------- + +The Bare Metal service is capable of streaming raw images directly to the +target disk of a node, without caching them in the node's RAM. When the source +image is not already raw, the conductor will convert the image and calculate +the new checksum. + +.. note:: + If no algorithm is specified via the ``image_os_hash_algo`` field, or if + this field is set to ``md5``, SHA256 is used for the updated checksum. + +For HTTP or local file images that are already raw, you need to explicitly set +the disk format to prevent the checksum from being unnecessarily re-calculated. +For example: + +.. 
code-block:: shell + + baremetal node set <node> \ + --instance-info image_source=http://server/myimage.img \ + --instance-info image_os_hash_algo=sha512 \ + --instance-info image_os_hash_value=<SHA512 of the raw image> \ + --instance-info image_disk_format=raw + +To disable this feature and cache images in the node's RAM, set + +.. code-block:: ini + + [agent] + stream_raw_images = False + +To disable the conductor-side conversion completely, set + +.. code-block:: ini + + [DEFAULT] + force_raw_images = False + .. _ansible-deploy: Ansible deploy diff --git a/doc/source/install/refarch/common.rst b/doc/source/install/refarch/common.rst index 800632fd5..ce0dedfb1 100644 --- a/doc/source/install/refarch/common.rst +++ b/doc/source/install/refarch/common.rst @@ -277,9 +277,8 @@ the space requirements are different: In both cases a cached image is converted to raw if ``force_raw_images`` is ``True`` (the default). - .. note:: - ``image_download_source`` can also be provided in the node's - ``driver_info`` or ``instance_info``. See :ref:`image_download_source`. + See :ref:`image_download_source` and :ref:`stream_raw_images` for more + details. * When network boot is used, the instance image kernel and ramdisk are cached locally while the instance is active. diff --git a/ironic/api/controllers/v1/ramdisk.py b/ironic/api/controllers/v1/ramdisk.py index 5feef5e02..b98eb7dc2 100644 --- a/ironic/api/controllers/v1/ramdisk.py +++ b/ironic/api/controllers/v1/ramdisk.py @@ -131,13 +131,17 @@ class LookupController(rest.RestController): else: node = objects.Node.get_by_port_addresses( api.request.context, valid_addresses) - except exception.NotFound: + except exception.NotFound as e: # NOTE(dtantsur): we are reraising the same exception to make sure # we don't disclose the difference between nodes that are not found # at all and nodes in a wrong state by different error messages. 
+ LOG.error('No node has been found during lookup: %s', e) raise exception.NotFound() if CONF.api.restrict_lookup and not self.lookup_allowed(node): + LOG.error('Lookup is not allowed for node %(node)s in the ' + 'provision state %(state)s', + {'node': node.uuid, 'state': node.provision_state}) raise exception.NotFound() if api_utils.allow_agent_token(): diff --git a/ironic/common/image_service.py b/ironic/common/image_service.py index 5280ee6bc..4b95b5152 100644 --- a/ironic/common/image_service.py +++ b/ironic/common/image_service.py @@ -34,10 +34,6 @@ from ironic.common import utils from ironic.conf import CONF IMAGE_CHUNK_SIZE = 1024 * 1024 # 1mb -# NOTE(kaifeng) Image will be truncated to 2GiB by sendfile, -# we use a large chunk size here for a better performance -# while keep the chunk size less than the size limit. -SENDFILE_CHUNK_SIZE = 1024 * 1024 * 1024 # 1Gb LOG = log.getLogger(__name__) @@ -264,26 +260,23 @@ class FileImageService(BaseImageService): """ source_image_path = self.validate_href(image_href) dest_image_path = image_file.name - local_device = os.stat(dest_image_path).st_dev try: - # We should have read and write access to source file to create - # hard link to it. - if (local_device == os.stat(source_image_path).st_dev - and os.access(source_image_path, os.R_OK | os.W_OK)): - image_file.close() - os.remove(dest_image_path) + image_file.close() + os.remove(dest_image_path) + + try: os.link(source_image_path, dest_image_path) + except OSError as exc: + LOG.debug('Could not create a link from %(src)s to %(dest)s, ' + 'will copy the content instead. 
Error: %(exc)s.', + {'src': source_image_path, 'dest': dest_image_path, + 'exc': exc}) else: - filesize = os.path.getsize(source_image_path) - offset = 0 - with open(source_image_path, 'rb') as input_img: - while offset < filesize: - count = min(SENDFILE_CHUNK_SIZE, filesize - offset) - nbytes_out = os.sendfile(image_file.fileno(), - input_img.fileno(), - offset, - count) - offset += nbytes_out + return + + # NOTE(dtantsur): starting with Python 3.8, copyfile() uses + # efficient copying (i.e. sendfile) under the hood. + shutil.copyfile(source_image_path, dest_image_path) except Exception as e: raise exception.ImageDownloadFailed(image_href=image_href, reason=str(e)) diff --git a/ironic/common/states.py b/ironic/common/states.py index 89b710189..f2238b41b 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -269,6 +269,9 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE, FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) """States where API lookups are permitted with fast track enabled.""" +FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL, + RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL)) + ############## # Power states diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index 53d66ddd8..5f69a4ab9 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -247,12 +247,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): task.process_event(event) +def get_last_error(node): + last_error = _('By request, the clean operation was aborted') + if node.clean_step: + last_error += ( + _(' during or after the completion of step "%s"') + % conductor_steps.step_id(node.clean_step) + ) + return last_error + + @task_manager.require_exclusive_lock -def do_node_clean_abort(task, step_name=None): +def do_node_clean_abort(task): """Internal method to abort an ongoing operation. 
:param task: a TaskManager instance with an exclusive lock - :param step_name: The name of the clean step. """ node = task.node try: @@ -270,12 +279,13 @@ def do_node_clean_abort(task, step_name=None): set_fail_state=False) return + last_error = get_last_error(node) info_message = _('Clean operation aborted for node %s') % node.uuid - last_error = _('By request, the clean operation was aborted') - if step_name: - msg = _(' after the completion of step "%s"') % step_name - last_error += msg - info_message += msg + if node.clean_step: + info_message += ( + _(' during or after the completion of step "%s"') + % node.clean_step + ) node.last_error = last_error node.clean_step = None @@ -317,7 +327,7 @@ def continue_node_clean(task): target_state = None task.process_event('fail', target_state=target_state) - do_node_clean_abort(task, step_name) + do_node_clean_abort(task) return LOG.debug('The cleaning operation for node %(node)s was ' diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 7e98459ff..a50722597 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1348,7 +1348,8 @@ class ConductorManager(base_manager.BaseConductorManager): callback=self._spawn_worker, call_args=(cleaning.do_node_clean_abort, task), err_handler=utils.provisioning_error_handler, - target_state=target_state) + target_state=target_state, + last_error=cleaning.get_last_error(node)) return if node.provision_state == states.RESCUEWAIT: diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py index 509c9ce92..922e74cf6 100644 --- a/ironic/conductor/task_manager.py +++ b/ironic/conductor/task_manager.py @@ -527,7 +527,8 @@ class TaskManager(object): self.release_resources() def process_event(self, event, callback=None, call_args=None, - call_kwargs=None, err_handler=None, target_state=None): + call_kwargs=None, err_handler=None, target_state=None, + last_error=None): """Process the given event for the task's current state. 
:param event: the name of the event to process @@ -540,6 +541,8 @@ class TaskManager(object): prev_target_state) :param target_state: if specified, the target provision state for the node. Otherwise, use the target state from the fsm + :param last_error: last error to set on the node together with + the state transition. :raises: InvalidState if the event is not allowed by the associated state machine """ @@ -572,13 +575,15 @@ class TaskManager(object): # set up the async worker if callback: - # clear the error if we're going to start work in a callback - self.node.last_error = None + # update the error if we're going to start work in a callback + self.node.last_error = last_error if call_args is None: call_args = () if call_kwargs is None: call_kwargs = {} self.spawn_after(callback, *call_args, **call_kwargs) + elif last_error is not None: + self.node.last_error = last_error # publish the state transition by saving the Node self.node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index c107f076f..2272c0df7 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -297,14 +297,23 @@ def node_power_action(task, new_state, timeout=None): node = task.node if _can_skip_state_change(task, new_state): + # NOTE(TheJulia): Even if we are not changing the power state, + # we need to wipe the token out, just in case for some reason + # the power was turned off outside of our interaction/management. + if new_state in (states.POWER_OFF, states.SOFT_POWER_OFF, + states.REBOOT, states.SOFT_REBOOT): + wipe_internal_info_on_power_off(node) + node.save() return target_state = _calculate_target_state(new_state) # Set the target_power_state and clear any last_error, if we're # starting a new operation. This will expose to other processes - # and clients that work is in progress. - node['target_power_state'] = target_state - node['last_error'] = None + # and clients that work is in progress. 
Keep the last_error intact + # if the power action happens as a result of a failure. + node.target_power_state = target_state + if node.provision_state not in states.FAILURE_STATES: + node.last_error = None node.timestamp_driver_internal_info('last_power_state_change') # NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in # fast-track (or an accidentally booted agent) will cause subsequent @@ -479,9 +488,9 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, node.del_driver_internal_info('cleaning_reboot') node.del_driver_internal_info('cleaning_polling') node.del_driver_internal_info('skip_current_clean_step') - # We don't need to keep the old agent URL + # We don't need to keep the old agent URL, or token # as it should change upon the next cleaning attempt. - node.del_driver_internal_info('agent_url') + wipe_token_and_url(task) # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index c14719af8..2618f106c 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -1716,9 +1716,12 @@ class Connection(api.Connection): max_to_migrate = max_count or total_to_migrate for model in sql_models: + use_node_id = False + if (not hasattr(model, 'id') and hasattr(model, 'node_id')): + use_node_id = True version = mapping[model.__name__][0] num_migrated = 0 - with _session_for_write(): + with _session_for_write() as session: query = model_query(model).filter(model.version != version) # NOTE(rloo) Caution here; after doing query.count(), it is # possible that the value is different in the @@ -1729,13 +1732,27 @@ class Connection(api.Connection): # max_to_migrate objects. ids = [] for obj in query.slice(0, max_to_migrate): - ids.append(obj['id']) - num_migrated = ( - model_query(model). 
- filter(sql.and_(model.id.in_(ids), - model.version != version)). - update({model.version: version}, - synchronize_session=False)) + if not use_node_id: + ids.append(obj['id']) + else: + # BIOSSettings, NodeTrait, NodeTag do not have id + # columns, fallback to node_id as they both have + # it. + ids.append(obj['node_id']) + if not use_node_id: + num_migrated = ( + session.query(model). + filter(sql.and_(model.id.in_(ids), + model.version != version)). + update({model.version: version}, + synchronize_session=False)) + else: + num_migrated = ( + session.query(model). + filter(sql.and_(model.node_id.in_(ids), + model.version != version)). + update({model.version: version}, + synchronize_session=False)) else: num_migrated = ( model_query(model). diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 13f91e9cd..fd83f9f08 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -1093,6 +1093,11 @@ def _cache_and_convert_image(task, instance_info, image_info=None): _, image_path = cache_instance_image(task.context, task.node, force_raw=force_raw) if force_raw or image_info is None: + if image_info is None: + initial_format = instance_info.get('image_disk_format') + else: + initial_format = image_info.get('disk_format') + if force_raw: instance_info['image_disk_format'] = 'raw' else: @@ -1108,21 +1113,29 @@ def _cache_and_convert_image(task, instance_info, image_info=None): # sha256. 
if image_info is None: os_hash_algo = instance_info.get('image_os_hash_algo') + hash_value = instance_info.get('image_os_hash_value') + old_checksum = instance_info.get('image_checksum') else: os_hash_algo = image_info.get('os_hash_algo') + hash_value = image_info.get('os_hash_value') + old_checksum = image_info.get('checksum') + + if initial_format != instance_info['image_disk_format']: + if not os_hash_algo or os_hash_algo == 'md5': + LOG.debug("Checksum algorithm for image %(image)s for node " + "%(node)s is set to '%(algo)s', changing to sha256", + {'algo': os_hash_algo, 'node': task.node.uuid, + 'image': image_path}) + os_hash_algo = 'sha256' + + LOG.debug('Recalculating checksum for image %(image)s for node ' + '%(node)s due to image conversion', + {'image': image_path, 'node': task.node.uuid}) + instance_info['image_checksum'] = None + hash_value = compute_image_checksum(image_path, os_hash_algo) + else: + instance_info['image_checksum'] = old_checksum - if not os_hash_algo or os_hash_algo == 'md5': - LOG.debug("Checksum algorithm for image %(image)s for node " - "%(node)s is set to '%(algo)s', changing to 'sha256'", - {'algo': os_hash_algo, 'node': task.node.uuid, - 'image': image_path}) - os_hash_algo = 'sha256' - - LOG.debug('Recalculating checksum for image %(image)s for node ' - '%(node)s due to image conversion', - {'image': image_path, 'node': task.node.uuid}) - instance_info['image_checksum'] = None - hash_value = compute_image_checksum(image_path, os_hash_algo) instance_info['image_os_hash_algo'] = os_hash_algo instance_info['image_os_hash_value'] = hash_value else: diff --git a/ironic/tests/unit/common/test_image_service.py b/ironic/tests/unit/common/test_image_service.py index 197518e1a..297a1b4b9 100644 --- a/ironic/tests/unit/common/test_image_service.py +++ b/ironic/tests/unit/common/test_image_service.py @@ -10,7 +10,6 @@ # License for the specific language governing permissions and limitations # under the License. 
-import builtins import datetime from http import client as http_client import io @@ -483,119 +482,55 @@ class FileImageServiceTestCase(base.TestCase): 'properties': {}, 'no_cache': True}, result) + @mock.patch.object(shutil, 'copyfile', autospec=True) @mock.patch.object(os, 'link', autospec=True) @mock.patch.object(os, 'remove', autospec=True) - @mock.patch.object(os, 'access', return_value=True, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_hard_link(self, _validate_mock, stat_mock, access_mock, - remove_mock, link_mock): + def test_download_hard_link(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' file_mock = mock.Mock(spec=io.BytesIO) file_mock.name = 'file' self.service.download(self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) remove_mock.assert_called_once_with('file') link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_not_called() - @mock.patch.object(os, 'sendfile', return_value=42, autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) - @mock.patch.object(image_service.FileImageService, 'validate_href', - autospec=True) - def test_download_copy(self, _validate_mock, stat_mock, access_mock, - open_mock, size_mock, copy_mock): - _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' - file_mock = mock.MagicMock(spec=io.BytesIO) - file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock - 
self.service.download(self.href, file_mock) - _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - copy_mock.assert_called_once_with(file_mock.fileno(), - input_mock.__enter__().fileno(), - 0, 42) - - @mock.patch.object(os, 'sendfile', autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) + @mock.patch.object(shutil, 'copyfile', autospec=True) + @mock.patch.object(os, 'link', autospec=True) + @mock.patch.object(os, 'remove', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_copy_segmented(self, _validate_mock, stat_mock, - access_mock, open_mock, size_mock, - copy_mock): - # Fake a 3G + 1k image - chunk_size = image_service.SENDFILE_CHUNK_SIZE - fake_image_size = chunk_size * 3 + 1024 - fake_chunk_seq = [chunk_size, chunk_size, chunk_size, 1024] + def test_download_copy(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' + link_mock.side_effect = PermissionError file_mock = mock.MagicMock(spec=io.BytesIO) file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock - size_mock.return_value = fake_image_size - copy_mock.side_effect = fake_chunk_seq self.service.download(self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - copy_calls = [mock.call(file_mock.fileno(), - input_mock.__enter__().fileno(), - chunk_size * i, - fake_chunk_seq[i]) for i in range(4)] - 
copy_mock.assert_has_calls(copy_calls) - size_mock.assert_called_once_with(self.href_path) - - @mock.patch.object(os, 'remove', side_effect=OSError, autospec=True) - @mock.patch.object(os, 'access', return_value=True, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) - @mock.patch.object(image_service.FileImageService, 'validate_href', - autospec=True) - def test_download_hard_link_fail(self, _validate_mock, stat_mock, - access_mock, remove_mock): - _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' - file_mock = mock.MagicMock(spec=io.BytesIO) - file_mock.name = 'file' - self.assertRaises(exception.ImageDownloadFailed, - self.service.download, self.href, file_mock) - _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) + link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_called_once_with(self.href_path, 'file') - @mock.patch.object(os, 'sendfile', side_effect=OSError, autospec=True) - @mock.patch.object(os.path, 'getsize', return_value=42, autospec=True) - @mock.patch.object(builtins, 'open', autospec=True) - @mock.patch.object(os, 'access', return_value=False, autospec=True) - @mock.patch.object(os, 'stat', autospec=True) + @mock.patch.object(shutil, 'copyfile', autospec=True) + @mock.patch.object(os, 'link', autospec=True) + @mock.patch.object(os, 'remove', autospec=True) @mock.patch.object(image_service.FileImageService, 'validate_href', autospec=True) - def test_download_copy_fail(self, _validate_mock, stat_mock, access_mock, - open_mock, size_mock, copy_mock): + def test_download_copy_fail(self, _validate_mock, remove_mock, link_mock, + copy_mock): _validate_mock.return_value = self.href_path - stat_mock.return_value.st_dev = 'dev1' + link_mock.side_effect = PermissionError + copy_mock.side_effect = PermissionError file_mock = mock.MagicMock(spec=io.BytesIO) 
file_mock.name = 'file' - input_mock = mock.MagicMock(spec=io.BytesIO) - open_mock.return_value = input_mock self.assertRaises(exception.ImageDownloadFailed, self.service.download, self.href, file_mock) _validate_mock.assert_called_once_with(mock.ANY, self.href) - self.assertEqual(2, stat_mock.call_count) - access_mock.assert_called_once_with(self.href_path, os.R_OK | os.W_OK) - size_mock.assert_called_once_with(self.href_path) + link_mock.assert_called_once_with(self.href_path, 'file') + copy_mock.assert_called_once_with(self.href_path, 'file') class ServiceGetterTestCase(base.TestCase): diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index a4c3d57b6..34e805deb 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -1138,12 +1138,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase): class DoNodeCleanAbortTestCase(db_base.DbTestCase): @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) - def _test__do_node_clean_abort(self, step_name, tear_mock): + def _test_do_node_clean_abort(self, clean_step, tear_mock): node = obj_utils.create_test_node( self.context, driver='fake-hardware', - provision_state=states.CLEANFAIL, + provision_state=states.CLEANWAIT, target_provision_state=states.AVAILABLE, - clean_step={'step': 'foo', 'abortable': True}, + clean_step=clean_step, driver_internal_info={ 'agent_url': 'some url', 'agent_secret_token': 'token', @@ -1153,11 +1153,11 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase): 'skip_current_clean_step': True}) with task_manager.acquire(self.context, node.uuid) as task: - cleaning.do_node_clean_abort(task, step_name=step_name) + cleaning.do_node_clean_abort(task) self.assertIsNotNone(task.node.last_error) tear_mock.assert_called_once_with(task.driver.deploy, task) - if step_name: - self.assertIn(step_name, task.node.last_error) + if clean_step: + self.assertIn(clean_step['step'], 
task.node.last_error) # assert node's clean_step and metadata was cleaned up self.assertEqual({}, task.node.clean_step) self.assertNotIn('clean_step_index', @@ -1173,11 +1173,12 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase): self.assertNotIn('agent_secret_token', task.node.driver_internal_info) - def test__do_node_clean_abort(self): - self._test__do_node_clean_abort(None) + def test_do_node_clean_abort_early(self): + self._test_do_node_clean_abort(None) - def test__do_node_clean_abort_with_step_name(self): - self._test__do_node_clean_abort('foo') + def test_do_node_clean_abort_with_step(self): + self._test_do_node_clean_abort({'step': 'foo', 'interface': 'deploy', + 'abortable': True}) @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) def test__do_node_clean_abort_tear_down_fail(self, tear_mock): diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 5d84dbbef..ece5a8a58 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2733,7 +2733,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, # Node will be moved to tgt_prov_state after cleaning, not tested here self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) - self.assertIsNone(node.last_error) + self.assertEqual('By request, the clean operation was aborted', + node.last_error) mock_spawn.assert_called_with( self.service, cleaning.do_node_clean_abort, mock.ANY) diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index a424e5132..52fc72436 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -196,7 +196,8 @@ class NodePowerActionTestCase(db_base.DbTestCase): node = obj_utils.create_test_node(self.context, uuid=uuidutils.generate_uuid(), driver='fake-hardware', - power_state=states.POWER_OFF) + 
power_state=states.POWER_OFF, + last_error='failed before') task = task_manager.TaskManager(self.context, node.uuid) get_power_mock.return_value = states.POWER_OFF @@ -209,6 +210,27 @@ class NodePowerActionTestCase(db_base.DbTestCase): self.assertIsNone(node['target_power_state']) self.assertIsNone(node['last_error']) + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_keep_last_error(self, get_power_mock): + """Test node_power_action to keep last_error for failed states.""" + node = obj_utils.create_test_node(self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + provision_state=states.CLEANFAIL, + last_error='failed before') + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.return_value = states.POWER_OFF + + conductor_utils.node_power_action(task, states.POWER_ON) + + node.refresh() + get_power_mock.assert_called_once_with(mock.ANY, mock.ANY) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(node['target_power_state']) + self.assertEqual('failed before', node['last_error']) + @mock.patch('ironic.objects.node.NodeSetPowerStateNotification', autospec=True) @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) @@ -282,6 +304,31 @@ class NodePowerActionTestCase(db_base.DbTestCase): node['driver_internal_info']) @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_off_already(self, get_power_mock): + """Test node_power_action to turn node power off, but already off.""" + dii = {'agent_secret_token': 'token', + 'agent_cached_deploy_steps': ['steps']} + node = obj_utils.create_test_node(self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON, + driver_internal_info=dii) + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.return_value = states.POWER_OFF + + 
conductor_utils.node_power_action(task, states.POWER_OFF) + + node.refresh() + get_power_mock.assert_called_once_with(mock.ANY, mock.ANY) + self.assertEqual(states.POWER_OFF, node['power_state']) + self.assertIsNone(node['target_power_state']) + self.assertIsNone(node['last_error']) + self.assertNotIn('agent_secret_token', node['driver_internal_info']) + self.assertNotIn('agent_cached_deploy_steps', + node['driver_internal_info']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) def test_node_power_action_power_off_pregenerated_token(self, get_power_mock): dii = {'agent_secret_token': 'token', @@ -1150,6 +1197,9 @@ class ErrorHandlersTestCase(db_base.DbTestCase): self.node.set_driver_internal_info('skip_current_clean_step', True) self.node.set_driver_internal_info('clean_step_index', 0) self.node.set_driver_internal_info('agent_url', 'url') + self.node.set_driver_internal_info('agent_secret_token', 'foo') + self.node.set_driver_internal_info('agent_secret_token_pregenerated', + False) msg = 'error bar' last_error = "last error" @@ -1162,6 +1212,9 @@ class ErrorHandlersTestCase(db_base.DbTestCase): self.assertNotIn('cleaning_polling', self.node.driver_internal_info) self.assertNotIn('skip_current_clean_step', self.node.driver_internal_info) + self.assertNotIn('agent_secret_token', self.node.driver_internal_info) + self.assertNotIn('agent_secret_token_pregenerated', + self.node.driver_internal_info) self.assertEqual(last_error, self.node.last_error) self.assertTrue(self.node.maintenance) self.assertEqual(last_error, self.node.maintenance_reason) diff --git a/ironic/tests/unit/db/test_api.py b/ironic/tests/unit/db/test_api.py index 6142fdfae..2396b1253 100644 --- a/ironic/tests/unit/db/test_api.py +++ b/ironic/tests/unit/db/test_api.py @@ -226,6 +226,11 @@ class UpdateToLatestVersionsTestCase(base.DbTestCase): for i in range(0, num_nodes): node = utils.create_test_node(version=version, uuid=uuidutils.generate_uuid()) + # Create entries on the 
tables so we force field upgrades + utils.create_test_node_trait(node_id=node.id, trait='foo', + version='0.0') + utils.create_test_bios_setting(node_id=node.id, version='1.0') + nodes.append(node.uuid) for uuid in nodes: node = self.dbapi.get_node_by_uuid(uuid) @@ -238,10 +243,15 @@ class UpdateToLatestVersionsTestCase(base.DbTestCase): return nodes = self._create_nodes(5) + # Check/migrate 2, 10 remain. + self.assertEqual( + (10, 2), self.dbapi.update_to_latest_versions(self.context, 2)) + # Check/migrate 10, 8 migrated, 8 remain. self.assertEqual( - (5, 2), self.dbapi.update_to_latest_versions(self.context, 2)) + (8, 8), self.dbapi.update_to_latest_versions(self.context, 10)) + # Just make sure it is still 0, 0 in case more things are added. self.assertEqual( - (3, 3), self.dbapi.update_to_latest_versions(self.context, 10)) + (0, 0), self.dbapi.update_to_latest_versions(self.context, 10)) for uuid in nodes: node = self.dbapi.get_node_by_uuid(uuid) self.assertEqual(self.node_ver, node.version) @@ -250,10 +260,19 @@ class UpdateToLatestVersionsTestCase(base.DbTestCase): if self.node_version_same: # can't test if we don't have diff versions of the node return - - nodes = self._create_nodes(5) + vm_count = 5 + nodes = self._create_nodes(vm_count) + # NOTE(TheJulia): Under current testing, 5 node will result in 10 + # records implicitly needing to be migrated. 
+ migrate_count = vm_count * 2 + self.assertEqual( + (migrate_count, migrate_count), + self.dbapi.update_to_latest_versions(self.context, + migrate_count)) self.assertEqual( - (5, 5), self.dbapi.update_to_latest_versions(self.context, 5)) + (0, 0), self.dbapi.update_to_latest_versions(self.context, + migrate_count)) + for uuid in nodes: node = self.dbapi.get_node_by_uuid(uuid) self.assertEqual(self.node_ver, node.version) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 1177e9743..7220697cb 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1940,7 +1940,7 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): self.node.save() self.checksum_mock = self.useFixture(fixtures.MockPatchObject( - fileutils, 'compute_file_checksum')).mock + fileutils, 'compute_file_checksum', autospec=True)).mock self.checksum_mock.return_value = 'fake-checksum' self.cache_image_mock = self.useFixture(fixtures.MockPatchObject( utils, 'cache_instance_image', autospec=True)).mock @@ -2012,9 +2012,25 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): image_info=self.image_info, expect_raw=True) self.assertIsNone(instance_info['image_checksum']) + self.assertEqual(instance_info['image_os_hash_algo'], 'sha512') + self.assertEqual(instance_info['image_os_hash_value'], + 'fake-checksum') self.assertEqual(instance_info['image_disk_format'], 'raw') - calls = [mock.call(image_path, algorithm='sha512')] - self.checksum_mock.assert_has_calls(calls) + self.checksum_mock.assert_called_once_with(image_path, + algorithm='sha512') + + def test_build_instance_info_already_raw(self): + cfg.CONF.set_override('force_raw_images', True) + self.image_info['disk_format'] = 'raw' + image_path, instance_info = self._test_build_instance_info( + image_info=self.image_info, expect_raw=True) + + 
self.assertEqual(instance_info['image_checksum'], 'aa') + self.assertEqual(instance_info['image_os_hash_algo'], 'sha512') + self.assertEqual(instance_info['image_os_hash_value'], + 'fake-sha512') + self.assertEqual(instance_info['image_disk_format'], 'raw') + self.checksum_mock.assert_not_called() def test_build_instance_info_force_raw_drops_md5(self): cfg.CONF.set_override('force_raw_images', True) @@ -2027,6 +2043,17 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): calls = [mock.call(image_path, algorithm='sha256')] self.checksum_mock.assert_has_calls(calls) + def test_build_instance_info_already_raw_keeps_md5(self): + cfg.CONF.set_override('force_raw_images', True) + self.image_info['os_hash_algo'] = 'md5' + self.image_info['disk_format'] = 'raw' + image_path, instance_info = self._test_build_instance_info( + image_info=self.image_info, expect_raw=True) + + self.assertEqual(instance_info['image_checksum'], 'aa') + self.assertEqual(instance_info['image_disk_format'], 'raw') + self.checksum_mock.assert_not_called() + @mock.patch.object(image_service.HttpImageService, 'validate_href', autospec=True) def test_build_instance_info_file_image(self, validate_href_mock): @@ -2035,7 +2062,6 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): i_info['image_source'] = 'file://image-ref' i_info['image_checksum'] = 'aa' i_info['root_gb'] = 10 - i_info['image_checksum'] = 'aa' driver_internal_info['is_whole_disk_image'] = True self.node.instance_info = i_info self.node.driver_internal_info = driver_internal_info @@ -2052,6 +2078,7 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): self.assertEqual(expected_url, info['image_url']) self.assertEqual('sha256', info['image_os_hash_algo']) self.assertEqual('fake-checksum', info['image_os_hash_value']) + self.assertEqual('raw', info['image_disk_format']) self.cache_image_mock.assert_called_once_with( task.context, task.node, force_raw=True) 
self.checksum_mock.assert_called_once_with( @@ -2068,7 +2095,6 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): i_info['image_source'] = 'http://image-ref' i_info['image_checksum'] = 'aa' i_info['root_gb'] = 10 - i_info['image_checksum'] = 'aa' driver_internal_info['is_whole_disk_image'] = True self.node.instance_info = i_info self.node.driver_internal_info = driver_internal_info @@ -2102,7 +2128,6 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): i_info['image_source'] = 'http://image-ref' i_info['image_checksum'] = 'aa' i_info['root_gb'] = 10 - i_info['image_checksum'] = 'aa' i_info['image_download_source'] = 'local' driver_internal_info['is_whole_disk_image'] = True self.node.instance_info = i_info @@ -2138,7 +2163,6 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): i_info['image_source'] = 'http://image-ref' i_info['image_checksum'] = 'aa' i_info['root_gb'] = 10 - i_info['image_checksum'] = 'aa' d_info['image_download_source'] = 'local' driver_internal_info['is_whole_disk_image'] = True self.node.instance_info = i_info @@ -2164,6 +2188,41 @@ class TestBuildInstanceInfoForHttpProvisioning(db_base.DbTestCase): validate_href_mock.assert_called_once_with( mock.ANY, expected_url, False) + @mock.patch.object(image_service.HttpImageService, 'validate_href', + autospec=True) + def test_build_instance_info_local_image_already_raw(self, + validate_href_mock): + cfg.CONF.set_override('image_download_source', 'local', group='agent') + i_info = self.node.instance_info + driver_internal_info = self.node.driver_internal_info + i_info['image_source'] = 'http://image-ref' + i_info['image_checksum'] = 'aa' + i_info['root_gb'] = 10 + i_info['image_disk_format'] = 'raw' + driver_internal_info['is_whole_disk_image'] = True + self.node.instance_info = i_info + self.node.driver_internal_info = driver_internal_info + self.node.save() + + expected_url = ( + 'http://172.172.24.10:8080/agent_images/%s' % self.node.uuid) + + 
with task_manager.acquire(
+                self.context, self.node.uuid, shared=False) as task:
+
+            info = utils.build_instance_info_for_deploy(task)
+
+            self.assertEqual(expected_url, info['image_url'])
+            self.assertEqual('aa', info['image_checksum'])
+            self.assertEqual('raw', info['image_disk_format'])
+            self.assertIsNone(info['image_os_hash_algo'])
+            self.assertIsNone(info['image_os_hash_value'])
+            self.cache_image_mock.assert_called_once_with(
+                task.context, task.node, force_raw=True)
+            self.checksum_mock.assert_not_called()
+            validate_href_mock.assert_called_once_with(
+                mock.ANY, expected_url, False)
+
 
 class TestStorageInterfaceUtils(db_base.DbTestCase):
     def setUp(self):
diff --git a/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
new file mode 100644
index 000000000..270278f1b
--- /dev/null
+++ b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    When aborting cleaning, the ``last_error`` field is no longer initially
+    empty. It is now populated on the state transition to ``clean failed``.
+  - |
+    When cleaning or deployment fails, the ``last_error`` field is no longer
+    temporarily set to ``None`` while the power off action is running.
diff --git a/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml b/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml
new file mode 100644
index 000000000..caac13dd4
--- /dev/null
+++ b/releasenotes/notes/cross-link-1ffd1a4958f14fd7.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes ``Invalid cross-device link`` in some cases when using ``file://``
+    image URLs.
diff --git a/releasenotes/notes/fix-online-version-migration-db432a7b239647fa.yaml b/releasenotes/notes/fix-online-version-migration-db432a7b239647fa.yaml
new file mode 100644
index 000000000..824185aab
--- /dev/null
+++ b/releasenotes/notes/fix-online-version-migration-db432a7b239647fa.yaml
@@ -0,0 +1,14 @@
+---
+fixes:
+  - |
+    Fixes an issue in the online upgrade logic where database models for
+    Node Traits and BIOS Settings resulted in an error when performing
+    the online data migration. This was because these tables were originally
+    created as extensions of the Nodes database table, and the schema
+    of the database was slightly different enough to result in an error
+    if there was data to migrate in these tables upon upgrade,
+    which would have occurred if an early BIOS Setting adopter had
+    data in the database prior to upgrading to the Yoga release of Ironic.
+
+    The online upgrade parameter now substitutes an alternate primary key
+    name when applicable.
diff --git a/releasenotes/notes/fix-power-off-token-wipe-e7d605997f00d39d.yaml b/releasenotes/notes/fix-power-off-token-wipe-e7d605997f00d39d.yaml
new file mode 100644
index 000000000..14a489b46
--- /dev/null
+++ b/releasenotes/notes/fix-power-off-token-wipe-e7d605997f00d39d.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixes an issue where an agent token could be inadvertently orphaned
+    if a node is already in the target power state when we attempt to turn
+    the node off.
diff --git a/releasenotes/notes/no-recalculate-653e524fd6160e72.yaml b/releasenotes/notes/no-recalculate-653e524fd6160e72.yaml
new file mode 100644
index 000000000..3d2e6dad4
--- /dev/null
+++ b/releasenotes/notes/no-recalculate-653e524fd6160e72.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    No longer re-calculates checksums for images that are already raw.
+    Previously, it would cause significant delays in deploying raw images.
diff --git a/releasenotes/notes/wipe-agent-token-upon-cleaning-timeout-c9add514fad1b02c.yaml b/releasenotes/notes/wipe-agent-token-upon-cleaning-timeout-c9add514fad1b02c.yaml
new file mode 100644
index 000000000..0aa828ccd
--- /dev/null
+++ b/releasenotes/notes/wipe-agent-token-upon-cleaning-timeout-c9add514fad1b02c.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue where an agent token was being orphaned if a baremetal node
+    timed out during cleaning operations, leading to issues where the node
+    would not be able to establish a new token with Ironic upon future
+    operations in some cases. We now always wipe the token in this case.