diff options
-rw-r--r-- | doc/source/deploy/cleaning.rst | 37 | ||||
-rw-r--r-- | ironic/common/glance_service/base_image_service.py | 31 | ||||
-rw-r--r-- | ironic/common/pxe_utils.py | 33 | ||||
-rw-r--r-- | ironic/drivers/modules/boot.ipxe | 2 | ||||
-rw-r--r-- | ironic/drivers/modules/ipmitool.py | 72 | ||||
-rw-r--r-- | ironic/tests/drivers/test_ipmitool.py | 80 | ||||
-rw-r--r-- | ironic/tests/stubs.py | 12 | ||||
-rw-r--r-- | ironic/tests/test_glance_service.py | 17 | ||||
-rw-r--r-- | ironic/tests/test_pxe_utils.py | 37 |
9 files changed, 252 insertions, 69 deletions
diff --git a/doc/source/deploy/cleaning.rst b/doc/source/deploy/cleaning.rst index b19b88a1b..8897bf46b 100644 --- a/doc/source/deploy/cleaning.rst +++ b/doc/source/deploy/cleaning.rst @@ -13,9 +13,14 @@ the tenant will get a consistent baremetal node deployed every time. Ironic implements cleaning by collecting a list of steps to perform on a node from each Power, Deploy, and Management driver assigned to the node. These steps are then arranged by priority and executed on the node when it is moved -to CLEANING state, if cleaning is enabled. +to cleaning state, if cleaning is enabled. -Ironic added support for cleaning used nodes in the Kilo release. +Typically, nodes move to cleaning state when moving from active -> available. +Nodes also traverse cleaning when going from manageable -> available. For a +full understanding of all state transitions into cleaning, please see +:ref:`states`. + +Ironic added support for cleaning nodes in the Kilo release. Enabling Cleaning @@ -41,8 +46,8 @@ In-band steps are performed by Ironic making API calls to a ramdisk running on the node using a Deploy driver. Currently, only the ironic-python-agent ramdisk used with an agent_* driver supports in-band cleaning. By default, ironic-python-agent ships with a minimal cleaning configuration, only erasing -disks. However, with this ramdisk, you can add your own clean_steps and/or -override default clean_steps with a custom Hardware Manager. +disks. However, with this ramdisk, you can add your own cleaning steps and/or +override default cleaning steps with a custom Hardware Manager. There is currently no support for in-band cleaning using the Ironic pxe ramdisk. @@ -81,12 +86,14 @@ to disable erase_devices, you'd use the following config:: agent_erase_devices_priority=0 -What clean_step is running? ---------------------------- -To check what clean_step the node is performing or attempted to perform and +What cleaning step is running? +------------------------------ +To check what cleaning step the node is performing or attempted to perform and failed, either query the node endpoint for the node or run ``ironic node-show -$node_ident`` and look at the 'clean_step' field. This will tell you which -step for which driver is or was (if in CLEANFAIL state) being executed. +$node_ident`` and look in the `internal_driver_info` field. The `clean_steps` +field will contain a list of all remaining steps with their priority, and the +first one listed is the step currently in progress or that the node failed +before going into cleanfail state. Should I disable cleaning? -------------------------- @@ -107,17 +114,17 @@ cleaning. Troubleshooting =============== -If cleaning fails on a node, the node will be put into CLEANFAIL state and +If cleaning fails on a node, the node will be put into cleanfail state and placed in maintenance mode, to prevent Ironic from taking actions on the node. -Nodes in CLEANFAIL will not be powered off, as the node might be in a state +Nodes in cleanfail will not be powered off, as the node might be in a state such that powering it off could damage the node or remove useful information about the nature of the cleaning failure. -A CLEANFAIL node can be moved to MANAGEABLE state, where they cannot be +A cleanfail node can be moved to manageable state, where they cannot be scheduled by Nova and you can safely attempt to fix the node. To move a node -from CLEANFAIL to MANAGEABLE: ``ironic node-set-provision-state manage``. +from cleanfail to manageable: ``ironic node-set-provision-state manage``. You can now take actions on the node, such as replacing a bad disk drive. Strategies for determining why a cleaning step failed include checking the @@ -125,7 +132,7 @@ Ironic conductor logs, viewing logs on the still-running ironic-python-agent (if an in-band step failed), or performing general hardware troubleshooting on the node. -When the node is repaired, you can move the node back to AVAILABLE state, to +When the node is repaired, you can move the node back to available state, to allow it to be scheduled by Nova. :: @@ -136,5 +143,5 @@ allow it to be scheduled by Nova. # Now, make the node available for scheduling by Nova ironic node-set-provision-state $node_ident provide -The node will begin cleaning from the start, and move to AVAILABLE state +The node will begin cleaning from the start, and move to available state when complete. diff --git a/ironic/common/glance_service/base_image_service.py b/ironic/common/glance_service/base_image_service.py index 1b646d587..9a97cbbf0 100644 --- a/ironic/common/glance_service/base_image_service.py +++ b/ironic/common/glance_service/base_image_service.py @@ -22,6 +22,7 @@ import sys import time from glanceclient import client +from glanceclient import exc as glance_exc from oslo_config import cfg import sendfile import six.moves.urllib.parse as urlparse @@ -36,23 +37,23 @@ CONF = cfg.CONF def _translate_image_exception(image_id, exc_value): - if isinstance(exc_value, (exception.Forbidden, - exception.Unauthorized)): + if isinstance(exc_value, (glance_exc.Forbidden, + glance_exc.Unauthorized)): return exception.ImageNotAuthorized(image_id=image_id) - if isinstance(exc_value, exception.NotFound): + if isinstance(exc_value, glance_exc.NotFound): return exception.ImageNotFound(image_id=image_id) - if isinstance(exc_value, exception.BadRequest): + if isinstance(exc_value, glance_exc.BadRequest): return exception.Invalid(exc_value) return exc_value def _translate_plain_exception(exc_value): - if isinstance(exc_value, (exception.Forbidden, - exception.Unauthorized)): + if isinstance(exc_value, (glance_exc.Forbidden, + glance_exc.Unauthorized)): return exception.NotAuthorized(exc_value) - if isinstance(exc_value, exception.NotFound): + if isinstance(exc_value, glance_exc.NotFound): return exception.NotFound(exc_value) - if isinstance(exc_value, exception.BadRequest): + if isinstance(exc_value, glance_exc.BadRequest): return exception.Invalid(exc_value) return exc_value @@ -109,13 +110,13 @@ class BaseImageService(object): :raises: GlanceConnectionFailed """ - retry_excs = (exception.ServiceUnavailable, - exception.InvalidEndpoint, - exception.CommunicationError) - image_excs = (exception.Forbidden, - exception.Unauthorized, - exception.NotFound, - exception.BadRequest) + retry_excs = (glance_exc.ServiceUnavailable, + glance_exc.InvalidEndpoint, + glance_exc.CommunicationError) + image_excs = (glance_exc.Forbidden, + glance_exc.Unauthorized, + glance_exc.NotFound, + glance_exc.BadRequest) num_attempts = 1 + CONF.glance.glance_num_retries for attempt in range(1, num_attempts + 1): diff --git a/ironic/common/pxe_utils.py b/ironic/common/pxe_utils.py index a125da034..09528a218 100644 --- a/ironic/common/pxe_utils.py +++ b/ironic/common/pxe_utils.py @@ -80,12 +80,20 @@ def _link_mac_pxe_configs(task): :param task: A TaskManager instance. """ - pxe_config_file_path = get_pxe_config_file_path(task.node.uuid) - for mac in driver_utils.get_node_mac_addresses(task): - mac_path = _get_pxe_mac_path(mac) + + def create_link(mac_path): utils.unlink_without_raise(mac_path) utils.create_link_without_raise(pxe_config_file_path, mac_path) + pxe_config_file_path = get_pxe_config_file_path(task.node.uuid) + for mac in driver_utils.get_node_mac_addresses(task): + create_link(_get_pxe_mac_path(mac)) + # TODO(lucasagomes): Backward compatibility with :hexraw, + # to be removed in M. + # see: https://bugs.launchpad.net/ironic/+bug/1441710 + if CONF.pxe.ipxe_enabled: + create_link(_get_pxe_mac_path(mac, delimiter='')) + def _link_ip_address_pxe_configs(task): """Link each IP address with the PXE configuration file. @@ -110,17 +118,20 @@ def _link_ip_address_pxe_configs(task): ip_address_path) -def _get_pxe_mac_path(mac): +def _get_pxe_mac_path(mac, delimiter=None): """Convert a MAC address into a PXE config file name. :param mac: A MAC address string in the format xx:xx:xx:xx:xx:xx. + :param delimiter: The MAC address delimiter. Defaults to dash ('-'). :returns: the path to the config file. """ - if CONF.pxe.ipxe_enabled: - mac_file_name = mac.replace(':', '').lower() - else: - mac_file_name = "01-" + mac.replace(":", "-").lower() + if delimiter is None: + delimiter = '-' + + mac_file_name = mac.replace(':', delimiter).lower() + if not CONF.pxe.ipxe_enabled: + mac_file_name = '01-' + mac_file_name return os.path.join(get_root_dir(), PXE_CFG_DIR_NAME, mac_file_name) @@ -221,6 +232,12 @@ def clean_up_pxe_config(task): else: for mac in driver_utils.get_node_mac_addresses(task): utils.unlink_without_raise(_get_pxe_mac_path(mac)) + # TODO(lucasagomes): Backward compatibility with :hexraw, + # to be removed in M. + # see: https://bugs.launchpad.net/ironic/+bug/1441710 + if CONF.pxe.ipxe_enabled: + utils.unlink_without_raise(_get_pxe_mac_path(mac, + delimiter='')) utils.rmtree_without_raise(os.path.join(get_root_dir(), task.node.uuid)) diff --git a/ironic/drivers/modules/boot.ipxe b/ironic/drivers/modules/boot.ipxe index 25a0ea8dc..3567dc029 100644 --- a/ironic/drivers/modules/boot.ipxe +++ b/ironic/drivers/modules/boot.ipxe @@ -1,7 +1,7 @@ #!ipxe # load the MAC-specific file or fail if it's not found -chain --autofree pxelinux.cfg/${mac:hexraw} || goto error_no_config +chain --autofree pxelinux.cfg/${mac:hexhyp} || goto error_no_config :error_no_config echo PXE boot failed. No configuration found for MAC ${mac} diff --git a/ironic/drivers/modules/ipmitool.py b/ironic/drivers/modules/ipmitool.py index 2826dd1e7..55fc1fa2d 100644 --- a/ironic/drivers/modules/ipmitool.py +++ b/ironic/drivers/modules/ipmitool.py @@ -111,6 +111,9 @@ ipmitool_command_options = { 'dual_bridge': ['ipmitool', '-m', '0', '-b', '0', '-t', '0', '-B', '0', '-T', '0', '-h']} +# Note(TheJulia): This string is hardcoded in ipmitool's lanplus driver. +IPMITOOL_RETRYABLE_FAILURES = ['insufficient resources for session'] + def _check_option_support(options): """Checks if the specific ipmitool options are supported on host. @@ -335,32 +338,68 @@ def _exec_ipmitool(driver_info, command): args.append(driver_info[name]) # specify retry timing more precisely, if supported + num_tries = max( + (CONF.ipmi.retry_timeout // CONF.ipmi.min_command_interval), 1) + if _is_option_supported('timing'): - num_tries = max( - (CONF.ipmi.retry_timeout // CONF.ipmi.min_command_interval), 1) args.append('-R') args.append(str(num_tries)) args.append('-N') args.append(str(CONF.ipmi.min_command_interval)) - # 'ipmitool' command will prompt password if there is no '-f' option, - # we set it to '\0' to write a password file to support empty password - with _make_password_file(driver_info['password'] or '\0') as pw_file: - args.append('-f') - args.append(pw_file) - args.extend(command.split(" ")) + end_time = (_time() + CONF.ipmi.retry_timeout) + + while True: + num_tries = num_tries - 1 # NOTE(deva): ensure that no communications are sent to a BMC more # often than once every min_command_interval seconds. time_till_next_poll = CONF.ipmi.min_command_interval - ( - time.time() - LAST_CMD_TIME.get(driver_info['address'], 0)) + _time() - LAST_CMD_TIME.get(driver_info['address'], 0)) if time_till_next_poll > 0: time.sleep(time_till_next_poll) - try: - out, err = utils.execute(*args) - finally: - LAST_CMD_TIME[driver_info['address']] = time.time() - return out, err + # Resetting the list that will be utilized so the password arguments + # from any previous execution are preserved. + cmd_args = args[:] + # 'ipmitool' command will prompt password if there is no '-f' + # option, we set it to '\0' to write a password file to support + # empty password + with _make_password_file( + driver_info['password'] or '\0' + ) as pw_file: + cmd_args.append('-f') + cmd_args.append(pw_file) + cmd_args.extend(command.split(" ")) + try: + out, err = utils.execute(*cmd_args) + return out, err + except processutils.ProcessExecutionError as e: + with excutils.save_and_reraise_exception() as ctxt: + err_list = [x for x in IPMITOOL_RETRYABLE_FAILURES + if x in e.message] + if ((_time() > end_time) or + (num_tries == 0) or + not err_list): + LOG.error(_LE('IPMI Error attempting to execute ' + '"%(cmd)s" for node %(node)s. ' + 'Error: %(error)s'), + { + 'node': driver_info['uuid'], + 'cmd': e.cmd, + 'error': e + }) + else: + ctxt.reraise = False + LOG.warning(_LW('IPMI Error encountered, retrying ' + '"%(cmd)s" for node %(node)s ' + 'Error: %(error)s'), + { + 'node': driver_info['uuid'], + 'cmd': e.cmd, + 'error': e + }) + finally: + LAST_CMD_TIME[driver_info['address']] = _time() def _sleep_time(iter): @@ -587,6 +626,11 @@ def send_raw(task, raw_bytes): raise exception.IPMIFailure(cmd=cmd) +def _time(): + """Wrapper for time.time() enabling simplified unit testing.""" + return time.time() + + class IPMIPower(base.PowerInterface): def __init__(self): diff --git a/ironic/tests/drivers/test_ipmitool.py b/ironic/tests/drivers/test_ipmitool.py index a27060d0f..0828f2c60 100644 --- a/ironic/tests/drivers/test_ipmitool.py +++ b/ironic/tests/drivers/test_ipmitool.py @@ -840,6 +840,86 @@ class IPMIToolPrivateMethodTestCase(db_base.DbTestCase): mock_support.assert_called_once_with('timing') mock_pwf.assert_called_once_with(self.info['password']) mock_exec.assert_called_once_with(*args) + self.assertEqual(1, mock_exec.call_count) + + @mock.patch.object(ipmi, '_is_option_supported', autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + def test__exec_ipmitool_exception_retry(self, + mock_exec, mock_support, mock_sleep): + + ipmi.LAST_CMD_TIME = {} + mock_support.return_value = False + mock_exec.side_effect = iter([ + processutils.ProcessExecutionError( + stderr="insufficient resources for session" + ), + (None, None) + ]) + + # Directly set the configuration values such that + # the logic will cause _exec_ipmitool to retry twice. + self.config(min_command_interval=1, group='ipmi') + self.config(retry_timeout=2, group='ipmi') + + ipmi._exec_ipmitool(self.info, 'A B C') + + mock_support.assert_called_once_with('timing') + self.assertEqual(2, mock_exec.call_count) + + @mock.patch.object(ipmi, '_is_option_supported', autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + def test__exec_ipmitool_exception_retries_exceeded(self, + mock_exec, mock_support, mock_sleep): + + ipmi.LAST_CMD_TIME = {} + mock_support.return_value = False + + mock_exec.side_effect = processutils.ProcessExecutionError( + stderr="insufficient resources for session" + ) + + # Directly set the configuration values such that + # the logic will cause _exec_ipmitool to timeout. + self.config(min_command_interval=1, group='ipmi') + self.config(retry_timeout=1, group='ipmi') + + self.assertRaises(processutils.ProcessExecutionError, + ipmi._exec_ipmitool, + self.info, 'A B C') + mock_support.assert_called_once_with('timing') + self.assertEqual(1, mock_exec.call_count) + + @mock.patch.object(ipmi, '_is_option_supported', autospec=True) + @mock.patch.object(utils, 'execute', autospec=True) + def test__exec_ipmitool_exception_retry_failure_unhandable(self, + mock_exec, mock_support, mock_sleep): + + ipmi.LAST_CMD_TIME = {} + mock_support.return_value = False + + # Return a retryable error, then a error that cannot + # be retryable thus resulting in a single retry + # attempt by _exec_ipmitool that is successful. + mock_exec.side_effect = iter([ + processutils.ProcessExecutionError( + stderr="insufficient resources for session" + ), + processutils.ProcessExecutionError( + "Unknown" + ), + ]) + + # Directly set the configuration values such that + # the logic will cause _exec_ipmitool to retry up + # to 3 times. + self.config(min_command_interval=1, group='ipmi') + self.config(retry_timeout=3, group='ipmi') + + self.assertRaises(processutils.ProcessExecutionError, + ipmi._exec_ipmitool, + self.info, 'A B C') + mock_support.assert_called_once_with('timing') + self.assertEqual(2, mock_exec.call_count) @mock.patch.object(ipmi, '_exec_ipmitool', autospec=True) def test__power_status_on(self, mock_exec, mock_sleep): diff --git a/ironic/tests/stubs.py b/ironic/tests/stubs.py index 7d43d2676..d20c1fd8a 100644 --- a/ironic/tests/stubs.py +++ b/ironic/tests/stubs.py @@ -12,7 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. -from ironic.common import exception +from glanceclient import exc as glance_exc NOW_GLANCE_FORMAT = "2010-10-11T10:30:22" @@ -40,7 +40,7 @@ class StubGlanceClient(object): index += 1 break else: - raise exception.BadRequest('Marker not found') + raise glance_exc.BadRequest('Marker not found') return self._images[index:index + limit] @@ -48,7 +48,7 @@ class StubGlanceClient(object): for image in self._images: if image.id == str(image_id): return image - raise exception.ImageNotFound(image_id) + raise glance_exc.NotFound(image_id) def data(self, image_id): self.get(image_id) @@ -76,7 +76,7 @@ class StubGlanceClient(object): for k, v in metadata.items(): setattr(self._images[i], k, v) return self._images[i] - raise exception.NotFound(image_id) + raise glance_exc.NotFound(image_id) def delete(self, image_id): for i, image in enumerate(self._images): @@ -86,10 +86,10 @@ class StubGlanceClient(object): # HTTPForbidden. image_data = self._images[i] if image_data.deleted: - raise exception.Forbidden() + raise glance_exc.Forbidden() image_data.deleted = True return - raise exception.NotFound(image_id) + raise glance_exc.NotFound(image_id) class FakeImage(object): diff --git a/ironic/tests/test_glance_service.py b/ironic/tests/test_glance_service.py index 8482ed751..6c3276da6 100644 --- a/ironic/tests/test_glance_service.py +++ b/ironic/tests/test_glance_service.py @@ -20,8 +20,11 @@ import os import tempfile import time +from glanceclient import exc as glance_exc import mock +from oslo_config import cfg from oslo_context import context +from oslo_serialization import jsonutils import testtools @@ -33,8 +36,6 @@ from ironic.tests import base from ironic.tests import matchers from ironic.tests import stubs -from oslo_config import cfg -from oslo_serialization import jsonutils CONF = cfg.CONF @@ -468,7 +469,7 @@ class TestGlanceImageService(base.TestCase): def get(self, image_id): if tries[0] == 0: tries[0] = 1 - raise exception.ServiceUnavailable('') + raise glance_exc.ServiceUnavailable('') else: return {} @@ -536,7 +537,7 @@ class TestGlanceImageService(base.TestCase): class MyGlanceStubClient(stubs.StubGlanceClient): """A client that raises a Forbidden exception.""" def get(self, image_id): - raise exception.Forbidden(image_id) + raise glance_exc.Forbidden(image_id) stub_client = MyGlanceStubClient() stub_context = context.RequestContext(auth_token=True) @@ -552,7 +553,7 @@ class TestGlanceImageService(base.TestCase): class MyGlanceStubClient(stubs.StubGlanceClient): """A client that raises a HTTPForbidden exception.""" def get(self, image_id): - raise exception.HTTPForbidden(image_id) + raise glance_exc.HTTPForbidden(image_id) stub_client = MyGlanceStubClient() stub_context = context.RequestContext(auth_token=True) @@ -568,7 +569,7 @@ class TestGlanceImageService(base.TestCase): class MyGlanceStubClient(stubs.StubGlanceClient): """A client that raises a NotFound exception.""" def get(self, image_id): - raise exception.NotFound(image_id) + raise glance_exc.NotFound(image_id) stub_client = MyGlanceStubClient() stub_context = context.RequestContext(auth_token=True) @@ -584,7 +585,7 @@ class TestGlanceImageService(base.TestCase): class MyGlanceStubClient(stubs.StubGlanceClient): """A client that raises a HTTPNotFound exception.""" def get(self, image_id): - raise exception.HTTPNotFound(image_id) + raise glance_exc.HTTPNotFound(image_id) stub_client = MyGlanceStubClient() stub_context = context.RequestContext(auth_token=True) @@ -635,7 +636,7 @@ def _create_failing_glance_client(info): def get(self, image_id): info['num_calls'] += 1 if info['num_calls'] == 1: - raise exception.ServiceUnavailable('') + raise glance_exc.ServiceUnavailable('') return {} return MyGlanceStubClient() diff --git a/ironic/tests/test_pxe_utils.py b/ironic/tests/test_pxe_utils.py index f16cd3e1b..1792d840a 100644 --- a/ironic/tests/test_pxe_utils.py +++ b/ironic/tests/test_pxe_utils.py @@ -144,7 +144,40 @@ class TestPXEUtils(db_base.DbTestCase): ] unlink_calls = [ mock.call('/tftpboot/pxelinux.cfg/01-00-11-22-33-44-55-66'), - mock.call('/tftpboot/pxelinux.cfg/01-00-11-22-33-44-55-67') + mock.call('/tftpboot/pxelinux.cfg/01-00-11-22-33-44-55-67'), + ] + with task_manager.acquire(self.context, self.node.uuid) as task: + pxe_utils._link_mac_pxe_configs(task) + + unlink_mock.assert_has_calls(unlink_calls) + create_link_mock.assert_has_calls(create_link_calls) + + @mock.patch('ironic.common.utils.create_link_without_raise', autospec=True) + @mock.patch('ironic.common.utils.unlink_without_raise', autospec=True) + @mock.patch('ironic.drivers.utils.get_node_mac_addresses', autospec=True) + def test__write_mac_ipxe_configs(self, get_macs_mock, unlink_mock, + create_link_mock): + self.config(ipxe_enabled=True, group='pxe') + macs = [ + '00:11:22:33:44:55:66', + '00:11:22:33:44:55:67' + ] + get_macs_mock.return_value = macs + create_link_calls = [ + mock.call(u'/httpboot/1be26c0b-03f2-4d2e-ae87-c02d7f33c123/config', + '/httpboot/pxelinux.cfg/00-11-22-33-44-55-66'), + mock.call(u'/httpboot/1be26c0b-03f2-4d2e-ae87-c02d7f33c123/config', + '/httpboot/pxelinux.cfg/00112233445566'), + mock.call(u'/httpboot/1be26c0b-03f2-4d2e-ae87-c02d7f33c123/config', + '/httpboot/pxelinux.cfg/00-11-22-33-44-55-67'), + mock.call(u'/httpboot/1be26c0b-03f2-4d2e-ae87-c02d7f33c123/config', + '/httpboot/pxelinux.cfg/00112233445567'), + ] + unlink_calls = [ + mock.call('/httpboot/pxelinux.cfg/00-11-22-33-44-55-66'), + mock.call('/httpboot/pxelinux.cfg/00112233445566'), + mock.call('/httpboot/pxelinux.cfg/00-11-22-33-44-55-67'), + mock.call('/httpboot/pxelinux.cfg/00112233445567'), ] with task_manager.acquire(self.context, self.node.uuid) as task: pxe_utils._link_mac_pxe_configs(task) @@ -218,7 +251,7 @@ class TestPXEUtils(db_base.DbTestCase): self.config(ipxe_enabled=True, group='pxe') self.config(http_root='/httpboot', group='pxe') mac = '00:11:22:33:AA:BB:CC' - self.assertEqual('/httpboot/pxelinux.cfg/00112233aabbcc', + self.assertEqual('/httpboot/pxelinux.cfg/00-11-22-33-aa-bb-cc', pxe_utils._get_pxe_mac_path(mac)) def test__get_pxe_ip_address_path(self): |