diff options
24 files changed, 634 insertions, 106 deletions
diff --git a/devstack/lib/ironic b/devstack/lib/ironic index ab96638c0..08cccce7a 100644 --- a/devstack/lib/ironic +++ b/devstack/lib/ironic @@ -1332,6 +1332,17 @@ function configure_ironic_networks { configure_ironic_cleaning_network echo_summary "Configuring Ironic rescue network" configure_ironic_rescue_network + echo_summary "Configuring Neutron Private Subnet, if needed." + configure_ironic_private_subnet +} + +function configure_ironic_private_subnet { + if [[ "${IRONIC_ANACONDA_IMAGE_REF:-}" != "" ]]; then + # NOTE(TheJulia): Anaconda needs DNS for FQDN resolution + # and devstack doesn't create this network with dns. + subnet_id=$(openstack --os-cloud $OS_CLOUD subnet show private-subnet -f value -c id) + openstack --os-cloud $OS_CLOUD subnet set --dns-nameserver 8.8.8.8 $subnet_id + fi } function configure_ironic_cleaning_network { @@ -1405,7 +1416,8 @@ function configure_ironic_provision_network { ${net_segment_id:+--network-segment $net_segment_id} \ $IRONIC_PROVISION_PROVIDER_SUBNET_NAME \ --gateway $IRONIC_PROVISION_SUBNET_GATEWAY --network $net_id \ - --subnet-range $IRONIC_PROVISION_SUBNET_PREFIX -f value -c id)" + --subnet-range $IRONIC_PROVISION_SUBNET_PREFIX \ + --dns-nameserver 8.8.8.8 -f value -c id)" else # NOTE(TheJulia): Consider changing this to stateful to support UEFI once we move # CI to Ubuntu Jammy as it will support v6 and v4 UEFI firmware driven boot ops. @@ -3057,6 +3069,16 @@ function upload_baremetal_ironic_deploy { iniset $IRONIC_CONF_FILE conductor deploy_ramdisk $IRONIC_DEPLOY_RAMDISK_ID iniset $IRONIC_CONF_FILE conductor rescue_kernel $IRONIC_DEPLOY_KERNEL_ID iniset $IRONIC_CONF_FILE conductor rescue_ramdisk $IRONIC_DEPLOY_RAMDISK_ID + + if [[ "${IRONIC_ANACONDA_INSECURE_HEARTBEAT:-}" != "" ]]; then + iniset $IRONIC_CONF_FILE anaconda insecure_heartbeat ${IRONIC_ANACONDA_INSECURE_HEARTBEAT:-} + fi + # NOTE(TheJulia): Compared to an image deploy, anaconda is relatively + # slow as it installs packages one at a time. 
As such, we need an option + # to extend. + if [[ "${IRONIC_DEPLOY_CALLBACK_WAIT_TIMEOUT:-}" != "" ]]; then + iniset $IRONIC_CONF_FILE conductor deploy_callback_timeout ${IRONIC_DEPLOY_CALLBACK_WAIT_TIMEOUT:-} + fi } function prepare_baremetal_basic_ops { @@ -3221,6 +3243,23 @@ function ironic_configure_tempest { if [[ "$IRONIC_RAMDISK_IMAGE" != "" ]]; then iniset $TEMPEST_CONFIG baremetal ramdisk_iso_image_ref "$IRONIC_RAMDISK_IMAGE" fi + if [[ "${IRONIC_ANACONDA_IMAGE_REF:-}" != "" ]]; then + # In a perfect world we would use *just* the opendev repo + # mirror, and let things be magical, but OpenDev Infra cannot + # mirror the /images path with the limited storage space. + iniset $TEMPEST_CONFIG baremetal anaconda_image_ref ${IRONIC_ANACONDA_IMAGE_REF:-} + fi + if [[ "${IRONIC_ANACONDA_KERNEL_REF:-}" != "" ]]; then + iniset $TEMPEST_CONFIG baremetal anaconda_kernel_ref ${IRONIC_ANACONDA_KERNEL_REF:-} + fi + if [[ "${IRONIC_ANACONDA_RAMDISK_REF:-}" != "" ]]; then + iniset $TEMPEST_CONFIG baremetal anaconda_initial_ramdisk_ref ${IRONIC_ANACONDA_RAMDISK_REF:-} + fi + if [[ "${IRONIC_ANACONDA_STAGE2_REF:-}" != "" ]]; then + iniset $TEMPEST_CONFIG baremetal anaconda_stage2_ramdisk_ref ${IRONIC_ANACONDA_STAGE2_REF:-} + + fi + # NOTE(dtantsur): keep this option here until the defaults change in # ironic-tempest-plugin to disable classic drivers testing. 
iniset $TEMPEST_CONFIG baremetal enabled_drivers "" diff --git a/doc/source/admin/drivers.rst b/doc/source/admin/drivers.rst index c3d8eb377..f35cb2dfa 100644 --- a/doc/source/admin/drivers.rst +++ b/doc/source/admin/drivers.rst @@ -26,6 +26,7 @@ Hardware Types drivers/redfish drivers/snmp drivers/xclarity + drivers/fake Changing Hardware Types and Interfaces -------------------------------------- diff --git a/doc/source/admin/drivers/fake.rst b/doc/source/admin/drivers/fake.rst new file mode 100644 index 000000000..ea7d7ef4c --- /dev/null +++ b/doc/source/admin/drivers/fake.rst @@ -0,0 +1,36 @@ +=========== +Fake driver +=========== + +Overview +======== + +The ``fake-hardware`` hardware type is what it claims to be: fake. Use of this +type or the ``fake`` interfaces should be temporary or limited to +non-production environments, as the ``fake`` interfaces do not perform any of +the actions typically expected. + +The ``fake`` interfaces can be configured to be combined with any of the +"real" hardware interfaces, allowing you to effectively disable one or more +hardware interfaces for testing by simply setting that interface to +``fake``. + +Use cases +========= + +Development +----------- +Developers can use ``fake-hardware`` hardware-type to mock out nodes for +testing without those nodes needing to exist with physical or virtual hardware. + +Adoption +-------- +Some OpenStack deployers have used ``fake`` interfaces in Ironic to allow an +adoption-style workflow with Nova. By setting a node's hardware interfaces to +``fake``, it's possible to deploy to that node with Nova without causing any +actual changes to the hardware or an OS already deployed on it. + +This is generally an unsupported use case, but it is possible. For more +information, see the relevant `post from CERN TechBlog`_. + +.. 
_`post from CERN TechBlog`: https://techblog.web.cern.ch/techblog/post/ironic-nova-adoption/ diff --git a/doc/source/admin/troubleshooting.rst b/doc/source/admin/troubleshooting.rst index fa04d3006..2791430fd 100644 --- a/doc/source/admin/troubleshooting.rst +++ b/doc/source/admin/troubleshooting.rst @@ -973,3 +973,40 @@ Unfortunately, due to the way the conductor is designed, it is not possible to gracefully break a stuck lock held in ``*-ing`` states. As the last resort, you may need to restart the affected conductor. See `Why are my nodes stuck in a "-ing" state?`_. + +What is ConcurrentActionLimit? +============================== + +ConcurrentActionLimit is an exception which is raised to clients when an +operation is requested, but cannot be serviced at that moment because the +overall threshold of nodes in concurrent "Deployment" or "Cleaning" +operations has been reached. + +These limits exist for two distinct reasons. + +The first is they allow an operator to tune a deployment such that too many +concurrent deployments cannot be triggered at any given time, as a single +conductor has an internal limit to the number of overall concurrent tasks, +this restricts only the number of running concurrent actions. As such, this +accounts for the number of nodes in ``deploy`` and ``deploy wait`` states. +In the case of deployments, the default value is relatively high and should +be suitable for *most* larger operators. + +The second is to help slow down the ability in which an entire population of +baremetal nodes can be moved into and through cleaning, in order to help +guard against authenticated malicious users, or accidental script driven +operations. In this case, the total number of nodes in ``deleting``, +``cleaning``, and ``clean wait`` are evaluated. The default maximum limit +for cleaning operations is *50* and should be suitable for the majority of +baremetal operators. 
+ +These settings can be modified by using the +``[conductor]max_concurrent_deploy`` and ``[conductor]max_concurrent_clean`` +settings from the ironic.conf file supporting the ``ironic-conductor`` +service. Neither setting can be explicity disabled, however there is also no +upper limit to the setting. + +.. note:: + This was an infrastructure operator requested feature from actual lessons + learned in the operation of Ironic in large scale production. The defaults + may not be suitable for the largest scale operators. diff --git a/ironic/common/exception.py b/ironic/common/exception.py index ddbce6f47..38316d2e7 100644 --- a/ironic/common/exception.py +++ b/ironic/common/exception.py @@ -851,3 +851,13 @@ class ImageRefIsARedirect(IronicException): message=msg, image_ref=image_ref, redirect_url=redirect_url) + + +class ConcurrentActionLimit(IronicException): + # NOTE(TheJulia): We explicitly don't report the concurrent + # action limit configuration value as a security guard since + # if informed of the limit, an attacker can tailor their attack. + _msg_fmt = _("Unable to process request at this time. " + "The concurrent action limit for %(task_type)s " + "has been reached. 
Please contact your administrator " + "and try again later.") diff --git a/ironic/common/pxe_utils.py b/ironic/common/pxe_utils.py index ec0719b75..33a24ecb6 100644 --- a/ironic/common/pxe_utils.py +++ b/ironic/common/pxe_utils.py @@ -674,20 +674,33 @@ def get_instance_image_info(task, ipxe_enabled=False): os.path.join(root_dir, node.uuid, 'boot_iso')) return image_info - image_properties = None d_info = deploy_utils.get_image_instance_info(node) + isap = node.driver_internal_info.get('is_source_a_path') def _get_image_properties(): - nonlocal image_properties - if not image_properties: + nonlocal image_properties, isap + if not image_properties and not isap: i_service = service.get_image_service( d_info['image_source'], context=ctx) image_properties = i_service.show( d_info['image_source'])['properties'] + # TODO(TheJulia): At some point, we should teach this code + # to understand that with a path, it *can* retrieve the + # manifest from the HTTP(S) endpoint, which can populate + # image_properties, and drive path to variable population + # like is done with basically Glance. labels = ('kernel', 'ramdisk') + if not isap: + anaconda_labels = ('stage2', 'ks_template', 'ks_cfg') + else: + # When a path is used, a stage2 ramdisk can be determiend + # automatically by anaconda, so it is not an explicit + # requirement. + anaconda_labels = ('ks_template', 'ks_cfg') + if not (i_info.get('kernel') and i_info.get('ramdisk')): # NOTE(rloo): If both are not specified in instance_info # we won't use any of them. We'll use the values specified @@ -700,20 +713,13 @@ def get_instance_image_info(task, ipxe_enabled=False): i_info[label] = str(image_properties[label + '_id']) node.instance_info = i_info node.save() + # TODO(TheJulia): Add functionality to look/grab the hints file + # for anaconda and just run with the entire path. 
- anaconda_labels = () - if deploy_utils.get_boot_option(node) == 'kickstart': - isap = node.driver_internal_info.get('is_source_a_path') # stage2: installer stage2 squashfs image # ks_template: anaconda kickstart template # ks_cfg - rendered ks_template - if not isap: - anaconda_labels = ('stage2', 'ks_template', 'ks_cfg') - else: - # When a path is used, a stage2 ramdisk can be determiend - # automatically by anaconda, so it is not an explicit - # requirement. - anaconda_labels = ('ks_template', 'ks_cfg') + # NOTE(rloo): We save stage2 & ks_template values in case they # are changed by the user after we start using them and to # prevent re-computing them again. @@ -733,26 +739,31 @@ def get_instance_image_info(task, ipxe_enabled=False): else: node.set_driver_internal_info( 'stage2', str(image_properties['stage2_id'])) - # NOTE(TheJulia): A kickstart template is entirely independent - # of the stage2 ramdisk. In the end, it was the configuration which - # told anaconda how to execute. - if i_info.get('ks_template'): - # If the value is set, we always overwrite it, in the event - # a rebuild is occuring or something along those lines. - node.set_driver_internal_info('ks_template', - i_info['ks_template']) + # NOTE(TheJulia): A kickstart template is entirely independent + # of the stage2 ramdisk. In the end, it was the configuration which + # told anaconda how to execute. + if i_info.get('ks_template'): + # If the value is set, we always overwrite it, in the event + # a rebuild is occuring or something along those lines. 
+ node.set_driver_internal_info('ks_template', + i_info['ks_template']) + else: + _get_image_properties() + # ks_template is an optional property on the image + if image_properties and 'ks_template' in image_properties: + node.set_driver_internal_info( + 'ks_template', str(image_properties['ks_template'])) else: - _get_image_properties() - # ks_template is an optional property on the image - if 'ks_template' not in image_properties: - # If not defined, default to the overall system default - # kickstart template, as opposed to a user supplied - # template. - node.set_driver_internal_info( - 'ks_template', CONF.anaconda.default_ks_template) - else: - node.set_driver_internal_info( - 'ks_template', str(image_properties['ks_template'])) + # If not defined, default to the overall system default + # kickstart template, as opposed to a user supplied + # template. + node.set_driver_internal_info( + 'ks_template', + 'file://' + os.path.abspath( + CONF.anaconda.default_ks_template + ) + ) + node.save() for label in labels + anaconda_labels: @@ -1253,6 +1264,8 @@ def cache_ramdisk_kernel(task, pxe_info, ipxe_enabled=False): CONF.deploy.http_root, 'stage2') ensure_tree(os.path.dirname(file_path)) + + if 'ks_cfg' in pxe_info: # ks_cfg is rendered later by the driver using ks_template. It cannot # be fetched and cached. 
t_pxe_info.pop('ks_cfg') diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index e3151d4b8..53d66ddd8 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -69,7 +69,7 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False): task.driver.power.validate(task) if not disable_ramdisk: task.driver.network.validate(task) - except exception.InvalidParameterValue as e: + except (exception.InvalidParameterValue, exception.NetworkError) as e: msg = (_('Validation of node %(node)s for cleaning failed: %(msg)s') % {'node': node.uuid, 'msg': e}) return utils.cleaning_error_handler(task, msg) diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 13d11d1d9..7e98459ff 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -886,7 +886,8 @@ class ConductorManager(base_manager.BaseConductorManager): exception.NodeInMaintenance, exception.InstanceDeployFailure, exception.InvalidStateRequested, - exception.NodeProtected) + exception.NodeProtected, + exception.ConcurrentActionLimit) def do_node_deploy(self, context, node_id, rebuild=False, configdrive=None, deploy_steps=None): """RPC method to initiate deployment to a node. @@ -910,8 +911,11 @@ class ConductorManager(base_manager.BaseConductorManager): :raises: InvalidStateRequested when the requested state is not a valid target from the current state. :raises: NodeProtected if the node is protected. + :raises: ConcurrentActionLimit if this action would exceed the maximum + number of configured concurrent actions of this type. 
""" LOG.debug("RPC do_node_deploy called for node %s.", node_id) + self._concurrent_action_limit(action='provisioning') event = 'rebuild' if rebuild else 'deploy' # NOTE(comstud): If the _sync_power_states() periodic task happens @@ -983,7 +987,8 @@ class ConductorManager(base_manager.BaseConductorManager): exception.NodeLocked, exception.InstanceDeployFailure, exception.InvalidStateRequested, - exception.NodeProtected) + exception.NodeProtected, + exception.ConcurrentActionLimit) def do_node_tear_down(self, context, node_id): """RPC method to tear down an existing node deployment. @@ -998,8 +1003,11 @@ class ConductorManager(base_manager.BaseConductorManager): :raises: InvalidStateRequested when the requested state is not a valid target from the current state. :raises: NodeProtected if the node is protected. + :raises: ConcurrentActionLimit if this action would exceed the maximum + number of configured concurrent actions of this type. """ LOG.debug("RPC do_node_tear_down called for node %s.", node_id) + self._concurrent_action_limit(action='unprovisioning') with task_manager.acquire(context, node_id, shared=False, purpose='node tear down') as task: @@ -1121,7 +1129,8 @@ class ConductorManager(base_manager.BaseConductorManager): exception.InvalidStateRequested, exception.NodeInMaintenance, exception.NodeLocked, - exception.NoFreeConductorWorker) + exception.NoFreeConductorWorker, + exception.ConcurrentActionLimit) def do_node_clean(self, context, node_id, clean_steps, disable_ramdisk=False): """RPC method to initiate manual cleaning. @@ -1150,7 +1159,10 @@ class ConductorManager(base_manager.BaseConductorManager): :raises: NodeLocked if node is locked by another conductor. :raises: NoFreeConductorWorker when there is no free worker to start async task. + :raises: ConcurrentActionLimit If this action would exceed the + configured limits of the deployment. 
""" + self._concurrent_action_limit(action='cleaning') with task_manager.acquire(context, node_id, shared=False, purpose='node manual cleaning') as task: node = task.node @@ -3549,6 +3561,40 @@ class ConductorManager(base_manager.BaseConductorManager): # impact DB access if done in excess. eventlet.sleep(0) + def _concurrent_action_limit(self, action): + """Check Concurrency limits and block operations if needed. + + This method is used to serve as a central place for the logic + for checks on concurrency limits. If a limit is reached, then + an appropriate exception is raised. + + :raises: ConcurrentActionLimit If the system configuration + is exceeded. + """ + # NOTE(TheJulia): Keeping this all in one place for simplicity. + if action == 'provisioning': + node_count = self.dbapi.count_nodes_in_provision_state([ + states.DEPLOYING, + states.DEPLOYWAIT + ]) + if node_count >= CONF.conductor.max_concurrent_deploy: + raise exception.ConcurrentActionLimit( + task_type=action) + + if action == 'unprovisioning' or action == 'cleaning': + # NOTE(TheJulia): This also checks for the deleting state + # which is super transitory, *but* you can get a node into + # the state. So in order to guard against a DoS attack, we + # need to check even the super transitory node state. + node_count = self.dbapi.count_nodes_in_provision_state([ + states.DELETING, + states.CLEANING, + states.CLEANWAIT + ]) + if node_count >= CONF.conductor.max_concurrent_clean: + raise exception.ConcurrentActionLimit( + task_type=action) + @METRICS.timer('get_vendor_passthru_metadata') def get_vendor_passthru_metadata(route_dict): diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index b1d6bae4f..2161b9434 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -358,6 +358,32 @@ opts = [ 'model. 
The conductor does *not* record this value ' 'otherwise, and this information is not backfilled ' 'for prior instances which have been deployed.')), + cfg.IntOpt('max_concurrent_deploy', + default=250, + min=1, + mutable=True, + help=_('The maximum number of concurrent nodes in deployment ' + 'which are permitted in this Ironic system. ' + 'If this limit is reached, new requests will be ' + 'rejected until the number of deployments in progress ' + 'is lower than this maximum. As this is a security ' + 'mechanism requests are not queued, and this setting ' + 'is a global setting applying to all requests this ' + 'conductor receives, regardless of access rights. ' + 'The concurrent deployment limit cannot be disabled.')), + cfg.IntOpt('max_concurrent_clean', + default=50, + min=1, + mutable=True, + help=_('The maximum number of concurrent nodes in cleaning ' + 'which are permitted in this Ironic system. ' + 'If this limit is reached, new requests will be ' + 'rejected until the number of nodes in cleaning ' + 'is lower than this maximum. As this is a security ' + 'mechanism requests are not queued, and this setting ' + 'is a global setting applying to all requests this ' + 'conductor receives, regardless of access rights. ' + 'The concurrent clean limit cannot be disabled.')), ] diff --git a/ironic/conf/deploy.py b/ironic/conf/deploy.py index 198e0ac95..ff2020105 100644 --- a/ironic/conf/deploy.py +++ b/ironic/conf/deploy.py @@ -108,7 +108,7 @@ opts = [ 'state. If True, shred will be invoked and cleaning ' 'will continue.')), cfg.IntOpt('disk_erasure_concurrency', - default=1, + default=4, min=1, mutable=True, help=_('Defines the target pool size used by Ironic Python ' diff --git a/ironic/db/api.py b/ironic/db/api.py index 712919bb3..45e3fe2ca 100644 --- a/ironic/db/api.py +++ b/ironic/db/api.py @@ -1416,3 +1416,12 @@ class Connection(object, metaclass=abc.ABCMeta): :param entires: A list of node history entriy id's to be queried for deletion. 
""" + + @abc.abstractmethod + def count_nodes_in_provision_state(self, state): + """Count the number of nodes in given provision state. + + :param state: A provision_state value to match for the + count operation. This can be a single provision + state value or a list of values. + """ diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 05d5cc45e..c14719af8 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -30,6 +30,7 @@ from oslo_utils import timeutils from oslo_utils import uuidutils from osprofiler import sqlalchemy as osp_sqlalchemy import sqlalchemy as sa +from sqlalchemy import or_ from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound from sqlalchemy.orm import joinedload from sqlalchemy.orm import Load @@ -2400,3 +2401,26 @@ class Connection(api.Connection): ).filter( models.NodeHistory.id.in_(entries) ).delete(synchronize_session=False) + + def count_nodes_in_provision_state(self, state): + if not isinstance(state, list): + state = [state] + with _session_for_read() as session: + # Intentionally does not use the full ORM model + # because that is de-duped by pkey, but we already + # have unique constraints on UUID/name, so... shouldn't + # be a big deal. #JuliaFamousLastWords. + # Anyway, intent here is to be as quick as possible and + # literally have the DB do *all* of the world, so no + # client side ops occur. The column is also indexed, + # which means this will be an index based response. + # TODO(TheJulia): This might need to be revised for + # SQLAlchemy 2.0 as it should be a scaler select and count + # instead. 
+ return session.query( + models.Node.provision_state + ).filter( + or_( + models.Node.provision_state == v for v in state + ) + ).count() diff --git a/ironic/tests/base.py b/ironic/tests/base.py index 4b34ef0a4..348f15c20 100644 --- a/ironic/tests/base.py +++ b/ironic/tests/base.py @@ -27,6 +27,7 @@ import subprocess import sys import tempfile from unittest import mock +import warnings import eventlet eventlet.monkey_patch(os=False) @@ -38,6 +39,7 @@ from oslo_log import log as logging from oslo_serialization import jsonutils from oslo_utils import uuidutils from oslotest import base as oslo_test_base +from sqlalchemy import exc as sqla_exc from ironic.common import config as ironic_config from ironic.common import context as ironic_context @@ -70,6 +72,84 @@ def _patch_mock_callable(obj): return False +class WarningsFixture(fixtures.Fixture): + """Filters out warnings during test runs.""" + + def setUp(self): + super().setUp() + + self._original_warning_filters = warnings.filters[:] + + # NOTE(sdague): Make deprecation warnings only happen once. Otherwise + # this gets kind of crazy given the way that upstream python libs use + # this. + warnings.simplefilter('once', DeprecationWarning) + + # NOTE(stephenfin): We get way too many of these. Silence them. + warnings.filterwarnings( + 'ignore', + message=( + 'Policy enforcement is depending on the value of .*. ' + 'This key is deprecated. Please update your policy ' + 'file to use the standard policy values.' + ), + ) + + # NOTE(mriedem): Ignore scope check UserWarnings from oslo.policy. 
+ warnings.filterwarnings( + 'ignore', + message='Policy .* failed scope check', + category=UserWarning, + ) + + # Enable deprecation warnings to capture upcoming SQLAlchemy changes + + warnings.filterwarnings( + 'ignore', + category=sqla_exc.SADeprecationWarning, + ) + + warnings.filterwarnings( + 'error', + module='ironic', + category=sqla_exc.SADeprecationWarning, + ) + + # Enable general SQLAlchemy warnings also to ensure we're not doing + # silly stuff. It's possible that we'll need to filter things out here + # with future SQLAlchemy versions, but that's a good thing + + warnings.filterwarnings( + 'error', + module='ironic', + category=sqla_exc.SAWarning, + ) + + # ...but filter everything out until we get around to fixing them + # TODO(stephenfin): Fix all of these + + warnings.filterwarnings( + 'ignore', + module='ironic', + message='SELECT statement has a cartesian product ', + category=sqla_exc.SAWarning, + ) + + # FIXME(stephenfin): We can remove this once oslo.db is fixed + # https://review.opendev.org/c/openstack/oslo.db/+/856453 + warnings.filterwarnings( + 'ignore', + module='ironic', + message='TypeDecorator .* will not produce a cache key', + category=sqla_exc.SAWarning, + ) + + self.addCleanup(self._reset_warning_filters) + + def _reset_warning_filters(self): + warnings.filters[:] = self._original_warning_filters + + class ReplaceModule(fixtures.Fixture): """Replace a module with a fake module.""" @@ -113,6 +193,7 @@ class TestCase(oslo_test_base.BaseTestCase): self.addCleanup(hash_ring.HashRingManager().reset) self.useFixture(fixtures.EnvironmentVariable('http_proxy')) self.policy = self.useFixture(policy_fixture.PolicyFixture()) + self.useFixture(WarningsFixture()) driver_factory.HardwareTypesFactory._extension_manager = None for factory in driver_factory._INTERFACE_LOADERS.values(): diff --git a/ironic/tests/unit/common/test_pxe_utils.py b/ironic/tests/unit/common/test_pxe_utils.py index 3e57b83ed..4d4fbb5b5 100644 --- 
a/ironic/tests/unit/common/test_pxe_utils.py +++ b/ironic/tests/unit/common/test_pxe_utils.py @@ -1357,7 +1357,7 @@ class PXEInterfacesTestCase(db_base.DbTestCase): 'LiveOS', 'squashfs.img')), 'ks_template': - (CONF.anaconda.default_ks_template, + ('file://' + CONF.anaconda.default_ks_template, os.path.join(CONF.deploy.http_root, self.node.uuid, 'ks.cfg.template')), @@ -1375,63 +1375,7 @@ class PXEInterfacesTestCase(db_base.DbTestCase): self.assertEqual(expected_info, image_info) # In the absense of kickstart template in both instance_info and # image default kickstart template is used - self.assertEqual(CONF.anaconda.default_ks_template, - image_info['ks_template'][0]) - calls = [mock.call(task.node), mock.call(task.node)] - boot_opt_mock.assert_has_calls(calls) - # Instance info gets presedence over kickstart template on the - # image - properties['properties'] = {'ks_template': 'glance://template_id'} - task.node.instance_info['ks_template'] = 'https://server/fake.tmpl' - image_show_mock.return_value = properties - image_info = pxe_utils.get_instance_image_info( - task, ipxe_enabled=False) - self.assertEqual('https://server/fake.tmpl', - image_info['ks_template'][0]) - - @mock.patch('ironic.drivers.modules.deploy_utils.get_boot_option', - return_value='kickstart', autospec=True) - @mock.patch.object(image_service.GlanceImageService, 'show', autospec=True) - def test_get_instance_image_info_with_kickstart_url( - self, image_show_mock, boot_opt_mock): - properties = {'properties': {u'kernel_id': u'instance_kernel_uuid', - u'ramdisk_id': u'instance_ramdisk_uuid', - u'image_source': u'http://path/to/os/'}} - - expected_info = {'ramdisk': - ('instance_ramdisk_uuid', - os.path.join(CONF.pxe.tftp_root, - self.node.uuid, - 'ramdisk')), - 'kernel': - ('instance_kernel_uuid', - os.path.join(CONF.pxe.tftp_root, - self.node.uuid, - 'kernel')), - 'ks_template': - (CONF.anaconda.default_ks_template, - os.path.join(CONF.deploy.http_root, - self.node.uuid, - 
'ks.cfg.template')), - 'ks_cfg': - ('', - os.path.join(CONF.deploy.http_root, - self.node.uuid, - 'ks.cfg'))} - image_show_mock.return_value = properties - self.context.auth_token = 'fake' - with task_manager.acquire(self.context, self.node.uuid, - shared=True) as task: - dii = task.node.driver_internal_info - dii['is_source_a_path'] = True - task.node.driver_internal_info = dii - task.node.save() - image_info = pxe_utils.get_instance_image_info( - task, ipxe_enabled=False) - self.assertEqual(expected_info, image_info) - # In the absense of kickstart template in both instance_info and - # image default kickstart template is used - self.assertEqual(CONF.anaconda.default_ks_template, + self.assertEqual('file://' + CONF.anaconda.default_ks_template, image_info['ks_template'][0]) calls = [mock.call(task.node), mock.call(task.node)] boot_opt_mock.assert_has_calls(calls) @@ -1463,7 +1407,7 @@ class PXEInterfacesTestCase(db_base.DbTestCase): self.node.uuid, 'kernel')), 'ks_template': - (CONF.anaconda.default_ks_template, + ('file://' + CONF.anaconda.default_ks_template, os.path.join(CONF.deploy.http_root, self.node.uuid, 'ks.cfg.template')), @@ -1490,7 +1434,7 @@ class PXEInterfacesTestCase(db_base.DbTestCase): self.assertEqual(expected_info, image_info) # In the absense of kickstart template in both instance_info and # image default kickstart template is used - self.assertEqual(CONF.anaconda.default_ks_template, + self.assertEqual('file://' + CONF.anaconda.default_ks_template, image_info['ks_template'][0]) calls = [mock.call(task.node), mock.call(task.node)] boot_opt_mock.assert_has_calls(calls) @@ -1577,6 +1521,46 @@ class PXEInterfacesTestCase(db_base.DbTestCase): list(fake_pxe_info.values()), True) + @mock.patch.object(os, 'chmod', autospec=True) + @mock.patch.object(pxe_utils, 'TFTPImageCache', lambda: None) + @mock.patch.object(pxe_utils, 'ensure_tree', autospec=True) + @mock.patch.object(deploy_utils, 'fetch_images', autospec=True) + def 
test_cache_ramdisk_kernel_ipxe_anaconda(self, mock_fetch_image, + mock_ensure_tree, mock_chmod): + expected_path = os.path.join(CONF.deploy.http_root, + self.node.uuid) + fake_pxe_info = {'ramdisk': + ('instance_ramdisk_uuid', + os.path.join(CONF.pxe.tftp_root, + self.node.uuid, + 'ramdisk')), + 'kernel': + ('instance_kernel_uuid', + os.path.join(CONF.pxe.tftp_root, + self.node.uuid, + 'kernel')), + 'ks_template': + ('file://' + CONF.anaconda.default_ks_template, + os.path.join(CONF.deploy.http_root, + self.node.uuid, + 'ks.cfg.template')), + 'ks_cfg': + ('', + os.path.join(CONF.deploy.http_root, + self.node.uuid, + 'ks.cfg'))} + expected = fake_pxe_info.copy() + expected.pop('ks_cfg') + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + pxe_utils.cache_ramdisk_kernel(task, fake_pxe_info, + ipxe_enabled=True) + mock_ensure_tree.assert_called_with(expected_path) + mock_fetch_image.assert_called_once_with(self.context, mock.ANY, + list(expected.values()), + True) + @mock.patch.object(pxe.PXEBoot, '__init__', lambda self: None) class PXEBuildKickstartConfigOptionsTestCase(db_base.DbTestCase): diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 65261450a..a4c3d57b6 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -51,8 +51,6 @@ class DoNodeCleanTestCase(db_base.DbTestCase): 'step': 'build_raid', 'priority': 0, 'interface': 'deploy'} def __do_node_clean_validate_fail(self, mock_validate, clean_steps=None): - # InvalidParameterValue should cause node to go to CLEANFAIL - mock_validate.side_effect = exception.InvalidParameterValue('error') tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE node = obj_utils.create_test_node( self.context, driver='fake-hardware', @@ -68,26 +66,42 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertIsNone(node.fault) 
mock_validate.assert_called_once_with(mock.ANY, mock.ANY) + def __do_node_clean_validate_fail_invalid(self, mock_validate, + clean_steps=None): + # InvalidParameterValue should cause node to go to CLEANFAIL + mock_validate.side_effect = exception.InvalidParameterValue('error') + self.__do_node_clean_validate_fail(mock_validate, + clean_steps=clean_steps) + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) def test__do_node_clean_automated_power_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate) + self.__do_node_clean_validate_fail_invalid(mock_validate) @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) def test__do_node_clean_manual_power_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) + self.__do_node_clean_validate_fail_invalid(mock_validate, + clean_steps=[]) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', autospec=True) def test__do_node_clean_automated_network_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate) + self.__do_node_clean_validate_fail_invalid(mock_validate) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', autospec=True) def test__do_node_clean_manual_network_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) + self.__do_node_clean_validate_fail_invalid(mock_validate, + clean_steps=[]) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + def test__do_node_clean_network_error_fail(self, mock_validate): + # NetworkError should cause node to go to CLEANFAIL + mock_validate.side_effect = exception.NetworkError() + self.__do_node_clean_validate_fail(mock_validate) @mock.patch.object(conductor_utils, 'LOG', autospec=True) @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', diff --git 
a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 378d65f15..5d84dbbef 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1829,6 +1829,7 @@ class ServiceDoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, def test_do_node_deploy_maintenance(self, mock_iwdi): mock_iwdi.return_value = False + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', maintenance=True) exc = self.assertRaises(messaging.rpc.ExpectedException, @@ -1843,6 +1844,7 @@ class ServiceDoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, self.assertFalse(mock_iwdi.called) def _test_do_node_deploy_validate_fail(self, mock_validate, mock_iwdi): + self._start_service() mock_iwdi.return_value = False # InvalidParameterValue should be re-raised as InstanceDeployFailure mock_validate.side_effect = exception.InvalidParameterValue('error') @@ -2389,6 +2391,7 @@ class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) def test_do_node_tear_down_validate_fail(self, mock_validate): + self._start_service() # InvalidParameterValue should be re-raised as InstanceDeployFailure mock_validate.side_effect = exception.InvalidParameterValue('error') node = obj_utils.create_test_node( @@ -8374,7 +8377,6 @@ class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin, # 9 retained due to days, 3 to config self.service._manage_node_history(self.context) events = objects.NodeHistory.list(self.context) - print(events) self.assertEqual(12, len(events)) events = objects.NodeHistory.list_by_node_id(self.context, 10) self.assertEqual(4, len(events)) @@ -8394,3 +8396,73 @@ class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin, self.assertEqual('one', events[1].event) self.assertEqual('two', events[2].event) self.assertEqual('three', events[3].event) + + +class 
ConcurrentActionLimitTestCase(mgr_utils.ServiceSetUpMixin, + db_base.DbTestCase): + + def setUp(self): + super(ConcurrentActionLimitTestCase, self).setUp() + self._start_service() + self.node1 = obj_utils.get_test_node( + self.context, + driver='fake-hardware', + id=110, + uuid=uuidutils.generate_uuid()) + self.node2 = obj_utils.get_test_node( + self.context, + driver='fake-hardware', + id=111, + uuid=uuidutils.generate_uuid()) + self.node3 = obj_utils.get_test_node( + self.context, + driver='fake-hardware', + id=112, + uuid=uuidutils.generate_uuid()) + self.node4 = obj_utils.get_test_node( + self.context, + driver='fake-hardware', + id=113, + uuid=uuidutils.generate_uuid()) + # Create the nodes, as the tasks need to operate across tables. + self.node1.create() + self.node2.create() + self.node3.create() + self.node4.create() + + def test_concurrent_action_limit_deploy(self): + self.node1.provision_state = states.DEPLOYING + self.node2.provision_state = states.DEPLOYWAIT + self.node1.save() + self.node2.save() + CONF.set_override('max_concurrent_deploy', 2, group='conductor') + self.assertRaises( + exception.ConcurrentActionLimit, + self.service._concurrent_action_limit, + 'provisioning') + self.service._concurrent_action_limit('unprovisioning') + self.service._concurrent_action_limit('cleaning') + CONF.set_override('max_concurrent_deploy', 3, group='conductor') + self.service._concurrent_action_limit('provisioning') + + def test_concurrent_action_limit_cleaning(self): + self.node1.provision_state = states.DELETING + self.node2.provision_state = states.CLEANING + self.node3.provision_state = states.CLEANWAIT + self.node1.save() + self.node2.save() + self.node3.save() + + CONF.set_override('max_concurrent_clean', 3, group='conductor') + self.assertRaises( + exception.ConcurrentActionLimit, + self.service._concurrent_action_limit, + 'cleaning') + self.assertRaises( + exception.ConcurrentActionLimit, + self.service._concurrent_action_limit, + 'unprovisioning') + 
self.service._concurrent_action_limit('provisioning') + CONF.set_override('max_concurrent_clean', 4, group='conductor') + self.service._concurrent_action_limit('cleaning') + self.service._concurrent_action_limit('unprovisioning') diff --git a/ironic/tests/unit/db/test_nodes.py b/ironic/tests/unit/db/test_nodes.py index eb5200f4e..b4d70b2dd 100644 --- a/ironic/tests/unit/db/test_nodes.py +++ b/ironic/tests/unit/db/test_nodes.py @@ -1081,3 +1081,39 @@ class DbNodeTestCase(base.DbTestCase): self.dbapi.check_node_list, [node1.uuid, 'this/cannot/be/a/name']) self.assertIn('this/cannot/be/a/name', str(exc)) + + def test_node_provision_state_count(self): + active_nodes = 5 + manageable_nodes = 3 + deploywait_nodes = 1 + for i in range(0, active_nodes): + utils.create_test_node(uuid=uuidutils.generate_uuid(), + provision_state=states.ACTIVE) + for i in range(0, manageable_nodes): + utils.create_test_node(uuid=uuidutils.generate_uuid(), + provision_state=states.MANAGEABLE) + for i in range(0, deploywait_nodes): + utils.create_test_node(uuid=uuidutils.generate_uuid(), + provision_state=states.DEPLOYWAIT) + + self.assertEqual( + active_nodes, + self.dbapi.count_nodes_in_provision_state(states.ACTIVE) + ) + self.assertEqual( + manageable_nodes, + self.dbapi.count_nodes_in_provision_state(states.MANAGEABLE) + ) + self.assertEqual( + deploywait_nodes, + self.dbapi.count_nodes_in_provision_state(states.DEPLOYWAIT) + ) + total = active_nodes + manageable_nodes + deploywait_nodes + self.assertEqual( + total, + self.dbapi.count_nodes_in_provision_state([ + states.ACTIVE, + states.MANAGEABLE, + states.DEPLOYWAIT + ]) + ) diff --git a/releasenotes/notes/concurrency-limit-control-4b101bca7136e08d.yaml b/releasenotes/notes/concurrency-limit-control-4b101bca7136e08d.yaml new file mode 100644 index 000000000..5eb8dd449 --- /dev/null +++ b/releasenotes/notes/concurrency-limit-control-4b101bca7136e08d.yaml @@ -0,0 +1,23 @@ +--- +features: + - | + Adds a concurrency limiter for number of 
nodes in states related to + *Cleaning* and *Provisioning* operations across the ironic deployment. + These settings default to a maximum number of concurrent deployments to + ``250`` and a maximum number of concurrent deletes and cleaning operations + to ``50``. These settings can be tuned using + ``[conductor]max_concurrent_deploy`` and + ``[conductor]max_concurrent_clean``, respectively. + The defaults should generally be good for most operators in most cases. + Large scale operators should evaluate the defaults and tune appropriately + as this feature cannot be disabled, as it is a security mechanism. +upgrade: + - | + Large scale operators should be aware that a new feature, referred to as + "Concurrent Action Limit" was introduced as a security mechanism to + provide a means to limit attackers, or faulty scripts, from potentially + causing irreparable harm to an environment. This feature cannot be + disabled, and operators are encouraged to tune the new settings + ``[conductor]max_concurrent_deploy`` and + ``[conductor]max_concurrent_clean`` to match the needs of their + environment. diff --git a/releasenotes/notes/correct-source-path-handling-lookups-4ce2023a56372f10.yaml b/releasenotes/notes/correct-source-path-handling-lookups-4ce2023a56372f10.yaml new file mode 100644 index 000000000..10d270a45 --- /dev/null +++ b/releasenotes/notes/correct-source-path-handling-lookups-4ce2023a56372f10.yaml @@ -0,0 +1,16 @@ +--- +fixes: + - | + Fixes an issue where image information retrieval would fail when a + path was supplied when using the ``anaconda`` deploy interface, + as `HTTP` ``HEAD`` requests on a URL path have no ``Content-Length``. + We now consider if a path is used prior to attempting to collect + additional configuration data from what is normally expected to + be Glance. + - | + Fixes an issue where the fallback to a default kickstart template + value would result in error indicating + "Scheme-less image href is not a UUID". 
+ This was because the handling code falling back to the default + did not explicitly indicate it was a file URL before saving the + value. diff --git a/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml b/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml new file mode 100644 index 000000000..f7769afc1 --- /dev/null +++ b/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue where cleaning operations could fail in such a way that was + not easily recoverable when pre-cleaning network interface configuration + was validated, yet contained invalid configuration. + Now Ironic properly captures the error and exits from cleaning in a + state which allows for cleaning to be retried. diff --git a/releasenotes/notes/maximum-disk-erasure-concurrency-6d132bd84e3df4cf.yaml b/releasenotes/notes/maximum-disk-erasure-concurrency-6d132bd84e3df4cf.yaml new file mode 100644 index 000000000..f09421593 --- /dev/null +++ b/releasenotes/notes/maximum-disk-erasure-concurrency-6d132bd84e3df4cf.yaml @@ -0,0 +1,10 @@ +--- +other: + - | + The maximum disk erasure concurrency setting, + ``[deploy]disk_erasure_concurrency`` has been increased to 4. + Previously, this was kept at 1 in order to maintain continuity of + experience, but operators have not reported any issues with an increased + concurrency, and as such we feel comfortable upstream enabling concurrent + disk erasure/cleaning. This setting applies to the ``erase_devices`` clean + step. 
@@ -11,7 +11,6 @@ setenv = VIRTUAL_ENV={envdir} PYTHONDONTWRITEBYTECODE = 1 LANGUAGE=en_US LC_ALL=en_US.UTF-8 - PYTHONWARNINGS=default::DeprecationWarning deps = -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master} -r{toxinidir}/requirements.txt diff --git a/zuul.d/ironic-jobs.yaml b/zuul.d/ironic-jobs.yaml index c9b969d4f..9d7435bd3 100644 --- a/zuul.d/ironic-jobs.yaml +++ b/zuul.d/ironic-jobs.yaml @@ -217,6 +217,48 @@ s-proxy: False - job: + name: ironic-standalone-anaconda + parent: ironic-standalone-redfish + description: + Test ironic with the anaconda deployment interface. + Test also uses Redfish. + required-projects: + - opendev.org/openstack/sushy-tools + irrelevant-files: + - ^.*\.rst$ + - ^api-ref/.*$ + - ^doc/.*$ + - ^install-guide/.*$ + - ^ironic/locale/.*$ + - ^ironic/tests/.*$ + - ^releasenotes/.*$ + - ^setup.cfg$ + - ^test-requirements.txt$ + - ^tools/.*$ + - ^tox.ini$ + vars: + tempest_test_regex: BaremetalRedfishIPxeAnacondaNoGlance + tempest_test_timeout: 4800 + tempest_concurrency: 2 + devstack_localrc: + IRONIC_ENABLED_DEPLOY_INTERFACES: "anaconda" + IRONIC_VM_COUNT: 2 + IRONIC_VM_VOLUME_COUNT: 1 + IRONIC_VM_SPECS_RAM: 3192 + IRONIC_VM_SPECS_CPU: 3 + IRONIC_ENFORCE_SCOPE: True + # We're using a lot of disk space in this job. 
Some testing nodes have + # a small root partition, so use /opt which is mounted from a bigger + # ephemeral partition on such nodes + LIBVIRT_STORAGE_POOL_PATH: /opt/libvirt/images + IRONIC_ANACONDA_IMAGE_REF: http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/ + IRONIC_ANACONDA_KERNEL_REF: http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/vmlinuz + IRONIC_ANACONDA_RAMDISK_REF: http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/initrd.img + IRONIC_ANACONDA_INSECURE_HEARTBEAT: True + IRONIC_DEPLOY_CALLBACK_WAIT_TIMEOUT: 3600 + IRONIC_PXE_BOOT_RETRY_TIMEOUT: 3600 + +- job: name: ironic-tempest-bios-redfish-pxe description: "Deploy ironic node over PXE using BIOS boot mode" parent: ironic-tempest-uefi-redfish-vmedia diff --git a/zuul.d/project.yaml b/zuul.d/project.yaml index 8b821f816..586675f87 100644 --- a/zuul.d/project.yaml +++ b/zuul.d/project.yaml @@ -45,6 +45,8 @@ voting: false - ironic-tempest-ipxe-ipv6: voting: false + - ironic-standalone-anaconda: + voting: false - ironic-inspector-tempest-rbac-scope-enforced: voting: false - bifrost-integration-tinyipa-ubuntu-focal: |