diff options
author | John Garbutt <john.garbutt@stackhpc.com> | 2022-05-18 19:06:36 +0100 |
---|---|---|
committer | Ruby Loo <opensrloo@gmail.com> | 2022-12-16 19:32:42 +0000 |
commit | d71e9f6ec4933f9430db55537a36678b16ce895a (patch) | |
tree | 36f3868a60cd101c61feb2148cb3f6d8051aee30 | |
parent | d92d0934188a14741dd86949ddf98bd1208f3d96 (diff) | |
download | nova-d71e9f6ec4933f9430db55537a36678b16ce895a.tar.gz |
Ironic: retry when node not available
After a baremetal instance is deleted, and its allocation is removed
in placement, the ironic node might start cleaning. Eventually nova
will notice and update the inventory to be reserved.
During this window, a new instance may have already picked this
ironic node.
When that race happens today, the build fails with an error:
"Failed to reserve node ..."
This change tries to ensure the remaining alternative hosts are
attempted before aborting the build.
Clearly the race is still there, but this makes it less painful.
Related-Bug: #1974070
Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d
(cherry picked from commit 8a476061c5e034016668cd9e5a20c4430ef6b68d)
-rw-r--r-- | nova/compute/manager.py | 3 | ||||
-rw-r--r-- | nova/tests/unit/compute/test_compute_mgr.py | 36 | ||||
-rw-r--r-- | nova/tests/unit/virt/ironic/test_driver.py | 22 | ||||
-rw-r--r-- | nova/virt/ironic/driver.py | 12 |
4 files changed, 70 insertions, 3 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index f25d037c50..d29348097f 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -2736,7 +2736,8 @@ class ComputeManager(manager.Manager): block_device_mapping) resources['block_device_info'] = block_device_info except (exception.InstanceNotFound, - exception.UnexpectedDeletingTaskStateError): + exception.UnexpectedDeletingTaskStateError, + exception.ComputeResourcesUnavailable): with excutils.save_and_reraise_exception(): self._build_resources_cleanup(instance, network_info) except (exception.UnexpectedTaskStateError, diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 1a4935f482..e521283acc 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7927,6 +7927,42 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup') @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn') + @mock.patch.object(virt_driver.ComputeDriver, + 'prepare_networks_before_block_device_mapping') + @mock.patch.object(virt_driver.ComputeDriver, + 'clean_networks_preparation') + def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet, + mock_prepspawn, mock_failedspawn): + mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable( + reason="asdf") + with mock.patch.object(self.compute, + '_build_networks_for_instance', + return_value=self.network_info + ) as _build_networks_for_instance: + + try: + with self.compute._build_resources(self.context, self.instance, + self.requested_networks, self.security_groups, + self.image, self.block_device_mapping, + self.resource_provider_mapping, self.accel_uuids): + pass + except Exception as e: + self.assertIsInstance(e, + exception.ComputeResourcesUnavailable) + + _build_networks_for_instance.assert_has_calls( + [mock.call(self.context, self.instance, + 
self.requested_networks, self.security_groups, + self.resource_provider_mapping, + self.network_arqs)]) + + mock_prepnet.assert_not_called() + mock_clean.assert_called_once_with(self.instance, self.network_info) + mock_prepspawn.assert_called_once_with(self.instance) + mock_failedspawn.assert_called_once_with(self.instance) + + @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup') + @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn') @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance') def test_build_resources_aborts_on_failed_network_alloc(self, mock_build, mock_prepspawn, diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 6ac7ca464e..9b5e31db83 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -2500,7 +2500,10 @@ class IronicDriverTestCase(test.NoDBTestCase): @mock.patch.object(cw.IronicClientWrapper, 'call') def test_prepare_for_spawn(self, mock_call): - node = ironic_utils.get_test_node(driver='fake') + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.AVAILABLE, + power_state=ironic_states.POWER_OFF) self.mock_conn.get_node.return_value = node instance = fake_instance.fake_instance_obj(self.ctx, node=node.uuid) @@ -2532,7 +2535,10 @@ class IronicDriverTestCase(test.NoDBTestCase): instance) def test_prepare_for_spawn_conflict(self): - node = ironic_utils.get_test_node(driver='fake') + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.AVAILABLE, + power_state=ironic_states.POWER_OFF) self.mock_conn.get_node.return_value = node self.mock_conn.update_node.side_effect = sdk_exc.ConflictException instance = fake_instance.fake_instance_obj(self.ctx, node=node.id) @@ -2540,6 +2546,18 @@ class IronicDriverTestCase(test.NoDBTestCase): self.driver.prepare_for_spawn, instance) + def 
test_prepare_for_spawn_not_available(self): + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.CLEANWAIT, + power_state=ironic_states.POWER_OFF) + self.mock_conn.get_node.return_value = node + self.mock_conn.update_node.side_effect = sdk_exc.ConflictException + instance = fake_instance.fake_instance_obj(self.ctx, node=node.id) + self.assertRaises(exception.ComputeResourcesUnavailable, + self.driver.prepare_for_spawn, + instance) + @mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy') def test_failed_spawn_cleanup(self, mock_cleanup): node = ironic_utils.get_test_node(driver='fake') diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 7496db5a7c..5f5f3a6dd7 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -397,6 +397,18 @@ class IronicDriver(virt_driver.ComputeDriver): _("Ironic node uuid not supplied to " "driver for instance %s.") % instance.uuid) node = self._get_node(node_uuid) + + # Its possible this node has just moved from deleting + # to cleaning. Placement will update the inventory + # as all reserved, but this instance might have got here + # before that happened, but after the previous allocation + # got deleted. We trigger a re-schedule to another node. + if (self._node_resources_used(node) or + self._node_resources_unavailable(node)): + msg = "Chosen ironic node %s is not available" % node_uuid + LOG.info(msg, instance=instance) + raise exception.ComputeResourcesUnavailable(reason=msg) + self._set_instance_id(node, instance) def failed_spawn_cleanup(self, instance): |