summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Garbutt <john.garbutt@stackhpc.com>2022-05-18 19:06:36 +0100
committerRuby Loo <opensrloo@gmail.com>2022-12-16 19:32:42 +0000
commitd71e9f6ec4933f9430db55537a36678b16ce895a (patch)
tree36f3868a60cd101c61feb2148cb3f6d8051aee30
parentd92d0934188a14741dd86949ddf98bd1208f3d96 (diff)
downloadnova-d71e9f6ec4933f9430db55537a36678b16ce895a.tar.gz
Ironic: retry when node not available
After a baremetal instance is deleted, and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node. When that race happens today the build fails with an error: "Failed to reserve node ..." This change tries to ensure the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful. Related-Bug: #1974070 Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d (cherry picked from commit 8a476061c5e034016668cd9e5a20c4430ef6b68d)
-rw-r--r--nova/compute/manager.py3
-rw-r--r--nova/tests/unit/compute/test_compute_mgr.py36
-rw-r--r--nova/tests/unit/virt/ironic/test_driver.py22
-rw-r--r--nova/virt/ironic/driver.py12
4 files changed, 70 insertions, 3 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index f25d037c50..d29348097f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -2736,7 +2736,8 @@ class ComputeManager(manager.Manager):
block_device_mapping)
resources['block_device_info'] = block_device_info
except (exception.InstanceNotFound,
- exception.UnexpectedDeletingTaskStateError):
+ exception.UnexpectedDeletingTaskStateError,
+ exception.ComputeResourcesUnavailable):
with excutils.save_and_reraise_exception():
self._build_resources_cleanup(instance, network_info)
except (exception.UnexpectedTaskStateError,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1a4935f482..e521283acc 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -7927,6 +7927,42 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
+ @mock.patch.object(virt_driver.ComputeDriver,
+ 'prepare_networks_before_block_device_mapping')
+ @mock.patch.object(virt_driver.ComputeDriver,
+ 'clean_networks_preparation')
+ def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
+ mock_prepspawn, mock_failedspawn):
+ mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
+ reason="asdf")
+ with mock.patch.object(self.compute,
+ '_build_networks_for_instance',
+ return_value=self.network_info
+ ) as _build_networks_for_instance:
+
+ try:
+ with self.compute._build_resources(self.context, self.instance,
+ self.requested_networks, self.security_groups,
+ self.image, self.block_device_mapping,
+ self.resource_provider_mapping, self.accel_uuids):
+ pass
+ except Exception as e:
+ self.assertIsInstance(e,
+ exception.ComputeResourcesUnavailable)
+
+ _build_networks_for_instance.assert_has_calls(
+ [mock.call(self.context, self.instance,
+ self.requested_networks, self.security_groups,
+ self.resource_provider_mapping,
+ self.network_arqs)])
+
+ mock_prepnet.assert_not_called()
+ mock_clean.assert_called_once_with(self.instance, self.network_info)
+ mock_prepspawn.assert_called_once_with(self.instance)
+ mock_failedspawn.assert_called_once_with(self.instance)
+
+ @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
+ @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
def test_build_resources_aborts_on_failed_network_alloc(self, mock_build,
mock_prepspawn,
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 6ac7ca464e..9b5e31db83 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -2500,7 +2500,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
@mock.patch.object(cw.IronicClientWrapper, 'call')
def test_prepare_for_spawn(self, mock_call):
- node = ironic_utils.get_test_node(driver='fake')
+ node = ironic_utils.get_test_node(
+ driver='fake', instance_uuid=None,
+ provision_state=ironic_states.AVAILABLE,
+ power_state=ironic_states.POWER_OFF)
self.mock_conn.get_node.return_value = node
instance = fake_instance.fake_instance_obj(self.ctx,
node=node.uuid)
@@ -2532,7 +2535,10 @@ class IronicDriverTestCase(test.NoDBTestCase):
instance)
def test_prepare_for_spawn_conflict(self):
- node = ironic_utils.get_test_node(driver='fake')
+ node = ironic_utils.get_test_node(
+ driver='fake', instance_uuid=None,
+ provision_state=ironic_states.AVAILABLE,
+ power_state=ironic_states.POWER_OFF)
self.mock_conn.get_node.return_value = node
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
@@ -2540,6 +2546,18 @@ class IronicDriverTestCase(test.NoDBTestCase):
self.driver.prepare_for_spawn,
instance)
+ def test_prepare_for_spawn_not_available(self):
+ node = ironic_utils.get_test_node(
+ driver='fake', instance_uuid=None,
+ provision_state=ironic_states.CLEANWAIT,
+ power_state=ironic_states.POWER_OFF)
+ self.mock_conn.get_node.return_value = node
+ self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
+ instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
+ self.assertRaises(exception.ComputeResourcesUnavailable,
+ self.driver.prepare_for_spawn,
+ instance)
+
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
def test_failed_spawn_cleanup(self, mock_cleanup):
node = ironic_utils.get_test_node(driver='fake')
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 7496db5a7c..5f5f3a6dd7 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -397,6 +397,18 @@ class IronicDriver(virt_driver.ComputeDriver):
_("Ironic node uuid not supplied to "
"driver for instance %s.") % instance.uuid)
node = self._get_node(node_uuid)
+
+ # Its possible this node has just moved from deleting
+ # to cleaning. Placement will update the inventory
+ # as all reserved, but this instance might have got here
+ # before that happened, but after the previous allocation
+ # got deleted. We trigger a re-schedule to another node.
+ if (self._node_resources_used(node) or
+ self._node_resources_unavailable(node)):
+ msg = "Chosen ironic node %s is not available" % node_uuid
+ LOG.info(msg, instance=instance)
+ raise exception.ComputeResourcesUnavailable(reason=msg)
+
self._set_instance_id(node, instance)
def failed_spawn_cleanup(self, instance):