Clear rebalanced compute nodes from resource tracker

There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed from the DB and not repopulated. Ultimately, this prevents instances from being scheduled to these nodes.

The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache.

This change fixes the issue by clearing from the resource tracker cache any nodes for which a compute node entry does not exist. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated.

Change-Id: I39241223b447fcc671161c370dbf16e1773b684a
Partial-Bug: #1853009
author: Stephen Finucane <stephenfin@redhat.com> 2021-04-28 13:53:39 +0100
committer: Lee Yarwood <lyarwood@redhat.com> 2021-08-12 14:26:45 +0100
commit: 32676a9f45807ea8770dc7bdff1e859673af1b61 (patch)
tree: c757d6e88c3ee13042603cf7c1b9437f9df3ba3f /nova/tests/unit/compute
parent: 59d9871e8a0672538f8ffc43ae99b3d1c4b08909 (diff)
download: nova-32676a9f45807ea8770dc7bdff1e859673af1b61.tar.gz
2 files changed, 21 insertions, 2 deletions
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 3a8fa207db..282f2ee506 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -373,18 +373,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         )
 
         # First node in set should have been removed from DB
+        # Last node in set should have been added to DB.
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
                 rc_mock.delete_resource_provider.assert_called_once_with(
                     self.context, db_node, cascade=True)
-                mock_rt.remove_node.assert_called_once_with(
-                    'node1')
+                mock_rt.remove_node.assert_called_once_with('node1')
                 mock_log.error.assert_called_once_with(
                     "Failed to delete compute node resource provider for "
                     "compute node %s: %s", db_node.uuid, mock.ANY)
             else:
                 self.assertFalse(db_node.destroy.called)
+        self.assertEqual(1, mock_rt.remove_node.call_count)
+        mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
 
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete_resource_provider')
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 947e281b98..147a02bc90 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -4177,3 +4177,20 @@ class ProviderConfigTestCases(BaseTestCase):
         mock_log.warning.assert_called_once_with(*expected_log_call)
         self.assertIn(uuids.unknown, self.rt.absent_providers)
         self.assertEqual(result, [])
+
+
+class TestCleanComputeNodeCache(BaseTestCase):
+
+    def setUp(self):
+        super(TestCleanComputeNodeCache, self).setUp()
+        self._setup_rt()
+        self.context = context.RequestContext(
+            mock.sentinel.user_id, mock.sentinel.project_id)
+
+    @mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
+    def test_clean_compute_node_cache(self, mock_remove):
+        invalid_nodename = "invalid-node"
+        self.rt.compute_nodes[_NODENAME] = self.compute
+        self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
+        self.rt.clean_compute_node_cache([self.compute])
+        mock_remove.assert_called_once_with(invalid_nodename)
author	Stephen Finucane <stephenfin@redhat.com>	2021-04-28 13:53:39 +0100
committer	Lee Yarwood <lyarwood@redhat.com>	2021-08-12 14:26:45 +0100
commit	32676a9f45807ea8770dc7bdff1e859673af1b61 (patch)
tree	c757d6e88c3ee13042603cf7c1b9437f9df3ba3f /nova/tests/unit/compute
parent	59d9871e8a0672538f8ffc43ae99b3d1c4b08909 (diff)
download	nova-32676a9f45807ea8770dc7bdff1e859673af1b61.tar.gz