author     Balazs Gibizer <balazs.gibizer@est.tech>    2020-09-28 18:44:02 +0200
committer  Balazs Gibizer <gibi@redhat.com>            2022-11-22 17:04:37 +0100
commit     d5506014837097261231b205e9b3acbfa503b354 (patch)
tree       2695c8955f71f1d95ce15b75792be9df95e86aa1
parent     e558aa2046b649696b69551657e5d429e8117427 (diff)
download   nova-d5506014837097261231b205e9b3acbfa503b354.tar.gz
Set instance host and drop migration under lock
The _update_available_resources periodic task makes resource allocation
adjustments while holding the COMPUTE_RESOURCE_SEMAPHORE, based on the list
of instances assigned to the host of the resource tracker and on the
migrations where the source or the target host is the host of the resource
tracker. So if the instance.host or the migration context changes without
holding the COMPUTE_RESOURCE_SEMAPHORE while the _update_available_resources
task is running, there will be data inconsistency in the resource tracker.

This patch makes sure that during evacuation the instance.host and the
migration context are changed while holding the semaphore.

stable/train specific change:

Fair locking was introduced from Ussuri forward. See
Ia5e521e0f0c7a78b5ace5de9f343e84d872553f9. So the train backport does not
add fair=True to the synchronized decorator of finish_evacuation, as that
would interfere with the other functions taking the same lock without
fair=True.

Change-Id: Ica180165184b319651d22fe77e076af036228860
Closes-Bug: #1896463
(cherry picked from commit 7675964af81472f3dd57db952d704539e61a3e6e)
(cherry picked from commit 3ecc098d28addd8f3e1da16b29940f4337209e62)
(cherry picked from commit 3b497ad7d41bc84ec50124c4d55f4138c4878751)
-rw-r--r--  nova/compute/manager.py                                  18
-rw-r--r--  nova/compute/resource_tracker.py                         18
-rw-r--r--  nova/tests/functional/regressions/test_bug_1896463.py    11
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py              30
4 files changed, 40 insertions, 37 deletions
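Before the diff, a minimal, self-contained sketch of the synchronization the
commit message describes. This is not Nova code: MiniResourceTracker, the
dict bookkeeping and the use of a threading.Lock in place of the named
COMPUTE_RESOURCE_SEMAPHORE are illustrative assumptions. The point is only
that the periodic audit and the evacuation finalization take the same lock,
so the audit can never observe a half-updated instance.host/migration pair.

# Minimal, self-contained sketch of the locking pattern -- NOT Nova code.
import threading

COMPUTE_RESOURCE_SEMAPHORE = threading.Lock()  # stand-in for Nova's named lock


class MiniResourceTracker(object):
    def __init__(self, host):
        self.host = host
        self.tracked = {}  # instance uuid -> host this tracker believes owns it

    def update_available_resource(self, instances):
        # Periodic audit: adjusts bookkeeping based on instance.host while
        # holding the semaphore for the whole pass.
        with COMPUTE_RESOURCE_SEMAPHORE:
            for inst in instances:
                if inst['host'] == self.host:
                    self.tracked[inst['uuid']] = inst['host']
                else:
                    self.tracked.pop(inst['uuid'], None)

    def finish_evacuation(self, inst, node, migration):
        # The fix: flip instance.host and finalize the migration only while
        # holding the same semaphore, so the audit above cannot free the
        # resources of an instance that is mid-evacuation.
        with COMPUTE_RESOURCE_SEMAPHORE:
            inst['host'] = self.host
            inst['node'] = node
            if migration:
                migration['status'] = 'done'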
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index d29b58534b..5027362c9b 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3479,19 +3479,11 @@ class ComputeManager(manager.Manager):
self._notify_instance_rebuild_error(context, instance, e, bdms)
raise
else:
- instance.apply_migration_context()
- # NOTE (ndipanov): This save will now update the host and node
- # attributes making sure that next RT pass is consistent since
- # it will be based on the instance and not the migration DB
- # entry.
- instance.host = self.host
- instance.node = scheduled_node
- instance.save()
- instance.drop_migration_context()
-
- # NOTE (ndipanov): Mark the migration as done only after we
- # mark the instance as belonging to this host.
- self._set_migration_status(migration, 'done')
+ # NOTE(gibi): Let the resource tracker set the instance
+ # host and drop the migration context as we need to hold the
+ # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with
+ # _update_available_resources. See bug 1896463.
+ self.rt.finish_evacuation(instance, scheduled_node, migration)
def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
"""Helper to avoid deep nesting in the top-level method."""
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index b2ba0145cf..c34f705eaa 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1801,3 +1801,21 @@ class ResourceTracker(object):
"""
self.pci_tracker.free_instance_claims(context, instance)
self.pci_tracker.save(context)
+
+ @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
+ def finish_evacuation(self, instance, node, migration):
+ instance.apply_migration_context()
+ # NOTE (ndipanov): This save will now update the host and node
+ # attributes making sure that next RT pass is consistent since
+ # it will be based on the instance and not the migration DB
+ # entry.
+ instance.host = self.host
+ instance.node = node
+ instance.save()
+ instance.drop_migration_context()
+
+ # NOTE (ndipanov): Mark the migration as done only after we
+ # mark the instance as belonging to this host.
+ if migration:
+ migration.status = 'done'
+ migration.save()
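The @utils.synchronized decorator added above is what the stable/train note
in the commit message refers to. As a hedged illustration only, assuming
nova's utils.synchronized is a thin wrapper around oslo.concurrency's
lockutils.synchronized (the lock name string and the stub functions below
are invented for the example), the fair keyword it declines to pass looks
like this:

# Hedged illustration of the fair=True remark; not Nova code.
from oslo_concurrency import lockutils

COMPUTE_RESOURCE_SEMAPHORE = 'compute_resources'  # illustrative lock name


@lockutils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
def update_available_resource():
    """Periodic audit stub: runs with the named lock held."""


# From Ussuri on the decorator could request fair (FIFO) waiter ordering:
#
#     @lockutils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
#
# The train backport deliberately leaves fair=True off finish_evacuation,
# because the other functions taking the same lock do not pass it either and
# mixing the two would interfere (see the commit message above).
@lockutils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
def finish_evacuation():
    """Evacuation finalization stub: serialized against the audit above."""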
diff --git a/nova/tests/functional/regressions/test_bug_1896463.py b/nova/tests/functional/regressions/test_bug_1896463.py
index af381b69fc..a6d38f4835 100644
--- a/nova/tests/functional/regressions/test_bug_1896463.py
+++ b/nova/tests/functional/regressions/test_bug_1896463.py
@@ -231,13 +231,4 @@ class TestEvacuateResourceTrackerRace(
)
self._assert_pci_device_allocated(server['id'], self.compute1_id)
-
- # This is bug #1896463 as the PCI allocation was deleted by the racing
- # _update_available_resource periodic task.
- self._assert_pci_device_allocated(
- server['id'], self.compute2_id, num=0)
-
- # FIXME(gibi): When this bug is fixed (or if you remove the sleeps
- # above to avoid the race condition) then we expect that the PCI
- # allocation exists on the destination host too.
- # self._assert_pci_device_allocated(server['id'], self.compute2_id)
+ self._assert_pci_device_allocated(server['id'], self.compute2_id)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 17fcb0e99b..dfa14a4c8d 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5161,18 +5161,21 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
node = uuidutils.generate_uuid() # ironic node uuid
instance = fake_instance.fake_instance_obj(self.context, node=node)
instance.migration_context = None
+ migration = objects.Migration(status='accepted')
with test.nested(
mock.patch.object(self.compute, '_get_compute_info'),
mock.patch.object(self.compute, '_do_rebuild_instance_with_claim'),
mock.patch.object(objects.Instance, 'save'),
- mock.patch.object(self.compute, '_set_migration_status'),
- ) as (mock_get, mock_rebuild, mock_save, mock_set):
- self.compute.rebuild_instance(self.context, instance, None, None,
- None, None, None, None, False,
- False, False, None, None, {}, None)
+ mock.patch.object(migration, 'save'),
+ ) as (mock_get, mock_rebuild, mock_save, mock_migration_save):
+ self.compute.rebuild_instance(
+ self.context, instance, None, None,
+ None, None, None, None, False,
+ False, False, migration, None, {}, None)
self.assertFalse(mock_get.called)
self.assertEqual(node, instance.node)
- mock_set.assert_called_once_with(None, 'done')
+ self.assertEqual('done', migration.status)
+ mock_migration_save.assert_called_once_with()
def test_rebuild_node_updated_if_recreate(self):
dead_node = uuidutils.generate_uuid()
@@ -5185,16 +5188,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
with test.nested(
mock.patch.object(self.compute, '_get_compute_info'),
mock.patch.object(self.compute, '_do_rebuild_instance_with_claim'),
- mock.patch.object(objects.Instance, 'save'),
- mock.patch.object(self.compute, '_set_migration_status'),
- ) as (mock_get, mock_rebuild, mock_save, mock_set):
+ ) as (mock_get, mock_rebuild):
mock_get.return_value.hypervisor_hostname = 'new-node'
- self.compute.rebuild_instance(self.context, instance, None, None,
- None, None, None, None, True,
- False, False, None, None, {}, None)
+ self.compute.rebuild_instance(
+ self.context, instance, None, None, None, None, None,
+ None, True, False, False, mock.sentinel.migration, None, {},
+ None)
mock_get.assert_called_once_with(mock.ANY, self.compute.host)
- self.assertEqual('new-node', instance.node)
- mock_set.assert_called_once_with(None, 'done')
+ mock_rt.finish_evacuation.assert_called_once_with(
+ instance, 'new-node', mock.sentinel.migration)
# Make sure the rebuild_claim was called with the proper image_meta
# from the instance.
mock_rt.rebuild_claim.assert_called_once()