diff options
author | Balazs Gibizer <balazs.gibizer@est.tech> | 2020-09-28 18:44:02 +0200 |
---|---|---|
committer | Balazs Gibizer <balazs.gibizer@est.tech> | 2020-11-18 11:36:36 +0100 |
commit | 3ecc098d28addd8f3e1da16b29940f4337209e62 (patch) | |
tree | 617e34fe0681e161fdafab23ad2511cd76e9d523 | |
parent | d768cdbb88d0b0b3ca38623c4bb26d5eabdf1596 (diff) | |
download | nova-3ecc098d28addd8f3e1da16b29940f4337209e62.tar.gz |
Set instance host and drop migration under lock
The _update_available_resources periodic makes resource allocation
adjustments while holding the COMPUTE_RESOURCE_SEMAPHORE based on the
list of instances assigned to this host of the resource tracker and
based on the migrations where the source or the target host is the host
of the resource tracker. So if the instance.host or the migration
context changes without holding the COMPUTE_RESOURCE_SEMAPHORE while
the _update_available_resources task is running, there will be data
inconsistency in the resource tracker.
This patch makes sure that during evacuation the instance.host and the
migration context are changed while holding the semaphore.
Change-Id: Ica180165184b319651d22fe77e076af036228860
Closes-Bug: #1896463
(cherry picked from commit 7675964af81472f3dd57db952d704539e61a3e6e)
-rw-r--r-- | nova/compute/manager.py | 18 | ||||
-rw-r--r-- | nova/compute/resource_tracker.py | 18 | ||||
-rw-r--r-- | nova/tests/functional/regressions/test_bug_1896463.py | 11 | ||||
-rw-r--r-- | nova/tests/unit/compute/test_compute_mgr.py | 21 |
4 files changed, 35 insertions, 33 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 8b6041945f..7a3ae9035d 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3472,19 +3472,11 @@ class ComputeManager(manager.Manager): self._notify_instance_rebuild_error(context, instance, e, bdms) raise else: - instance.apply_migration_context() - # NOTE (ndipanov): This save will now update the host and node - # attributes making sure that next RT pass is consistent since - # it will be based on the instance and not the migration DB - # entry. - instance.host = self.host - instance.node = scheduled_node - instance.save() - instance.drop_migration_context() - - # NOTE (ndipanov): Mark the migration as done only after we - # mark the instance as belonging to this host. - self._set_migration_status(migration, 'done') + # NOTE(gibi): Let the resource tracker set the instance + # host and drop the migration context as we need to hold the + # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with + # _update_available_resources. See bug 1896463. + self.rt.finish_evacuation(instance, scheduled_node, migration) def _do_rebuild_instance_with_claim( self, context, instance, orig_image_ref, image_meta, diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 33926e0d47..5b9e64e219 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1970,3 +1970,21 @@ class ResourceTracker(object): """ self.pci_tracker.free_instance_claims(context, instance) self.pci_tracker.save(context) + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def finish_evacuation(self, instance, node, migration): + instance.apply_migration_context() + # NOTE (ndipanov): This save will now update the host and node + # attributes making sure that next RT pass is consistent since + # it will be based on the instance and not the migration DB + # entry. 
+ instance.host = self.host + instance.node = node + instance.save() + instance.drop_migration_context() + + # NOTE (ndipanov): Mark the migration as done only after we + # mark the instance as belonging to this host. + if migration: + migration.status = 'done' + migration.save() diff --git a/nova/tests/functional/regressions/test_bug_1896463.py b/nova/tests/functional/regressions/test_bug_1896463.py index 40d0048147..dc74791e0e 100644 --- a/nova/tests/functional/regressions/test_bug_1896463.py +++ b/nova/tests/functional/regressions/test_bug_1896463.py @@ -219,13 +219,4 @@ class TestEvacuateResourceTrackerRace( server, {'OS-EXT-SRV-ATTR:host': 'host2', 'status': 'ACTIVE'}) self._assert_pci_device_allocated(server['id'], self.compute1_id) - - # This is bug #1896463 as the PCI allocation was deleted by the racing - # _update_available_resource periodic task. - self._assert_pci_device_allocated( - server['id'], self.compute2_id, num=0) - - # FIXME(gibi): When this bug is fixed (or if you remove the sleeps - # above to avoid the race condition) then we expect that the PCI - # allocation exists on the destination host too. 
- # self._assert_pci_device_allocated(server['id'], self.compute2_id) + self._assert_pci_device_allocated(server['id'], self.compute2_id) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 3265f8056b..5d30ed8df6 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -5340,19 +5340,21 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, node = uuidutils.generate_uuid() # ironic node uuid instance = fake_instance.fake_instance_obj(self.context, node=node) instance.migration_context = None + migration = objects.Migration(status='accepted') with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance_with_claim'), mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + mock.patch.object(migration, 'save'), + ) as (mock_get, mock_rebuild, mock_save, mock_migration_save): self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, False, - False, False, None, None, {}, None, []) + False, False, migration, None, {}, None, []) self.assertFalse(mock_get.called) self.assertEqual(node, instance.node) - mock_set.assert_called_once_with(None, 'done') + self.assertEqual('done', migration.status) + mock_migration_save.assert_called_once_with() def test_rebuild_node_updated_if_recreate(self): dead_node = uuidutils.generate_uuid() @@ -5365,16 +5367,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance'), - mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + ) as (mock_get, mock_rebuild): mock_get.return_value.hypervisor_hostname = 
'new-node' self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, - None, True, False, False, None, None, {}, None, []) + None, True, False, False, mock.sentinel.migration, None, {}, + None, []) mock_get.assert_called_once_with(mock.ANY, self.compute.host) - self.assertEqual('new-node', instance.node) - mock_set.assert_called_once_with(None, 'done') + mock_rt.finish_evacuation.assert_called_once_with( + instance, 'new-node', mock.sentinel.migration) # Make sure the rebuild_claim was called with the proper image_meta # from the instance. mock_rt.rebuild_claim.assert_called_once() |