author     Stephen Finucane <stephenfin@redhat.com>  2020-08-05 14:27:06 +0100
committer  Lee Yarwood <lyarwood@redhat.com>          2021-02-04 17:08:15 +0000
commit     75f9b288f8edfd24affe5ecbc1f3efb6a63726e4 (patch)
tree       e739c7c8c5f6b84f88c7164c4b267274d02175f0
parent     6d8d2cb24af123c64e65d30988c9b74c0191075b (diff)
download   nova-75f9b288f8edfd24affe5ecbc1f3efb6a63726e4.tar.gz
Don't unset Instance.old_flavor, new_flavor until necessary
Since change Ia6d8a7909081b0b856bd7e290e234af7e42a2b38, the resource tracker's 'drop_move_claim' method has been capable of freeing up resource usage. However, this relies on accurate resource reporting. It transpires that there's a race whereby the resource tracker's 'update_available_resource' periodic task can end up not accounting for usage from migrations that are in the process of being completed. The root cause is the resource tracker's reliance on the stashed flavor in a given migration record [1]. Previously, this information was deleted by the compute manager at the start of the confirm migration operation [2]. The compute manager would then call the virt driver [3], which could take a not insignificant amount of time to return, before finally dropping the move claim. If the periodic task ran between the clearing of the stashed flavor and the return of the virt driver, it would find a migration record with no stashed flavor and would therefore ignore this record for accounting purposes [4], resulting in an incorrect record for the compute node, and an exception when 'drop_move_claim' attempts to free up the resources that aren't being tracked.

The solution to this issue is pretty simple. Instead of unsetting the old flavor record from the migration at the start of the various move operations, do it afterwards.

Conflicts:
    nova/compute/manager.py

NOTE(stephenfin): Conflicts are due to a number of missing cross-cell resize changes.

[1] https://github.com/openstack/nova/blob/6557d67/nova/compute/resource_tracker.py#L1288
[2] https://github.com/openstack/nova/blob/6557d67/nova/compute/manager.py#L4310-L4315
[3] https://github.com/openstack/nova/blob/6557d67/nova/compute/manager.py#L4330-L4331
[4] https://github.com/openstack/nova/blob/6557d67/nova/compute/resource_tracker.py#L1300

Change-Id: I4760b01b695c94fa371b72216d398388cf981d28
Signed-off-by: Stephen Finucane <stephenfin@redhat.com>
Partial-Bug: #1879878
Related-Bug: #1834349
Related-Bug: #1818914
(cherry picked from commit 44376d2e212e0f9405a58dc7fc4d5b38d70ac42e)
(cherry picked from commit ce95af2caf69cb1b650459718fd4fa5f00ff28f5)
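To make the ordering problem concrete, below is a minimal, self-contained Python sketch of the race described above and of the reordered cleanup this patch introduces. Every name in it (the simplified Instance class, periodic_usage_for, confirm_resize_before_fix, confirm_resize_after_fix, and the virt_driver_cleanup / drop_move_claim callables) is an illustrative stand-in, not the actual Nova objects or resource tracker API:

    # Illustrative sketch only: hypothetical stand-ins, not real Nova code.

    class Instance:
        def __init__(self, old_flavor_vcpus):
            # 'old_flavor' stands in for the stashed flavor that the
            # resource tracker relies on to account for in-progress
            # migrations.
            self.old_flavor = {'vcpus': old_flavor_vcpus}
            self.new_flavor = None

        def save(self):
            pass  # stand-in for persisting the instance to the database


    def periodic_usage_for(instance):
        # Stand-in for the 'update_available_resource' periodic task: a
        # migration with no stashed flavor is ignored, so its usage goes
        # unaccounted.
        if instance.old_flavor is None:
            return 0
        return instance.old_flavor['vcpus']


    def confirm_resize_before_fix(instance, virt_driver_cleanup,
                                  drop_move_claim):
        # Old ordering: the stashed flavor is cleared *before* the (slow)
        # virt driver call. If the periodic task runs in that window it
        # under-reports usage, and drop_move_claim later tries to free
        # resources that are no longer being tracked.
        old_flavor = instance.old_flavor
        instance.old_flavor = None
        instance.new_flavor = None
        instance.save()
        virt_driver_cleanup()          # may take a long time to return
        drop_move_claim(old_flavor)


    def confirm_resize_after_fix(instance, virt_driver_cleanup,
                                 drop_move_claim):
        # New ordering: keep the stashed flavor until everything that
        # depends on it has run, then unset it in a finally block so it
        # is cleared whether or not the cleanup raised.
        try:
            virt_driver_cleanup()
            drop_move_claim(instance.old_flavor)
        finally:
            instance.old_flavor = None
            instance.new_flavor = None
            instance.save()

With the second ordering, a concurrently running periodic_usage_for() still sees the stashed flavor for the entire duration of the virt driver call, which mirrors how the patched confirm/revert paths below defer _delete_stashed_flavor_info until the very end.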
-rw-r--r--  nova/compute/manager.py                              | 54
-rw-r--r--  nova/tests/functional/libvirt/test_numa_servers.py   | 13
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py          | 11
3 files changed, 45 insertions, 33 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index eaedc0238f..590f4edb26 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -4246,14 +4246,17 @@ class ComputeManager(manager.Manager):
instance=instance)
finally:
# Whether an error occurred or not, at this point the
- # instance is on the dest host so to avoid leaking
- # allocations in placement, delete them here.
+ # instance is on the dest host. Avoid leaking allocations
+ # in placement by deleting them here...
self._delete_allocation_after_move(
context, instance, migration)
- # Also as the instance is not any more on this host, update
- # the scheduler about the move
+ # ...inform the scheduler about the move...
self._delete_scheduler_instance_info(
context, instance.uuid)
+ # ...and unset the cached flavor information (this is done
+ # last since the resource tracker relies on it for its
+ # periodic tasks)
+ self._delete_stashed_flavor_info(instance)
do_confirm_resize(context, instance, migration.id)
@@ -4292,13 +4295,6 @@ class ComputeManager(manager.Manager):
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
phase=fields.NotificationPhase.START)
- # NOTE(danms): delete stashed migration information
- old_instance_type = instance.old_flavor
- instance.old_flavor = None
- instance.new_flavor = None
- instance.system_metadata.pop('old_vm_state', None)
- instance.save()
-
# NOTE(tr3buchet): tear down networks on source host
self.network_api.setup_networks_on_host(context, instance,
migration.source_compute, teardown=True)
@@ -4323,8 +4319,9 @@ class ComputeManager(manager.Manager):
# instance.migration_context so make sure to not call
# instance.drop_migration_context() until after drop_move_claim
# is called.
- self.rt.drop_move_claim(context, instance, migration.source_node,
- old_instance_type, prefix='old_')
+ self.rt.drop_move_claim(
+ context, instance, migration.source_node, instance.old_flavor,
+ prefix='old_')
instance.drop_migration_context()
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
@@ -4375,6 +4372,13 @@ class ComputeManager(manager.Manager):
'migration_uuid': migration.uuid})
raise
+ def _delete_stashed_flavor_info(self, instance):
+ """Remove information about the flavor change after a resize."""
+ instance.old_flavor = None
+ instance.new_flavor = None
+ instance.system_metadata.pop('old_vm_state', None)
+ instance.save()
+
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@@ -4492,6 +4496,16 @@ class ComputeManager(manager.Manager):
revert the resized attributes in the database.
"""
+ try:
+ self._finish_revert_resize(
+ context, instance, migration, request_spec)
+ finally:
+ self._delete_stashed_flavor_info(instance)
+
+ def _finish_revert_resize(
+ self, context, instance, migration, request_spec=None,
+ ):
+ """Inner version of finish_revert_resize."""
with self._error_out_instance_on_exception(context, instance):
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
@@ -4501,18 +4515,16 @@ class ComputeManager(manager.Manager):
self.host, action=fields.NotificationAction.RESIZE_REVERT,
phase=fields.NotificationPhase.START, bdms=bdms)
- # NOTE(mriedem): delete stashed old_vm_state information; we
- # default to ACTIVE for backwards compatibility if old_vm_state
- # is not set
- old_vm_state = instance.system_metadata.pop('old_vm_state',
- vm_states.ACTIVE)
+ # Get stashed old_vm_state information to determine if guest should
+ # be powered on after spawn; we default to ACTIVE for backwards
+ # compatibility if old_vm_state is not set
+ old_vm_state = instance.system_metadata.get(
+ 'old_vm_state', vm_states.ACTIVE)
self._set_instance_info(instance, instance.old_flavor)
- instance.old_flavor = None
- instance.new_flavor = None
instance.host = migration.source_compute
instance.node = migration.source_node
- instance.save()
+ instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
try:
source_allocations = self._revert_allocation(
diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index d73bfc4503..97b81e82c4 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -679,8 +679,7 @@ class NUMAServersTest(NUMAServersTestBase):
self.ctxt, dst_host,
).numa_topology,
)
- # FIXME(stephenfin): There should still be two pinned cores here
- self.assertEqual(0, len(src_numa_topology.cells[0].pinned_cpus))
+ self.assertEqual(2, len(src_numa_topology.cells[0].pinned_cpus))
self.assertEqual(2, len(dst_numa_topology.cells[0].pinned_cpus))
# before continuing with the actual confirm process
@@ -724,14 +723,10 @@ class NUMAServersTest(NUMAServersTestBase):
# Now confirm the resize
- # FIXME(stephenfin): This should be successful, but it's failing with a
- # HTTP 500 due to bug #1879878
post = {'confirmResize': None}
- exc = self.assertRaises(
- client.OpenStackApiException,
- self.api.post_server_action, server['id'], post)
- self.assertEqual(500, exc.response.status_code)
- self.assertIn('CPUUnpinningInvalid', str(exc))
+ self.api.post_server_action(server['id'], post)
+
+ server = self._wait_for_state_change(server, 'ACTIVE')
class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 67dc20f6bd..c3548e054c 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -8237,9 +8237,14 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.migration)
mock_delete.assert_called_once_with(self.context, self.instance,
self.migration)
- mock_save.assert_called_with(expected_task_state=
- [None, task_states.DELETING,
- task_states.SOFT_DELETING])
+ mock_save.assert_has_calls([
+ mock.call(
+ expected_task_state=[
+ None, task_states.DELETING, task_states.SOFT_DELETING,
+ ],
+ ),
+ mock.call(),
+ ])
mock_delete_scheduler_info.assert_called_once_with(
self.context, self.instance.uuid)