summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBalazs Gibizer <balazs.gibizer@est.tech>2021-12-06 17:06:51 +0100
committerBalazs Gibizer <gibi@redhat.com>2022-04-25 17:42:58 +0200
commit1d0b7051da430ed00ae49901a32ec6af46c1a64e (patch)
tree86c30b5eb342d182db50ecdd67d7f3a4bda74e73
parent9b8e5cec303a621824366e1794665d6b849fefad (diff)
downloadnova-1d0b7051da430ed00ae49901a32ec6af46c1a64e.tar.gz
[rt] Apply migration context for incoming migrations
There is a race condition between an incoming resize and an update_available_resource periodic in the resource tracker. The race window starts when the resize_instance RPC finishes and ends when the finish_resize compute RPC finally applies the migration context on the instance. In the race window, if the update_available_resource periodic is run on the destination node, then it will see the instance as being tracked on this host as the instance.node is already pointing to the dest. But the instance.numa_topology still points to the source host topology as the migration context is not applied yet. This leads to CPU pinning error if the source topology does not fit to the dest topology. Also it stops the periodic task and leaves the tracker in an inconsistent state. The inconsistent state is only cleaned up after the periodic is run outside of the race window. This patch applies the migration context temporarily to the specific instances during the periodic to keep resource accounting correct. Conflicts: on resource_tracker: changed 'MigrationList.get_in_progress_and_error' call back to 'MigrationList.get_in_progress_by_host_and_node', since this change was only added by 255b3f2f918843ca5dd9b99e109ecd2189b6b749, and is not present in stable/ussuri. Change-Id: Icaad155e22c9e2d86e464a0deb741c73f0dfb28a Closes-Bug: #1953359 Closes-Bug: #1952915 (cherry picked from commit 32c1044d86a8d02712c8e3abdf8b3e4cff234a9c) (cherry picked from commit 1235dc324ebc1c6ac6dc94da0f45ffffcc546d2c) (cherry picked from commit 5f2f283a75243d2e2629d3c5f7e5ef4b3994972d) (cherry picked from commit d54bd316b331d439a26a7318ca68cab5f6280ab2)
-rw-r--r--nova/compute/resource_tracker.py35
-rw-r--r--nova/tests/functional/libvirt/test_numa_servers.py29
2 files changed, 40 insertions, 24 deletions
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index a7658f218e..c12ada21e7 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -919,14 +919,41 @@ class ResourceTracker(object):
'flavor', 'migration_context',
'resources'])
- # Now calculate usage based on instance utilization:
- instance_by_uuid = self._update_usage_from_instances(
- context, instances, nodename)
-
# Grab all in-progress migrations:
migrations = objects.MigrationList.get_in_progress_by_host_and_node(
context, self.host, nodename)
+ # Check for tracked instances with in-progress, incoming, but not
+ # finished migrations. For those instances the migration context
+ # is not applied yet (it will be during finish_resize when the
+ # migration goes to finished state). We need to manually and
+ # temporarily apply the migration context here when the resource usage is
+ # updated. See bug 1953359 for more details.
+ instance_by_uuid = {instance.uuid: instance for instance in instances}
+ for migration in migrations:
+ if (
+ migration.instance_uuid in instance_by_uuid and
+ migration.dest_compute == self.host and
+ migration.dest_node == nodename
+ ):
+ # we do not check for the 'post-migrating' migration status
+ # as applying the migration context for an instance already
+ # in finished migration status is a no-op anyhow.
+ instance = instance_by_uuid[migration.instance_uuid]
+ LOG.debug(
+ 'Applying migration context for instance %s as it has an '
+ 'incoming, in-progress migration %s. '
+ 'Migration status is %s',
+ migration.instance_uuid, migration.uuid, migration.status
+ )
+ # It is OK not to revert the migration context at the end of
+ # the periodic as the instance is not saved during the periodic
+ instance.apply_migration_context()
+
+ # Now calculate usage based on instance utilization:
+ instance_by_uuid = self._update_usage_from_instances(
+ context, instances, nodename)
+
self._pair_instances_to_migrations(migrations, instance_by_uuid)
self._update_usage_from_migrations(context, migrations, nodename)
diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index 773e6f56d3..5423e89aff 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -825,11 +825,11 @@ class NUMAServersTest(NUMAServersTestBase):
'vCPUs mapping: [(0, 1)]',
log,
)
- # But the periodic fails as it tries to apply the source topology
- # on the dest. This is bug 1953359.
+ # We expect that the periodic does not fail as bug 1953359 is fixed.
log = self.stdlog.logger.output
- self.assertIn('Error updating resources for node compute2', log)
- self.assertIn(
+ self.assertIn('Running periodic for compute (compute2)', log)
+ self.assertNotIn('Error updating resources for node compute2', log)
+ self.assertNotIn(
'nova.exception.CPUPinningInvalid: CPU set to pin [0] must be '
'a subset of free CPU set [1]',
log,
@@ -847,27 +847,16 @@ class NUMAServersTest(NUMAServersTestBase):
new=fake_finish_resize,
):
post = {'migrate': None}
- # this is expected to succeed but logs are emitted
- # from the racing periodic task. See fake_finish_resize
- # for the asserts
+ # this is expected to succeed
self.admin_api.post_server_action(server['id'], post)
server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
- # as the periodic job raced and failed during the resize if we revert
- # the instance now then it tries to unpin its cpus from the dest host
- # but those was never pinned as the periodic failed. So the unpinning
- # will fail too.
+ # As bug 1953359 is fixed, the revert should succeed too
post = {'revertResize': {}}
- ex = self.assertRaises(
- client.OpenStackApiException,
- self.admin_api.post_server_action, server['id'], post
- )
- # This is still bug 1953359.
- self.assertEqual(500, ex.response.status_code)
- server = self.api.get_server(server['id'])
- self.assertEqual('ERROR', server['status'])
- self.assertIn(
+ self.admin_api.post_server_action(server['id'], post)
+ self._wait_for_state_change(server, 'ACTIVE')
+ self.assertNotIn(
'nova.exception.CPUUnpinningInvalid: CPU set to unpin [1] must be '
'a subset of pinned CPU set [0]',
self.stdlog.logger.output,