From a22d1b04de9e6ebc33b5ab9871b86f8e4022e7a9 Mon Sep 17 00:00:00 2001
From: Rodrigo Barbieri
Date: Wed, 31 Mar 2021 11:06:49 -0300
Subject: Error anti-affinity violation on migrations

Error-out the migrations (cold and live) whenever the anti-affinity
policy is violated. This addresses violations when multiple concurrent
migrations are requested.

Added detection on:
- prep_resize
- check_can_live_migrate_destination
- pre_live_migration

The improved method of detection now locks based on group_id and
considers other migrations in-progress as well.

Closes-bug: #1821755
Change-Id: I32e6214568bb57f7613ddeba2c2c46da0320fabc
(cherry picked from commit 33c8af1f8c46c9c37fcc28fb3409fbd3a78ae39f)
(cherry picked from commit 8b62a4ec9bf617dfb2da046c25a9f76b33516508)
(cherry picked from commit 6ede6df7f41db809de19e124d3d4994180598f19)
(cherry picked from commit bf90a1e06181f6b328b967124e538c6e2579b2e5)
---
 nova/compute/manager.py                     | 110 +++++++++++++++++---
 nova/tests/unit/compute/test_compute_mgr.py | 114 +++++++++++++++++++--
 .../notes/bug-1821755-7bd03319e34b6b10.yaml |  11 ++
 3 files changed, 211 insertions(+), 24 deletions(-)
 create mode 100644 releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index e5f6ace00f..dbf8308515 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1587,7 +1587,11 @@ class ComputeManager(manager.Manager):
         return [_decode(f) for f in injected_files]

     def _validate_instance_group_policy(self, context, instance,
-                                        scheduler_hints):
+                                        scheduler_hints=None):
+
+        if CONF.workarounds.disable_group_policy_check_upcall:
+            return
+
         # NOTE(russellb) Instance group policy is enforced by the scheduler.
         # However, there is a race condition with the enforcement of
         # the policy. Since more than one instance may be scheduled at the
@@ -1596,29 +1600,63 @@ class ComputeManager(manager.Manager):
         # multiple instances with an affinity policy could end up on different
         # hosts. This is a validation step to make sure that starting the
         # instance here doesn't violate the policy.
-        group_hint = scheduler_hints.get('group')
-        if not group_hint:
-            return
-
-        # The RequestSpec stores scheduler_hints as key=list pairs so we need
-        # to check the type on the value and pull the single entry out. The
-        # API request schema validates that the 'group' hint is a single value.
-        if isinstance(group_hint, list):
-            group_hint = group_hint[0]
+        if scheduler_hints is not None:
+            # only go through here if scheduler_hints is provided, even if it
+            # is empty.
+            group_hint = scheduler_hints.get('group')
+            if not group_hint:
+                return
+            else:
+                # The RequestSpec stores scheduler_hints as key=list pairs so
+                # we need to check the type on the value and pull the single
+                # entry out. The API request schema validates that
+                # the 'group' hint is a single value.
+                if isinstance(group_hint, list):
+                    group_hint = group_hint[0]
+
+                group = objects.InstanceGroup.get_by_hint(context, group_hint)
+        else:
+            # TODO(ganso): a call to DB can be saved by adding request_spec
+            # to rpcapi payload of live_migration, pre_live_migration and
+            # check_can_live_migrate_destination
+            try:
+                group = objects.InstanceGroup.get_by_instance_uuid(
+                    context, instance.uuid)
+            except exception.InstanceGroupNotFound:
+                return

-        @utils.synchronized(group_hint)
-        def _do_validation(context, instance, group_hint):
-            group = objects.InstanceGroup.get_by_hint(context, group_hint)
+        @utils.synchronized(group['uuid'])
+        def _do_validation(context, instance, group):
             if group.policy and 'anti-affinity' == group.policy:
+
+                # instances on host
                 instances_uuids = objects.InstanceList.get_uuids_by_host(
                     context, self.host)
                 ins_on_host = set(instances_uuids)
+
+                # instance param is just for logging, the nodename obtained is
+                # not actually related to the instance at all
+                nodename = self._get_nodename(instance)
+
+                # instances being migrated to host
+                migrations = (
+                    objects.MigrationList.get_in_progress_by_host_and_node(
+                        context, self.host, nodename))
+                migration_vm_uuids = set([mig['instance_uuid']
+                                          for mig in migrations])
+
+                total_instances = migration_vm_uuids | ins_on_host
+
+                # refresh group to get updated members within locked block
+                group = objects.InstanceGroup.get_by_uuid(context,
+                                                          group['uuid'])
                 members = set(group.members)
                 # Determine the set of instance group members on this host
                 # which are not the instance in question. This is used to
                 # determine how many other members from the same anti-affinity
                 # group can be on this host.
-                members_on_host = ins_on_host & members - set([instance.uuid])
+                members_on_host = (total_instances & members -
+                                   set([instance.uuid]))
                 rules = group.rules
                 if rules and 'max_server_per_host' in rules:
                     max_server = rules['max_server_per_host']
@@ -1630,6 +1668,12 @@ class ComputeManager(manager.Manager):
                         raise exception.RescheduledException(
                             instance_uuid=instance.uuid,
                             reason=msg)
+
+            # NOTE(ganso): The check for affinity below does not work and it
+            # can easily be violated because the lock happens in different
+            # compute hosts.
+            # The only fix seems to be a DB lock to perform the check whenever
+            # setting the host field to an instance.
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
@@ -1638,8 +1682,7 @@ class ComputeManager(manager.Manager):
                         instance_uuid=instance.uuid,
                         reason=msg)

-        if not CONF.workarounds.disable_group_policy_check_upcall:
-            _do_validation(context, instance, group_hint)
+        _do_validation(context, instance, group)

     def _log_original_error(self, exc_info, instance_uuid):
         LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
@@ -4773,10 +4816,24 @@ class ComputeManager(manager.Manager):
         with self._error_out_instance_on_exception(
                 context, instance, instance_state=instance_state),\
                 errors_out_migration_ctxt(migration):
+
             self._send_prep_resize_notifications(
                 context, instance, fields.NotificationPhase.START,
                 instance_type)
             try:
+                scheduler_hints = self._get_scheduler_hints(filter_properties,
+                                                            request_spec)
+                # Error out if this host cannot accept the new instance due
+                # to anti-affinity. At this point the migration is already
+                # in-progress, so this is the definitive moment to abort due to
+                # the policy violation. Also, exploding here is covered by the
+                # cleanup methods in except block.
+                try:
+                    self._validate_instance_group_policy(context, instance,
+                                                         scheduler_hints)
+                except exception.RescheduledException as e:
+                    raise exception.InstanceFaultRollback(inner_exception=e)
+
                 self._prep_resize(context, image, instance,
                                   instance_type, filter_properties,
                                   node, migration, request_spec,
@@ -6811,6 +6868,20 @@ class ComputeManager(manager.Manager):
         :param limits: objects.SchedulerLimits object for this live migration.
         :returns: a LiveMigrateData object (hypervisor-dependent)
         """
+
+        # Error out if this host cannot accept the new instance due
+        # to anti-affinity. This check at this moment is not very accurate, as
+        # multiple requests may be happening concurrently and miss the lock,
+        # but when it works it provides a better user experience by failing
+        # earlier. Also, it should be safe to explode here, error becomes
+        # NoValidHost and instance status remains ACTIVE.
+        try:
+            self._validate_instance_group_policy(ctxt, instance)
+        except exception.RescheduledException as e:
+            msg = ("Failed to validate instance group policy "
+                   "due to: {}".format(e))
+            raise exception.MigrationPreCheckError(reason=msg)
+
         src_compute_info = obj_base.obj_to_primitive(
             self._get_compute_info(ctxt, instance.host))
         dst_compute_info = obj_base.obj_to_primitive(
@@ -6949,6 +7020,13 @@ class ComputeManager(manager.Manager):
         """
         LOG.debug('pre_live_migration data is %s', migrate_data)

+        # Error out if this host cannot accept the new instance due
+        # to anti-affinity. At this point the migration is already in-progress,
+        # so this is the definitive moment to abort due to the policy
+        # violation. Also, it should be safe to explode here. The instance
+        # status remains ACTIVE, migration status failed.
+        self._validate_instance_group_policy(context, instance)
+
         migrate_data.old_vol_attachment_ids = {}
         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
             context, instance.uuid)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 8981b6f7ba..93acfee330 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -3198,12 +3198,16 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             CONF.host, instance.uuid)
         return result

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_check_can_live_migrate_destination_success(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: True))
         self._test_check_can_live_migrate_destination()

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_check_can_live_migrate_destination_fail(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: True))
@@ -3213,7 +3217,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             self._test_check_can_live_migrate_destination,
             do_raise=True)

-    def test_check_can_live_migrate_destination_contins_vifs(self):
+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
+    def test_check_can_live_migrate_destination_contains_vifs(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: True))
         migrate_data = self._test_check_can_live_migrate_destination()
         self.assertIn('vifs', migrate_data)
         self.assertIsNotNone(migrate_data.vifs)

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_check_can_live_migrate_destination_no_binding_extended(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: False))
         migrate_data = self._test_check_can_live_migrate_destination()
         self.assertNotIn('vifs', migrate_data)

@@ -3228,18 +3236,40 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_check_can_live_migrate_destination_src_numa_lm_false(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: True))
         self._test_check_can_live_migrate_destination(src_numa_lm=False)

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_check_can_live_migrate_destination_src_numa_lm_true(self):
         self.useFixture(std_fixtures.MonkeyPatch(
             'nova.network.neutronv2.api.API.supports_port_binding_extension',
             lambda *args: True))
         self._test_check_can_live_migrate_destination(src_numa_lm=True)

+    @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+    def test_check_can_live_migrate_destination_fail_group_policy(
+            self, mock_fail_db):
+
+        instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, vm_state=vm_states.ACTIVE,
+            node='fake-node')
+
+        ex = exception.RescheduledException(
+            instance_uuid=instance.uuid, reason="policy violated")
+
+        with mock.patch.object(self.compute, '_validate_instance_group_policy',
+                               side_effect=ex):
+            self.assertRaises(
+                exception.MigrationPreCheckError,
+                self.compute.check_can_live_migrate_destination,
+                self.context, instance, None, None, None, None)
+
     def test_dest_can_numa_live_migrate(self):
         positive_dest_check_data = objects.LibvirtLiveMigrateData(
             dst_supports_numa_live_migration=True)
@@ -6979,7 +7009,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_validate_policy_honors_workaround_disabled(self, mock_get):
         instance = objects.Instance(uuid=uuids.instance)
         hints = {'group': 'foo'}
-        mock_get.return_value = objects.InstanceGroup(policy=None)
+        mock_get.return_value = objects.InstanceGroup(policy=None,
+                                                      uuid=uuids.group)
         self.compute._validate_instance_group_policy(self.context,
                                                      instance, hints)
         mock_get.assert_called_once_with(self.context, 'foo')
@@ -7005,10 +7036,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                                                      instance, hints)
         mock_get.assert_called_once_with(self.context, uuids.group_hint)

+    @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
     @mock.patch('nova.objects.InstanceList.get_uuids_by_host')
     @mock.patch('nova.objects.InstanceGroup.get_by_hint')
-    def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint,
-                                                       mock_get_by_host):
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node')
+    def test_validate_instance_group_policy_with_rules(
+            self, migration_list, nodes, mock_get_by_hint, mock_get_by_host,
+            mock_get_by_uuid):
         # Create 2 instance in same host, inst2 created before inst1
         instance = objects.Instance(uuid=uuids.inst1)
         hints = {'group': [uuids.group_hint]}
@@ -7017,17 +7052,26 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         mock_get_by_host.return_value = existing_insts

         # if group policy rules limit to 1, raise RescheduledException
-        mock_get_by_hint.return_value = objects.InstanceGroup(
+        group = objects.InstanceGroup(
             policy='anti-affinity', rules={'max_server_per_host': '1'},
-            hosts=['host1'], members=members_uuids)
+            hosts=['host1'], members=members_uuids,
+            uuid=uuids.group)
+        mock_get_by_hint.return_value = group
+        mock_get_by_uuid.return_value = group
+        nodes.return_value = ['nodename']
+        migration_list.return_value = [objects.Migration(
+            uuid=uuids.migration, instance_uuid=uuids.instance)]
         self.assertRaises(exception.RescheduledException,
                           self.compute._validate_instance_group_policy,
                           self.context, instance, hints)

         # if group policy rules limit change to 2, validate OK
-        mock_get_by_hint.return_value = objects.InstanceGroup(
+        group2 = objects.InstanceGroup(
             policy='anti-affinity', rules={'max_server_per_host': 2},
-            hosts=['host1'], members=members_uuids)
+            hosts=['host1'], members=members_uuids,
+            uuid=uuids.group)
+        mock_get_by_hint.return_value = group2
+        mock_get_by_uuid.return_value = group2
         self.compute._validate_instance_group_policy(self.context,
                                                      instance, hints)
@@ -8554,6 +8598,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         manager.ComputeManager()
         mock_executor.assert_called_once_with()

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_pre_live_migration_cinder_v3_api(self):
         # This tests that pre_live_migration with a bdm with an
         # attachment_id, will create a new attachment and update
@@ -8631,6 +8677,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         _test()

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_pre_live_migration_exception_cinder_v3_api(self):
         # The instance in this test has 2 attachments. The second attach_create
         # will throw an exception. This will test that the first attachment
@@ -8700,6 +8748,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
             self.assertGreater(len(m.mock_calls), 0)
         _test()

+    @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
+        side_effect=exception.InstanceGroupNotFound(group_uuid='')))
     def test_pre_live_migration_exceptions_delete_attachments(self):
         # The instance in this test has 2 attachments. The call to
         # driver.pre_live_migration will raise an exception. This will test
@@ -10036,6 +10086,54 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         # (_error_out_instance_on_exception will set to ACTIVE by default).
         self.assertEqual(vm_states.STOPPED, instance.vm_state)

+    @mock.patch('nova.compute.utils.notify_usage_exists')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_notify_about_instance_usage')
+    @mock.patch('nova.compute.utils.notify_about_resize_prep_instance')
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch('nova.compute.manager.ComputeManager._revert_allocation')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_reschedule_resize_or_reraise')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    # this is almost copy-paste from test_prep_resize_fails_rollback
+    def test_prep_resize_fails_group_validation(
+            self, add_instance_fault_from_exc, _reschedule_resize_or_reraise,
+            _revert_allocation, mock_instance_save,
+            notify_about_resize_prep_instance, _notify_about_instance_usage,
+            notify_usage_exists):
+        """Tests that if _validate_instance_group_policy raises
+        InstanceFaultRollback, the instance.vm_state is reset properly in
+        _error_out_instance_on_exception
+        """
+        instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, vm_state=vm_states.STOPPED,
+            node='fake-node', expected_attrs=['system_metadata', 'flavor'])
+        migration = mock.MagicMock(spec='nova.objects.Migration')
+        request_spec = mock.MagicMock(spec='nova.objects.RequestSpec')
+        ex = exception.RescheduledException(
+            instance_uuid=instance.uuid, reason="policy violated")
+        ex2 = exception.InstanceFaultRollback(
+            inner_exception=ex)
+
+        def fake_reschedule_resize_or_reraise(*args, **kwargs):
+            raise ex2
+
+        _reschedule_resize_or_reraise.side_effect = (
+            fake_reschedule_resize_or_reraise)
+
+        with mock.patch.object(self.compute, '_validate_instance_group_policy',
+                               side_effect=ex):
+            self.assertRaises(
+                # _error_out_instance_on_exception should reraise the
+                # RescheduledException inside InstanceFaultRollback.
+                exception.RescheduledException, self.compute.prep_resize,
+                self.context, instance.image_meta, instance, instance.flavor,
+                request_spec, filter_properties={}, node=instance.node,
+                clean_shutdown=True, migration=migration, host_list=[])
+        # The instance.vm_state should remain unchanged
+        # (_error_out_instance_on_exception will set to ACTIVE by default).
+        self.assertEqual(vm_states.STOPPED, instance.vm_state)
+
     @mock.patch('nova.compute.rpcapi.ComputeAPI.resize_instance')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.resize_claim')
     @mock.patch('nova.objects.Instance.save')
diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
new file mode 100644
index 0000000000..4c6135311b
--- /dev/null
+++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Improved detection of anti-affinity policy violation when performing live
+    and cold migrations. Most of the violations caused by race conditions due
+    to performing concurrent live or cold migrations should now be addressed
+    by extra checks in the compute service. Upon detection, cold migration
+    operations are automatically rescheduled, while live migrations have two
+    checks and will be rescheduled if detected by the first one; otherwise the
+    live migration will fail cleanly and revert the instance state back to its
+    previous value.
-- 
cgit v1.2.1
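For readers skimming the diff, the core of the detection the commit message describes is: serialize the check per server-group UUID, then count both the group members already on the destination host and those whose in-progress migrations target it. Below is a minimal standalone Python sketch of that strategy, not Nova code; the get_* callables, the PolicyViolation class and the max_server_per_host default are illustrative placeholders standing in for the DB lookups and exceptions the real service uses.

import threading
from collections import defaultdict

# One lock per anti-affinity group, so concurrent checks for members of the
# same group are serialized and cannot both pass against stale data.
_group_locks = defaultdict(threading.Lock)


class PolicyViolation(Exception):
    """Raised when the anti-affinity limit on a host would be exceeded."""


def validate_anti_affinity(group_uuid, instance_uuid, host,
                           get_group_members, get_instances_on_host,
                           get_incoming_migration_instances,
                           max_server_per_host=1):
    with _group_locks[group_uuid]:
        # Instances already running on the destination host plus instances
        # whose in-progress migrations target it both count toward the limit.
        on_host = set(get_instances_on_host(host))
        incoming = set(get_incoming_migration_instances(host))
        members = set(get_group_members(group_uuid))

        # Group members on (or headed to) the host, excluding the instance
        # currently being validated.
        members_on_host = ((on_host | incoming) & members) - {instance_uuid}
        if len(members_on_host) >= max_server_per_host:
            raise PolicyViolation(
                "anti-affinity group %s already has %d member(s) on %s"
                % (group_uuid, len(members_on_host), host))

In the patch itself this role is played by _validate_instance_group_policy(), which additionally re-reads the group members inside the @utils.synchronized(group['uuid']) block and queries MigrationList.get_in_progress_by_host_and_node() for the incoming migrations.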