Diffstat (limited to 'nova/tests/functional/regressions')
11 files changed, 389 insertions, 45 deletions
diff --git a/nova/tests/functional/regressions/test_bug_1595962.py b/nova/tests/functional/regressions/test_bug_1595962.py
index bf1c7e53be..8eef0d3b7a 100644
--- a/nova/tests/functional/regressions/test_bug_1595962.py
+++ b/nova/tests/functional/regressions/test_bug_1595962.py
@@ -57,7 +57,6 @@ class TestSerialConsoleLiveMigrate(test.TestCase):
         self.flags(enabled=True, group="serial_console")
         self.flags(enabled=False, group="vnc")
         self.flags(enabled=False, group="spice")
-        self.flags(use_usb_tablet=False, group="libvirt")
 
         self.start_service('conductor')
         self.start_service('scheduler')
diff --git a/nova/tests/functional/regressions/test_bug_1669054.py b/nova/tests/functional/regressions/test_bug_1669054.py
index 751466fd41..6180dbfbaa 100644
--- a/nova/tests/functional/regressions/test_bug_1669054.py
+++ b/nova/tests/functional/regressions/test_bug_1669054.py
@@ -57,14 +57,9 @@ class ResizeEvacuateTestCase(integrated_helpers._IntegratedTestBase):
         host2.stop()
         self.api.force_down_service('host2', 'nova-compute', forced_down=True)
         # Now try to evacuate the server back to the original source compute.
-        req = {'evacuate': {'onSharedStorage': False}}
-        self.api.post_server_action(server['id'], req)
-        server = self._wait_for_state_change(server, 'ACTIVE')
-        # The evacuate flow in the compute manager is annoying in that it
-        # sets the instance status to ACTIVE before updating the host, so we
-        # have to wait for the migration record to be 'done' to avoid a race.
-        self._wait_for_migration_status(server, ['done'])
-        self.assertEqual(self.compute.host, server['OS-EXT-SRV-ATTR:host'])
+        server = self._evacuate_server(
+            server, {'onSharedStorage': 'False'},
+            expected_host=self.compute.host, expected_migration_status='done')
 
         # Assert the RequestSpec.ignore_hosts field is not populated.
         reqspec = objects.RequestSpec.get_by_instance_uuid(
diff --git a/nova/tests/functional/regressions/test_bug_1702454.py b/nova/tests/functional/regressions/test_bug_1702454.py
index 097f6cedae..808665b24c 100644
--- a/nova/tests/functional/regressions/test_bug_1702454.py
+++ b/nova/tests/functional/regressions/test_bug_1702454.py
@@ -110,15 +110,14 @@ class SchedulerOnlyChecksTargetTest(test.TestCase,
         # only possibility the instance can end up on it is because the
         # scheduler should only verify the requested destination as host2
         # is weighed lower than host3.
-        evacuate = {
-            'evacuate': {
-                'host': 'host2'
-            }
+        target_host = 'host2'
+
+        post_args = {
+            'host': target_host
         }
-        self.admin_api.post_server_action(server['id'], evacuate)
-        self._wait_for_state_change(server, 'ACTIVE')
-        server = self.admin_api.get_server(server_id)
+        server = self._evacuate_server(
+            server, extra_post_args=post_args, expected_host=target_host)
 
         # Yeepee, that works!
-        self.assertEqual('host2', server['OS-EXT-SRV-ATTR:host'])
+        self.assertEqual(target_host, server['OS-EXT-SRV-ATTR:host'])
diff --git a/nova/tests/functional/regressions/test_bug_1713783.py b/nova/tests/functional/regressions/test_bug_1713783.py
index 521d447079..86e9ae919c 100644
--- a/nova/tests/functional/regressions/test_bug_1713783.py
+++ b/nova/tests/functional/regressions/test_bug_1713783.py
@@ -96,14 +96,11 @@ class FailedEvacuateStateTests(test.TestCase,
         fake_notifier.reset()
 
         # Initiate evacuation
-        post = {'evacuate': {}}
-        self.api.post_server_action(server['id'], post)
-
+        self._evacuate_server(
+            server, expected_state='ERROR', expected_host=self.hostname,
+            expected_migration_status='error')
 
         self._wait_for_notification_event_type('compute_task.rebuild_server')
 
-        server = self._wait_for_state_change(server, 'ERROR')
-        self.assertEqual(self.hostname, server['OS-EXT-SRV-ATTR:host'])
-
         # Check migrations
         migrations = self.api.get_migrations()
         self.assertEqual(1, len(migrations))
diff --git a/nova/tests/functional/regressions/test_bug_1764883.py b/nova/tests/functional/regressions/test_bug_1764883.py
index d8d97276e6..431af81d86 100644
--- a/nova/tests/functional/regressions/test_bug_1764883.py
+++ b/nova/tests/functional/regressions/test_bug_1764883.py
@@ -97,12 +97,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
         self.computes.get(self.source_compute).stop()
         self.api.force_down_service(self.source_compute, 'nova-compute', True)
 
-        # Start evacuating the instance from the source_host
-        self.api.post_server_action(server['id'], {'evacuate': {}})
-
-        # Wait for the instance to go into an ACTIVE state
-        self._wait_for_state_change(server, 'ACTIVE')
-        server = self.api.get_server(server['id'])
+        # Evacuate the instance from the source_host
+        server = self._evacuate_server(
+            server, expected_migration_status='done')
 
         host = server['OS-EXT-SRV-ATTR:host']
         migrations = self.api.get_migrations()
diff --git a/nova/tests/functional/regressions/test_bug_1794996.py b/nova/tests/functional/regressions/test_bug_1794996.py
index ee0756e603..15ed5e0647 100644
--- a/nova/tests/functional/regressions/test_bug_1794996.py
+++ b/nova/tests/functional/regressions/test_bug_1794996.py
@@ -52,12 +52,7 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
             source_compute_id, {'forced_down': 'true'})
 
         # evacuate the server
-        post = {'evacuate': {}}
-        self.api.post_server_action(
-            server['id'], post)
-        expected_params = {'OS-EXT-SRV-ATTR:host': dest_hostname,
-                           'status': 'ACTIVE'}
-        server = self._wait_for_server_parameter(server, expected_params)
+        server = self._evacuate_server(server, expected_host=dest_hostname)
 
         # Expect to have allocation and usages on both computes as the
         # source compute is still down
diff --git a/nova/tests/functional/regressions/test_bug_1815153.py b/nova/tests/functional/regressions/test_bug_1815153.py
index cadd20c8d8..5860187e71 100644
--- a/nova/tests/functional/regressions/test_bug_1815153.py
+++ b/nova/tests/functional/regressions/test_bug_1815153.py
@@ -142,11 +142,9 @@ class NonPersistentFieldNotResetTest(
         # Its status becomes 'ACTIVE'.
         # If requested_destination is not reset, a status of the server
         # becomes 'ERROR' because the target host is down.
-        self.api.post_server_action(
-            server['id'], {'evacuate': {'host': target_host}})
-        expected_params = {'OS-EXT-SRV-ATTR:host': original_host,
-                           'status': 'ERROR'}
-        server = self._wait_for_server_parameter(server, expected_params)
+        server = self._evacuate_server(
+            server, {'host': target_host}, expected_host=original_host,
+            expected_state='ERROR', expected_migration_status='error')
 
         # Make sure 'is_bfv' is set.
         reqspec = objects.RequestSpec.get_by_instance_uuid(self.ctxt,
diff --git a/nova/tests/functional/regressions/test_bug_1823370.py b/nova/tests/functional/regressions/test_bug_1823370.py
index 30aa88a183..5e69905f5f 100644
--- a/nova/tests/functional/regressions/test_bug_1823370.py
+++ b/nova/tests/functional/regressions/test_bug_1823370.py
@@ -64,8 +64,6 @@ class MultiCellEvacuateTestCase(integrated_helpers._IntegratedTestBase):
         # Now evacuate the server which should send it to host3 since it is
         # in the same cell as host1, even though host2 in cell2 is weighed
         # higher than host3.
-        req = {'evacuate': {'onSharedStorage': False}}
-        self.api.post_server_action(server['id'], req)
-        self._wait_for_migration_status(server, ['done'])
-        server = self._wait_for_state_change(server, 'ACTIVE')
-        self.assertEqual('host3', server['OS-EXT-SRV-ATTR:host'])
+        self._evacuate_server(
+            server, {'onSharedStorage': 'False'}, expected_host='host3',
+            expected_migration_status='done')
diff --git a/nova/tests/functional/regressions/test_bug_1896463.py b/nova/tests/functional/regressions/test_bug_1896463.py
new file mode 100644
index 0000000000..dc74791e0e
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1896463.py
@@ -0,0 +1,222 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import fixtures
+import time
+
+from oslo_config import cfg
+
+from nova import context
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+from nova import utils
+from nova.virt import fake
+
+
+CONF = cfg.CONF
+
+
+class TestEvacuateResourceTrackerRace(
+    test.TestCase, integrated_helpers.InstanceHelperMixin,
+):
+    """Demonstrate bug #1896463.
+
+    Trigger a race condition between an almost-finished evacuation that is
+    dropping the migration context, and the _update_available_resource()
+    periodic task that has already loaded the instance list but has not yet
+    loaded the migration list. The result is that the PCI allocation made
+    by the evacuation is deleted by the overlapping periodic task run and
+    the instance is left without a PCI allocation after the evacuation.
+ """ + + def setUp(self): + super().setUp() + self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self)) + self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.placement = self.useFixture(func_fixtures.PlacementFixture()).api + + self.api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + + self.admin_api = self.api_fixture.admin_api + self.admin_api.microversion = 'latest' + self.api = self.admin_api + + self.start_service('conductor') + self.start_service('scheduler') + + self.flags(compute_driver='fake.FakeDriverWithPciResources') + self.useFixture( + fake.FakeDriverWithPciResources. + FakeDriverWithPciResourcesConfigFixture()) + + self.compute1 = self._start_compute('host1') + self.compute1_id = self._get_compute_node_id_by_host('host1') + self.compute1_service_id = self.admin_api.get_services( + host='host1', binary='nova-compute')[0]['id'] + + self.compute2 = self._start_compute('host2') + self.compute2_id = self._get_compute_node_id_by_host('host2') + self.compute2_service_id = self.admin_api.get_services( + host='host2', binary='nova-compute')[0]['id'] + + # add extra ports and the related network to the neutron fixture + # specifically for these tests. It cannot be added globally in the + # fixture init as it adds a second network that makes auto allocation + # based test to fail due to ambiguous networks. + self.neutron._ports[self.neutron.sriov_port['id']] = \ + copy.deepcopy(self.neutron.sriov_port) + self.neutron._networks[ + self.neutron.network_2['id']] = self.neutron.network_2 + self.neutron._subnets[ + self.neutron.subnet_2['id']] = self.neutron.subnet_2 + + self.ctxt = context.get_admin_context() + + def _get_compute_node_id_by_host(self, host): + # we specifically need the integer id of the node not the UUID so we + # need to use the old microversion + with utils.temporary_mutation(self.admin_api, microversion='2.52'): + hypers = self.admin_api.api_get( + 'os-hypervisors').body['hypervisors'] + for hyper in hypers: + if hyper['hypervisor_hostname'] == host: + return hyper['id'] + + self.fail('Hypervisor with hostname=%s not found' % host) + + def _assert_pci_device_allocated( + self, instance_uuid, compute_node_id, num=1): + """Assert that a given number of PCI devices are allocated to the + instance on the given host. + """ + + devices = objects.PciDeviceList.get_by_instance_uuid( + self.ctxt, instance_uuid) + devices_on_host = [dev for dev in devices + if dev.compute_node_id == compute_node_id] + self.assertEqual(num, len(devices_on_host)) + + def test_evacuate_races_with_update_available_resource(self): + # Create a server with a direct port to have PCI allocation + server = self._create_server( + name='test-server-for-bug-1896463', + networks=[{'port': self.neutron.sriov_port['id']}], + host='host1' + ) + + self._assert_pci_device_allocated(server['id'], self.compute1_id) + self._assert_pci_device_allocated( + server['id'], self.compute2_id, num=0) + + # stop and force down the compute the instance is on to allow + # evacuation + self.compute1.stop() + self.admin_api.put_service( + self.compute1_service_id, {'forced_down': 'true'}) + + # Inject some sleeps both in the Instance.drop_migration_context and + # the MigrationList.get_in_progress_and_error code to make them + # overlap. + # We want to create the following execution scenario: + # 1) The evacuation makes a move claim on the dest including the PCI + # claim. This means there is a migration context. 
+        #    But the evacuation is not complete yet, so instance.host does
+        #    not yet point to the dest host.
+        # 2) The dest resource tracker starts an
+        #    _update_available_resource() periodic task and this task loads
+        #    the list of instances on its host from the DB. Our instance is
+        #    not in this list due to 1).
+        # 3) The evacuation finishes: instance.host is set to the dest host
+        #    and the migration context is deleted.
+        # 4) The periodic task now loads the list of in-progress migrations
+        #    from the DB to check for incoming or outgoing migrations.
+        #    However, due to 3), our instance is not in this list either.
+        # 5) The periodic task cleans up every lingering PCI claim that is
+        #    not connected to any instance collected above from the instance
+        #    list and from the migration list. As our instance is in neither
+        #    of the lists, the resource tracker cleans up the PCI allocation
+        #    for the already finished evacuation of our instance.
+        #
+        # Unfortunately we cannot reproduce the above situation without
+        # sleeps. We need the evac to start first, then the periodic to
+        # start but not finish, then the evac to finish, then the periodic
+        # to finish. If I trigger and run the whole periodic in a wrapper of
+        # drop_migration_context then I cannot reproduce the situation
+        # described at 4). In general it is not
+        #
+        # evac
+        #  |
+        #  |
+        #  |     periodic
+        #  |        |
+        #  |        |
+        #  |        x
+        #  |
+        #  |
+        #  x
+        #
+        # but
+        #
+        # evac
+        #  |
+        #  |
+        #  |     periodic
+        #  |        |
+        #  |        |
+        #  |        |
+        #  x        |
+        #           |
+        #           x
+        #
+        # that is needed.
+        #
+        # Starting the periodic from the test in a separate thread at
+        # drop_migration_context() might work, but that is extra complexity
+        # in the test code. Also it might still need a sleep to make the
+        # reproduction stable, but only one sleep instead of two.
+        orig_drop = objects.Instance.drop_migration_context
+
+        def slow_drop(*args, **kwargs):
+            time.sleep(1)
+            return orig_drop(*args, **kwargs)
+
+        self.useFixture(
+            fixtures.MockPatch(
+                'nova.objects.instance.Instance.drop_migration_context',
+                new=slow_drop))
+
+        orig_get_mig = objects.MigrationList.get_in_progress_and_error
+
+        def slow_get_mig(*args, **kwargs):
+            time.sleep(2)
+            return orig_get_mig(*args, **kwargs)
+
+        self.useFixture(
+            fixtures.MockPatch(
+                'nova.objects.migration.MigrationList.'
+                'get_in_progress_and_error',
+                new=slow_get_mig))
+
+        self.admin_api.post_server_action(server['id'], {'evacuate': {}})
+        # Trigger the _update_available_resource periodic to overlap with
+        # the already started evacuation
+        self._run_periodics()
+
+        self._wait_for_server_parameter(
+            server, {'OS-EXT-SRV-ATTR:host': 'host2', 'status': 'ACTIVE'})
+
+        self._assert_pci_device_allocated(server['id'], self.compute1_id)
+        self._assert_pci_device_allocated(server['id'], self.compute2_id)
diff --git a/nova/tests/functional/regressions/test_bug_1899649.py b/nova/tests/functional/regressions/test_bug_1899649.py
new file mode 100644
index 0000000000..be75ea947f
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1899649.py
@@ -0,0 +1,100 @@
+# Copyright 2020, Red Hat, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional.libvirt import base
+from nova.tests.unit.virt.libvirt import fakelibvirt
+
+
+class TestVolAttachmentsAfterFailureToScheduleOrBuild(base.ServersTestBase):
+    """Regression test for bug #1899649.
+
+    This regression test aims to ensure a volume attachment remains in
+    place after a failure either to schedule a server or to build it
+    directly on a compute after skipping the scheduler.
+
+    A volume attachment must remain after such failures to ensure the
+    volume itself remains marked as reserved.
+
+    To keep this as realistic as possible the tests use the libvirt
+    functional base class to mimic a real-world example with NUMA nodes
+    being requested via flavor extra specs, the underlying compute being
+    unable to meet this request and thus ensuring a failure.
+    """
+
+    microversion = 'latest'
+
+    def setUp(self):
+        super().setUp()
+
+        # Launch a single libvirt-based compute service with a single NUMA
+        # node
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000)
+        self.start_compute(host_info=host_info, hostname='compute1')
+
+        # Use a flavor requesting 2 NUMA nodes that we know will always fail
+        self.flavor_id = self._create_flavor(
+            extra_spec={'hw:numa_nodes': '2'})
+
+        # Craft a common bfv server request for use within each test
+        self.volume_id = nova_fixtures.CinderFixture.IMAGE_BACKED_VOL
+        self.server = {
+            'name': 'test',
+            'flavorRef': self.flavor_id,
+            'imageRef': '',
+            'networks': 'none',
+            'block_device_mapping_v2': [{
+                'source_type': 'volume',
+                'destination_type': 'volume',
+                'boot_index': 0,
+                'uuid': self.volume_id}]
+        }
+
+    def _assert_failure_and_volume_attachments(self, server):
+        # Assert that the server is in an ERROR state
+        self._wait_for_state_change(server, 'ERROR')
+
+        # Assert that the volume is in a reserved state. As this isn't
+        # modelled by the CinderFixture we just assert that a single volume
+        # attachment remains after the failure and that it is referenced by
+        # the server.
+        attachments = self.cinder.volume_to_attachment.get(self.volume_id)
+        self.assertEqual(1, len(attachments))
+        self.assertIn(
+            self.volume_id,
+            self.cinder.volume_ids_for_instance(server['id']))
+
+    def test_failure_to_schedule(self):
+        # Assert that a volume attachment remains after a failure to
+        # schedule
+        server = self.api.post_server({'server': self.server})
+        self._assert_failure_and_volume_attachments(server)
+
+    def test_failure_to_schedule_with_az(self):
+        # Assert that a volume attachment remains after a failure to
+        # schedule with the addition of an availability_zone in the request
+        self.server['availability_zone'] = 'nova'
+        server = self.api.post_server({'server': self.server})
+        self._assert_failure_and_volume_attachments(server)
+
+    def test_failure_to_schedule_with_host(self):
+        # Assert that a volume attachment remains after a failure to
+        # schedule using the optional host parameter introduced in
+        # microversion 2.74
+        self.server['host'] = 'compute1'
+        server = self.admin_api.post_server({'server': self.server})
+        self._assert_failure_and_volume_attachments(server)
+
+    def test_failure_to_build_with_az_and_host(self):
+        # Assert that a volume attachment remains after a failure to build
+        # and reschedule when providing an availability_zone *and* host,
+        # skipping the scheduler. This is bug #1899649.
+        self.server['availability_zone'] = 'nova:compute1'
+        server = self.admin_api.post_server({'server': self.server})
+        self._assert_failure_and_volume_attachments(server)
diff --git a/nova/tests/functional/regressions/test_bug_1902925.py b/nova/tests/functional/regressions/test_bug_1902925.py
new file mode 100644
index 0000000000..fb5f5251e5
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1902925.py
@@ -0,0 +1,44 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from nova.tests.functional import integrated_helpers
+from nova.tests.unit import cast_as_call
+
+
+class ComputeVersion5xPinnedRpcTests(integrated_helpers._IntegratedTestBase):
+
+    compute_driver = 'fake.MediumFakeDriver'
+    ADMIN_API = True
+    api_major_version = 'v2.1'
+    microversion = 'latest'
+
+    def setUp(self):
+        super(ComputeVersion5xPinnedRpcTests, self).setUp()
+        self.useFixture(cast_as_call.CastAsCall(self))
+
+        self.compute1 = self._start_compute(host='host1')
+
+    def _test_rebuild_instance_with_compute_rpc_pin(self, version_cap):
+        self.flags(compute=version_cap, group='upgrade_levels')
+
+        server_req = self._build_server(networks='none')
+        server = self.api.post_server({'server': server_req})
+        server = self._wait_for_state_change(server, 'ACTIVE')
+
+        self.api.post_server_action(server['id'], {'rebuild': {
+            'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6'
+        }})
+
+    def test_rebuild_instance_5_0(self):
+        self._test_rebuild_instance_with_compute_rpc_pin('5.0')
+
+    def test_rebuild_instance_5_12(self):
+        self._test_rebuild_instance_with_compute_rpc_pin('5.12')
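
Note on the refactor: every converted hunk above delegates to integrated_helpers.InstanceHelperMixin._evacuate_server(), whose definition lives outside this diffstat (it is limited to nova/tests/functional/regressions). As a reading aid only, here is a minimal sketch of the shape its call sites imply; the signature, defaults and body below are assumptions reconstructed from this diff, not the actual nova helper:

    def _evacuate_server(
            self, server, extra_post_args=None, expected_host=None,
            expected_state='ACTIVE', expected_migration_status=None):
        """Evacuate a server and wait for the expected end state (sketch)."""
        # Build the evacuate action body, merging extra parameters such as
        # 'host' or 'onSharedStorage' passed in by the individual tests.
        post = {'evacuate': {}}
        if extra_post_args:
            post['evacuate'].update(extra_post_args)
        self.api.post_server_action(server['id'], post)
        # Wait on the migration record first: the compute manager sets the
        # instance status before updating the host, so polling only the
        # server status races (see the comment removed from
        # test_bug_1669054 above).
        if expected_migration_status:
            self._wait_for_migration_status(
                server, [expected_migration_status])
        # Then wait for the server to reach the expected status and, when
        # given, the expected host.
        expected_params = {'status': expected_state}
        if expected_host:
            expected_params['OS-EXT-SRV-ATTR:host'] = expected_host
        return self._wait_for_server_parameter(server, expected_params)

Under that assumed shape, a call such as self._evacuate_server(server, {'host': target_host}, expected_host=original_host, expected_state='ERROR', expected_migration_status='error') reproduces exactly the post-action/wait sequence the removed blocks used to spell out inline.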