author    Matt Riedemann <mriedem.os@gmail.com>  2019-03-25 14:02:17 -0400
committer Tony Breeds <tony@bakeyournoodle.com>  2019-05-29 00:50:29 +0000
commit    b8435d6001c30e67ac277e414b11708d4d660555 (patch)
tree      901a14cdfab23568fcc28e138467a1ee5a72941e
parent    076c576eff7f6ef05ad1d3842e251bd7911acf29 (diff)
download  nova-b8435d6001c30e67ac277e414b11708d4d660555.tar.gz
Delete allocations even if _confirm_resize raises
When we are confirming a resize, the guest is on the dest host and the instance host/node values in the database are pointing at the dest host, so the _confirm_resize method on the source is really best effort. If something fails, we should not leak allocations in placement for the source compute node resource provider, since the instance is not actually consuming the source node provider's resources.

This change refactors the error handling around the _confirm_resize call: the big nesting for _error_out_instance_on_exception is moved to confirm_resize, and a try/finally is added around _confirm_resize so we can be sure to try to clean up the allocations even if _confirm_resize fails in some obscure way. If _confirm_resize does fail, the error is re-raised along with a logged traceback and a hint about how to correct the instance state in the DB by hard rebooting the server on the dest host.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
(cherry picked from commit 03a6d26691c1f182224d59190b79f48df278099e)
(cherry picked from commit 5f515060f6c79f113f7b8107596e41056445c79f)
(cherry picked from commit 37ac54a42ec91821d62864d63c486c002608491b)
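In isolation, the control flow the patch introduces looks like the sketch below. This is a minimal approximation, not Nova code: the real _confirm_resize and _delete_allocation_after_move are replaced by stand-in callables, and a bare raise stands in for oslo's save_and_reraise_exception. The point is the shape: a failure is logged with an operator hint and re-raised, while the finally block guarantees the placement cleanup runs either way.

    import logging

    LOG = logging.getLogger(__name__)


    def confirm_resize(cleanup_source_host, delete_allocations):
        # cleanup_source_host stands in for _confirm_resize and
        # delete_allocations for _delete_allocation_after_move; both
        # names are hypothetical stand-ins for this sketch.
        try:
            cleanup_source_host()
        except Exception:
            # Log a traceback plus a hint, then let the original
            # error propagate to the caller.
            LOG.exception('Confirm resize failed on the source host. '
                          'Allocations will be removed regardless; try '
                          'hard rebooting the instance to correct its '
                          'state.')
            raise
        finally:
            # Runs whether or not cleanup_source_host() raised, so the
            # source node allocations in placement are never leaked.
            delete_allocations()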
-rw-r--r--  nova/compute/manager.py                      | 124
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py |  22
2 files changed, 89 insertions(+), 57 deletions(-)
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 135995c9c3..26562f94d7 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3746,7 +3746,31 @@ class ComputeManager(manager.Manager):
instance=instance)
return
- self._confirm_resize(context, instance, migration=migration)
+ with self._error_out_instance_on_exception(context, instance):
+ old_instance_type = instance.old_flavor
+ try:
+ self._confirm_resize(
+ context, instance, migration=migration)
+ except Exception:
+ # Something failed when cleaning up the source host so
+ # log a traceback and leave a hint about hard rebooting
+ # the server to correct its state in the DB.
+ with excutils.save_and_reraise_exception(logger=LOG):
+ LOG.exception(
+ 'Confirm resize failed on source host %s. '
+ 'Resource allocations in the placement service '
+ 'will be removed regardless because the instance '
+ 'is now on the destination host %s. You can try '
+ 'hard rebooting the instance to correct its '
+ 'state.', self.host, migration.dest_compute,
+ instance=instance)
+ finally:
+ # Whether an error occurred or not, at this point the
+ # instance is on the dest host so to avoid leaking
+ # allocations in placement, delete them here.
+ self._delete_allocation_after_move(
+ context, instance, migration, old_instance_type,
+ migration.source_node)
do_confirm_resize(context, instance, migration.id)
@@ -3758,63 +3782,59 @@ class ComputeManager(manager.Manager):
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
phase=fields.NotificationPhase.START)
- with self._error_out_instance_on_exception(context, instance):
- # NOTE(danms): delete stashed migration information
- old_instance_type = instance.old_flavor
- instance.old_flavor = None
- instance.new_flavor = None
- instance.system_metadata.pop('old_vm_state', None)
- instance.save()
-
- # NOTE(tr3buchet): tear down networks on source host
- self.network_api.setup_networks_on_host(context, instance,
- migration.source_compute, teardown=True)
+ # NOTE(danms): delete stashed migration information
+ old_instance_type = instance.old_flavor
+ instance.old_flavor = None
+ instance.new_flavor = None
+ instance.system_metadata.pop('old_vm_state', None)
+ instance.save()
- network_info = self.network_api.get_instance_nw_info(context,
- instance)
- # TODO(mriedem): Get BDMs here and pass them to the driver.
- self.driver.confirm_migration(context, migration, instance,
- network_info)
+ # NOTE(tr3buchet): tear down networks on source host
+ self.network_api.setup_networks_on_host(context, instance,
+ migration.source_compute, teardown=True)
- migration.status = 'confirmed'
- with migration.obj_as_admin():
- migration.save()
+ network_info = self.network_api.get_instance_nw_info(context,
+ instance)
+ # TODO(mriedem): Get BDMs here and pass them to the driver.
+ self.driver.confirm_migration(context, migration, instance,
+ network_info)
- rt = self._get_resource_tracker()
- rt.drop_move_claim(context, instance, migration.source_node,
- old_instance_type, prefix='old_')
- self._delete_allocation_after_move(context, instance, migration,
- old_instance_type,
- migration.source_node)
- instance.drop_migration_context()
+ migration.status = 'confirmed'
+ with migration.obj_as_admin():
+ migration.save()
- # NOTE(mriedem): The old_vm_state could be STOPPED but the user
- # might have manually powered up the instance to confirm the
- # resize/migrate, so we need to check the current power state
- # on the instance and set the vm_state appropriately. We default
- # to ACTIVE because if the power state is not SHUTDOWN, we
- # assume _sync_instance_power_state will clean it up.
- p_state = instance.power_state
- vm_state = None
- if p_state == power_state.SHUTDOWN:
- vm_state = vm_states.STOPPED
- LOG.debug("Resized/migrated instance is powered off. "
- "Setting vm_state to '%s'.", vm_state,
- instance=instance)
- else:
- vm_state = vm_states.ACTIVE
+ rt = self._get_resource_tracker()
+ rt.drop_move_claim(context, instance, migration.source_node,
+ old_instance_type, prefix='old_')
+ instance.drop_migration_context()
+
+ # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+ # might have manually powered up the instance to confirm the
+ # resize/migrate, so we need to check the current power state
+ # on the instance and set the vm_state appropriately. We default
+ # to ACTIVE because if the power state is not SHUTDOWN, we
+ # assume _sync_instance_power_state will clean it up.
+ p_state = instance.power_state
+ vm_state = None
+ if p_state == power_state.SHUTDOWN:
+ vm_state = vm_states.STOPPED
+ LOG.debug("Resized/migrated instance is powered off. "
+ "Setting vm_state to '%s'.", vm_state,
+ instance=instance)
+ else:
+ vm_state = vm_states.ACTIVE
- instance.vm_state = vm_state
- instance.task_state = None
- instance.save(expected_task_state=[None, task_states.DELETING,
- task_states.SOFT_DELETING])
+ instance.vm_state = vm_state
+ instance.task_state = None
+ instance.save(expected_task_state=[None, task_states.DELETING,
+ task_states.SOFT_DELETING])
- self._notify_about_instance_usage(
- context, instance, "resize.confirm.end",
- network_info=network_info)
- compute_utils.notify_about_instance_action(context, instance,
- self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
- phase=fields.NotificationPhase.END)
+ self._notify_about_instance_usage(
+ context, instance, "resize.confirm.end",
+ network_info=network_info)
+ compute_utils.notify_about_instance_action(context, instance,
+ self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+ phase=fields.NotificationPhase.END)
def _delete_allocation_after_move(self, context, instance, migration,
flavor, nodename):
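Note that the hunk above re-raises through oslo's excutils.save_and_reraise_exception rather than a bare raise: the context manager captures the in-flight exception so that extra logging in the except block cannot swallow it. A small self-contained example of that behavior, assuming oslo.utils is installed:

    from oslo_utils import excutils


    def cleanup_source_host():
        # Hypothetical stand-in for the work _confirm_resize does.
        raise RuntimeError('source host cleanup failed')


    try:
        try:
            cleanup_source_host()
        except Exception:
            with excutils.save_and_reraise_exception():
                # Anything done here (logging, bookkeeping) happens
                # first; the captured RuntimeError is re-raised when
                # the context manager exits.
                print('logging a hint before the original error propagates')
    except RuntimeError as exc:
        print('re-raised:', exc)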
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 05e5b3391e..445713266d 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -6264,6 +6264,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
dest_node=None,
dest_host=None,
source_compute='source_compute',
+ source_node='source_node',
status='migrating')
self.migration.save = mock.MagicMock()
self.useFixture(fixtures.SpawnIsSynchronousFixture())
@@ -6617,6 +6618,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
do_finish_revert_resize()
def test_confirm_resize_deletes_allocations(self):
+ @mock.patch('nova.objects.Instance.get_by_uuid')
+ @mock.patch('nova.objects.Migration.get_by_id')
@mock.patch.object(self.migration, 'save')
@mock.patch.object(self.compute, '_notify_about_instance_usage')
@mock.patch.object(self.compute, 'network_api')
@@ -6627,12 +6630,15 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
@mock.patch.object(self.instance, 'save')
def do_confirm_resize(mock_save, mock_drop, mock_delete, mock_get_rt,
mock_confirm, mock_nwapi, mock_notify,
- mock_mig_save):
+ mock_mig_save, mock_mig_get, mock_inst_get):
self.instance.migration_context = objects.MigrationContext()
self.migration.source_compute = self.instance['host']
self.migration.source_node = self.instance['node']
- self.compute._confirm_resize(self.context, self.instance,
- self.migration)
+ self.migration.status = 'confirming'
+ mock_mig_get.return_value = self.migration
+ mock_inst_get.return_value = self.instance
+ self.compute.confirm_resize(self.context, self.instance,
+ self.migration)
mock_delete.assert_called_once_with(self.context, self.instance,
self.migration,
self.instance.old_flavor,
@@ -6688,9 +6694,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
with test.nested(
mock.patch.object(self.compute, 'network_api'),
mock.patch.object(self.compute.driver, 'confirm_migration',
- side_effect=error)
+ side_effect=error),
+ mock.patch.object(self.compute, '_delete_allocation_after_move')
) as (
- network_api, confirm_migration
+ network_api, confirm_migration, delete_allocation
):
self.assertRaises(exception.HypervisorUnavailable,
self.compute.confirm_resize,
@@ -6704,6 +6711,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.assertEqual(2, instance_save.call_count)
# The migration.status should have been saved.
self.migration.save.assert_called_once_with()
+ # Allocations should always be cleaned up even if cleaning up the
+ # source host fails.
+ delete_allocation.assert_called_once_with(
+ self.context, self.instance, self.migration,
+ self.instance.old_flavor, self.migration.source_node)
# Assert other mocks we care less about.
notify_usage.assert_called_once()
notify_action.assert_called_once()
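The updated test asserts the key invariant of the change: allocation cleanup runs even when the source-host cleanup raises. A condensed, self-contained approximation using plain unittest.mock (the ComputeManager plumbing is stubbed out, so the class and method bodies here are illustrative only):

    import unittest
    from unittest import mock


    class Compute:
        # Simplified stand-ins for the ComputeManager methods in the diff.
        def _confirm_resize(self, ctxt):
            raise RuntimeError('hypervisor unavailable')

        def _delete_allocation_after_move(self, ctxt):
            pass

        def confirm_resize(self, ctxt):
            try:
                self._confirm_resize(ctxt)
            finally:
                self._delete_allocation_after_move(ctxt)


    class ConfirmResizeTest(unittest.TestCase):
        def test_allocations_cleaned_up_on_failure(self):
            compute = Compute()
            with mock.patch.object(
                    compute,
                    '_delete_allocation_after_move') as delete_allocation:
                self.assertRaises(RuntimeError,
                                  compute.confirm_resize, 'ctxt')
                # Cleanup must run even though _confirm_resize raised.
                delete_allocation.assert_called_once_with('ctxt')


    if __name__ == '__main__':
        unittest.main()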