author    Vincent Hou <sbhou@cn.ibm.com>  2015-03-03 16:04:41 +0800
committer Vincent Hou <sbhou@cn.ibm.com>  2015-04-08 15:33:40 +0800
commit    5221f181f305336a7a14851e89764af2a3a5bba1 (patch)
tree      2a62910cbe033cdc258080f09254515744bb572b
parent    1c06834e8c0c7f4a55103cdcc08744e52f0d234d (diff)
download  cinder-5221f181f305336a7a14851e89764af2a3a5bba1.tar.gz
Delete the temporary volume if migration fails
Issues resolved in this patch include the following changes:

* A temporary volume is created on the destination host before migrating the
  data from the source. However, if the creation of this volume fails, its
  record is left in the database as redundant information. This patch removes
  the database record if the creation fails.
* If attaching the remote dest volume fails at initialize_connection due to a
  timeout, we need to terminate the connection. Otherwise, the dest volume
  cannot be released and successfully deleted.

Closes-bug: #1398177
Change-Id: I5bc75bd7652841fddd66481ee001fb682168215c
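As a rough illustration of the cleanup decision this patch introduces, the
sketch below shows the intent only; db_destroy, db_update and rpc_delete are
hypothetical stand-ins for the manager's self.db.volume_destroy,
self.db.volume_update and volume_rpcapi.VolumeAPI().delete_volume. The
authoritative version is _clean_temporary_volume in cinder/volume/manager.py
in the diff below.

    # Sketch only: mirrors the intent of _clean_temporary_volume, not the
    # actual Cinder code. db_destroy, db_update and rpc_delete are
    # hypothetical stand-ins for the manager's DB/RPC helpers.
    def clean_temporary_volume(source_volume, new_volume_id, clean_db_only,
                               db_destroy, db_update, rpc_delete):
        if source_volume['migration_status'] == 'migrating':
            # The source volume is still intact, so the temporary
            # destination volume is safe to remove.
            if clean_db_only:
                # Creation failed before a backend volume existed;
                # only the database record has to go.
                db_destroy(new_volume_id)
            else:
                # A backend volume was created; delete it via the RPC API.
                rpc_delete(new_volume_id)
        else:
            # Completing phase: the source may already be gone, so keep
            # the destination and only clear its migration status.
            db_update(new_volume_id, {'migration_status': None})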
-rw-r--r--  cinder/tests/test_volume.py  102
-rw-r--r--  cinder/volume/driver.py       18
-rw-r--r--  cinder/volume/manager.py     119
3 files changed, 199 insertions(+), 40 deletions(-)
diff --git a/cinder/tests/test_volume.py b/cinder/tests/test_volume.py
index 8bf72e358..671bce85f 100644
--- a/cinder/tests/test_volume.py
+++ b/cinder/tests/test_volume.py
@@ -3828,6 +3828,108 @@ class VolumeTestCase(BaseVolumeTestCase):
self.assertIsNone(volume['migration_status'])
self.assertEqual('available', volume['status'])
+ def test_clean_temporary_volume(self):
+ def fake_delete_volume(ctxt, volume):
+ db.volume_destroy(ctxt, volume['id'])
+
+ fake_volume = tests_utils.create_volume(self.context, size=1,
+ host=CONF.host)
+ fake_new_volume = tests_utils.create_volume(self.context, size=1,
+ host=CONF.host)
+ # Check when the migrated volume is in migration
+ db.volume_update(self.context, fake_volume['id'],
+ {'migration_status': 'migrating'})
+ # 1. Only clean the db
+ self.volume._clean_temporary_volume(self.context, fake_volume['id'],
+ fake_new_volume['id'],
+ clean_db_only=True)
+ self.assertRaises(exception.VolumeNotFound,
+ db.volume_get, self.context,
+ fake_new_volume['id'])
+
+ # 2. Delete the backend storage
+ fake_new_volume = tests_utils.create_volume(self.context, size=1,
+ host=CONF.host)
+ with mock.patch.object(volume_rpcapi.VolumeAPI, 'delete_volume') as \
+ mock_delete_volume:
+ mock_delete_volume.side_effect = fake_delete_volume
+ self.volume._clean_temporary_volume(self.context,
+ fake_volume['id'],
+ fake_new_volume['id'],
+ clean_db_only=False)
+ self.assertRaises(exception.VolumeNotFound,
+ db.volume_get, self.context,
+ fake_new_volume['id'])
+
+ # Check when the migrated volume is not in migration
+ fake_new_volume = tests_utils.create_volume(self.context, size=1,
+ host=CONF.host)
+ db.volume_update(self.context, fake_volume['id'],
+ {'migration_status': 'non-migrating'})
+ self.volume._clean_temporary_volume(self.context, fake_volume['id'],
+ fake_new_volume['id'])
+ volume = db.volume_get(context.get_admin_context(),
+ fake_new_volume['id'])
+ self.assertIsNone(volume['migration_status'])
+
+ def test_migrate_volume_generic_create_volume_error(self):
+ def fake_create_volume(ctxt, volume, host, req_spec, filters,
+ allow_reschedule=True):
+ db.volume_update(ctxt, volume['id'],
+ {'status': 'error'})
+
+ with mock.patch.object(self.volume.driver, 'migrate_volume'), \
+ mock.patch.object(volume_rpcapi.VolumeAPI, 'create_volume') as \
+ mock_create_volume, \
+ mock.patch.object(self.volume, '_clean_temporary_volume') as \
+ clean_temporary_volume:
+
+ # Exception case at the creation of the new temporary volume
+ mock_create_volume.side_effect = fake_create_volume
+ volume = tests_utils.create_volume(self.context, size=0,
+ host=CONF.host)
+ host_obj = {'host': 'newhost', 'capabilities': {}}
+ self.assertRaises(exception.VolumeMigrationFailed,
+ self.volume.migrate_volume,
+ self.context,
+ volume['id'],
+ host_obj,
+ True)
+ volume = db.volume_get(context.get_admin_context(), volume['id'])
+ self.assertIsNone(volume['migration_status'])
+ self.assertEqual('available', volume['status'])
+ self.assertTrue(clean_temporary_volume.called)
+
+ def test_migrate_volume_generic_timeout_error(self):
+ CONF.set_override("migration_create_volume_timeout_secs", 2)
+
+ def fake_create_volume(ctxt, volume, host, req_spec, filters,
+ allow_reschedule=True):
+ db.volume_update(ctxt, volume['id'],
+ {'status': 'creating'})
+
+ with mock.patch.object(self.volume.driver, 'migrate_volume'), \
+ mock.patch.object(volume_rpcapi.VolumeAPI, 'create_volume') as \
+ mock_create_volume, \
+ mock.patch.object(self.volume, '_clean_temporary_volume') as \
+ clean_temporary_volume:
+
+ # Exception case at the timeout of the volume creation
+ mock_create_volume.side_effect = fake_create_volume
+ volume = tests_utils.create_volume(self.context, size=0,
+ host=CONF.host)
+ host_obj = {'host': 'newhost', 'capabilities': {}}
+ self.assertRaises(exception.VolumeMigrationFailed,
+ self.volume.migrate_volume,
+ self.context,
+ volume['id'],
+ host_obj,
+ True)
+ volume = db.volume_get(context.get_admin_context(), volume['id'])
+ self.assertIsNone(volume['migration_status'])
+ self.assertEqual('available', volume['status'])
+ self.assertTrue(clean_temporary_volume.called)
+
def test_migrate_volume_generic_create_export_error(self):
def fake_create_volume(ctxt, volume, host, req_spec, filters,
allow_reschedule=True):
diff --git a/cinder/volume/driver.py b/cinder/volume/driver.py
index 5e4a70ce1..b9dfea00f 100644
--- a/cinder/volume/driver.py
+++ b/cinder/volume/driver.py
@@ -640,7 +640,23 @@ class BaseVD(object):
# Call remote manager's initialize_connection which includes
# driver's create_export and initialize_connection
rpcapi = volume_rpcapi.VolumeAPI()
- conn = rpcapi.initialize_connection(context, volume, properties)
+ try:
+ conn = rpcapi.initialize_connection(context, volume,
+ properties)
+ except Exception:
+ with excutils.save_and_reraise_exception():
+ # It is possible that initialize_connection fails due to
+ # a timeout. In fact, the volume may already be attached
+ # when the timeout error is raised, so it is worth trying
+ # to terminate the connection.
+ try:
+ rpcapi.terminate_connection(context, volume,
+ properties, force=True)
+ except Exception:
+ LOG.warning(_LW("Failed terminating the connection "
+ "of volume %(volume_id)s, but it is "
+ "acceptable."),
+ {'volume_id': volume['id']})
else:
# Call local driver's create_export and initialize_connection.
# NOTE(avishay) This is copied from the manager's code - need to
diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py
index 2bb52257b..3adfa8c83 100644
--- a/cinder/volume/manager.py
+++ b/cinder/volume/manager.py
@@ -528,6 +528,10 @@ class VolumeManager(manager.SchedulerDependentManager):
raise exception.InvalidVolume(
reason=_("volume is not local to this node"))
+ is_migrating = volume_ref['migration_status'] is not None
+ is_migrating_dest = (is_migrating and
+ volume_ref['migration_status'].startswith(
+ 'target:'))
self._notify_about_volume_usage(context, volume_ref, "delete.start")
try:
# NOTE(flaper87): Verify the driver is enabled
@@ -545,19 +549,17 @@ class VolumeManager(manager.SchedulerDependentManager):
except exception.VolumeIsBusy:
LOG.error(_LE("Cannot delete volume %s: volume is busy"),
volume_ref['id'])
- self.db.volume_update(context, volume_ref['id'],
- {'status': 'available'})
+ # If this is a destination volume, we have to clear the database
+ # record to avoid user confusion.
+ self._clear_db(context, is_migrating_dest, volume_ref,
+ 'available')
return True
except Exception:
with excutils.save_and_reraise_exception():
- self.db.volume_update(context,
- volume_ref['id'],
- {'status': 'error_deleting'})
-
- is_migrating = volume_ref['migration_status'] is not None
- is_migrating_dest = (is_migrating and
- volume_ref['migration_status'].startswith(
- 'target:'))
+ # If this is a destination volume, we have to clear the
+ # database record to avoid user confusion.
+ self._clear_db(context, is_migrating_dest, volume_ref,
+ 'error_deleting')
# If deleting source/destination volume in a migration, we should
# skip quotas.
@@ -613,6 +615,21 @@ class VolumeManager(manager.SchedulerDependentManager):
return True
+ def _clear_db(self, context, is_migrating_dest, volume_ref, status):
+ # This method is called when driver.unmanage() or
+ # driver.delete_volume() fails in delete_volume(), so it is already
+ # in the exception handling part.
+ if is_migrating_dest:
+ self.db.volume_destroy(context, volume_ref['id'])
+ LOG.error(_LE("Unable to delete the destination volume %s "
+ "during volume migration, but the database "
+ "record needs to be deleted."),
+ volume_ref['id'])
+ else:
+ self.db.volume_update(context,
+ volume_ref['id'],
+ {'status': status})
+
def create_snapshot(self, context, volume_id, snapshot):
"""Creates and exports the snapshot."""
context = context.elevated()
@@ -1224,13 +1241,19 @@ class VolumeManager(manager.SchedulerDependentManager):
new_volume = self.db.volume_get(ctxt, new_volume['id'])
tries = 0
while new_volume['status'] != 'available':
- tries = tries + 1
+ tries += 1
now = time.time()
if new_volume['status'] == 'error':
msg = _("failed to create new_volume on destination host")
+ self._clean_temporary_volume(ctxt, volume['id'],
+ new_volume['id'],
+ clean_db_only=True)
raise exception.VolumeMigrationFailed(reason=msg)
elif now > deadline:
msg = _("timeout creating new_volume on destination host")
+ self._clean_temporary_volume(ctxt, volume['id'],
+ new_volume['id'],
+ clean_db_only=True)
raise exception.VolumeMigrationFailed(reason=msg)
else:
time.sleep(tries ** 2)
@@ -1257,34 +1280,11 @@ class VolumeManager(manager.SchedulerDependentManager):
new_volume['id'])
except Exception:
with excutils.save_and_reraise_exception():
- msg = _("Failed to copy volume %(vol1)s to %(vol2)s")
- LOG.error(msg % {'vol1': volume['id'],
- 'vol2': new_volume['id']})
- volume = self.db.volume_get(ctxt, volume['id'])
-
- # If we're in the migrating phase, we need to cleanup
- # destination volume because source volume is remaining
- if volume['migration_status'] == 'migrating':
- rpcapi.delete_volume(ctxt, new_volume)
- else:
- # If we're in the completing phase don't delete the
- # destination because we may have already deleted the
- # source! But the migration_status in database should
- # be cleared to handle volume after migration failure
- try:
- updates = {'migration_status': None}
- self.db.volume_update(ctxt, new_volume['id'], updates)
- except exception.VolumeNotFound:
- LOG.info(_LI("Couldn't find destination volume "
- "%(vol)s in database. The entry might be "
- "successfully deleted during migration "
- "completion phase."),
- {'vol': new_volume['id']})
-
- LOG.warn(_LW("Failed to migrate volume. The destination "
- "volume %(vol)s is not deleted since the "
- "source volume may have already deleted."),
- {'vol': new_volume['id']})
+ msg = _LE("Failed to copy volume %(vol1)s to %(vol2)s")
+ LOG.error(msg, {'vol1': volume['id'],
+ 'vol2': new_volume['id']})
+ self._clean_temporary_volume(ctxt, volume['id'],
+ new_volume['id'])
def _get_original_status(self, volume):
attachments = volume['volume_attachment']
@@ -1293,6 +1293,47 @@ class VolumeManager(manager.SchedulerDependentManager):
else:
return 'in-use'
+ def _clean_temporary_volume(self, ctxt, volume_id, new_volume_id,
+ clean_db_only=False):
+ volume = self.db.volume_get(ctxt, volume_id)
+ # If we're in the migrating phase, we need to clean up
+ # the destination volume because the source volume remains
+ if volume['migration_status'] == 'migrating':
+ try:
+ if clean_db_only:
+ # The temporary volume is not created, only DB data
+ # is created
+ self.db.volume_destroy(ctxt, new_volume_id)
+ else:
+ # The temporary volume is already created
+ rpcapi = volume_rpcapi.VolumeAPI()
+ volume = self.db.volume_get(ctxt, new_volume_id)
+ rpcapi.delete_volume(ctxt, volume)
+ except exception.VolumeNotFound:
+ LOG.info(_LI("Couldn't find the temporary volume "
+ "%(vol)s in the database. There is no need "
+ "to clean up this volume."),
+ {'vol': new_volume_id})
+ else:
+ # If we're in the completing phase, don't delete the
+ # destination because we may have already deleted the
+ # source! But the migration_status in the database
+ # should be cleared to handle the volume after the failure.
+ try:
+ updates = {'migration_status': None}
+ self.db.volume_update(ctxt, new_volume_id, updates)
+ except exception.VolumeNotFound:
+ LOG.info(_LI("Couldn't find destination volume "
+ "%(vol)s in the database. The entry might be "
+ "successfully deleted during migration "
+ "completion phase."),
+ {'vol': new_volume_id})
+
+ LOG.warning(_LW("Failed to migrate volume. The destination "
+ "volume %(vol)s is not deleted since the "
+ "source volume may have been deleted."),
+ {'vol': new_volume_id})
+
def migrate_volume_completion(self, ctxt, volume_id, new_volume_id,
error=False):
try: