Eliminate client race condition in convergence delete

Previously when doing a delete in convergence, we spawned a new thread to start the delete. This was to ensure the request returned without waiting for potentially slow operations like deleting snapshots and stopping existing workers (which could have caused RPC timeouts). The result, however, was that the stack was not guaranteed to be DELETE_IN_PROGRESS by the time the request returned. In the case where a previous delete had failed, a client request to show the stack issued soon after the delete had returned would likely show the stack status as DELETE_FAILED still. Only a careful examination of the updated_at timestamp would reveal that this corresponded to the previous delete and not the one just issued. In the case of a nested stack, this could leave the parent stack effectively undeletable. (Since the updated_at time is not modified on delete in the legacy path, we never checked it when deleting a nested stack.) To prevent this, change the order of operations so that the stack is first put into the DELETE_IN_PROGRESS state before the delete_stack call returns. Only after the state is stored, spawn a thread to complete the operation. Since there is no stack lock in convergence, this gives us the flexibility to cancel other in-progress workers after we've already written to the Stack itself to start a new traversal. The previous patch in the series means that snapshots are now also deleted after the stack is marked as DELETE_IN_PROGRESS. This is consistent with the legacy path. Change-Id: Ib767ce8b39293c2279bf570d8399c49799cbaa70 Story: #1669608 Task: 23174
author: Zane Bitter <zbitter@redhat.com> 2018-07-30 20:48:25 -0400
committer: Zane Bitter <zbitter@redhat.com> 2018-07-30 20:48:28 -0400
commit: e63778efc91869efbd41e2f96b0c554aef1c80e5 (patch)
tree: 002dab60a9705d25f22548b5ce3edaee2ebb831c /heat/engine
parent: 15554a2debda84e3f49d3c0b2e056a1372f3de13 (diff)
download: heat-e63778efc91869efbd41e2f96b0c554aef1c80e5.tar.gz
3 files changed, 22 insertions, 13 deletions
diff --git a/heat/engine/service.py b/heat/engine/service.py
index 0ba2ef183..0101e5e3c 100644
--- a/heat/engine/service.py
+++ b/heat/engine/service.py
@@ -1374,14 +1374,19 @@ class EngineService(service.ServiceBase):
         self.resource_enforcer.enforce_stack(stack, is_registered_policy=True)
 
         if stack.convergence and cfg.CONF.convergence_engine:
-            def convergence_delete():
-                stack.thread_group_mgr = self.thread_group_mgr
+            stack.thread_group_mgr = self.thread_group_mgr
+            template = templatem.Template.create_empty_template(
+                from_template=stack.t)
+
+            # stop existing traversal; mark stack as FAILED
+            if stack.status == stack.IN_PROGRESS:
+                self.worker_service.stop_traversal(stack)
+
+            def stop_workers():
                 self.worker_service.stop_all_workers(stack)
-                template = templatem.Template.create_empty_template(
-                    from_template=stack.t)
-                stack.converge_stack(template=template, action=stack.DELETE)
 
-            self.thread_group_mgr.start(stack.id, convergence_delete)
+            stack.converge_stack(template=template, action=stack.DELETE,
+                                 pre_converge=stop_workers)
             return
 
         lock = stack_lock.StackLock(cnxt, stack.id, self.engine_id)
diff --git a/heat/engine/stack.py b/heat/engine/stack.py
index 6a1e954b9..7dab91a7d 100644
--- a/heat/engine/stack.py
+++ b/heat/engine/stack.py
@@ -1300,7 +1300,8 @@ class Stack(collections.Mapping):
         updater()
 
     @profiler.trace('Stack.converge_stack', hide_args=False)
-    def converge_stack(self, template, action=UPDATE, new_stack=None):
+    def converge_stack(self, template, action=UPDATE, new_stack=None,
+                       pre_converge=None):
         """Update the stack template and trigger convergence for resources."""
         if action not in [self.CREATE, self.ADOPT]:
             # no back-up template for create action
@@ -1354,9 +1355,10 @@ class Stack(collections.Mapping):
 
         # TODO(later): lifecycle_plugin_utils.do_pre_ops
 
-        self.thread_group_mgr.start(self.id, self._converge_create_or_update)
+        self.thread_group_mgr.start(self.id, self._converge_create_or_update,
+                                    pre_converge=pre_converge)
 
-    def _converge_create_or_update(self):
+    def _converge_create_or_update(self, pre_converge=None):
         current_resources = self._update_or_store_resources()
         self._compute_convg_dependencies(self.ext_rsrcs_db, self.dependencies,
                                          current_resources)
@@ -1373,6 +1375,8 @@ class Stack(collections.Mapping):
                          'action': self.action})
             return
 
+        if callable(pre_converge):
+            pre_converge()
         if self.action == self.DELETE:
             try:
                 self.delete_all_snapshots()
diff --git a/heat/engine/worker.py b/heat/engine/worker.py
index 2e38e7acb..274dd436f 100644
--- a/heat/engine/worker.py
+++ b/heat/engine/worker.py
@@ -125,11 +125,11 @@ class WorkerService(object):
                 _stop_traversal(child)
 
     def stop_all_workers(self, stack):
-        # stop the traversal
-        if stack.status == stack.IN_PROGRESS:
-            self.stop_traversal(stack)
+        """Cancel all existing worker threads for the stack.
 
-        # cancel existing workers
+        Threads will stop running at their next yield point, whether or not the
+        resource operations are complete.
+        """
         cancelled = _cancel_workers(stack, self.thread_group_mgr,
                                     self.engine_id, self._rpc_client)
         if not cancelled:
author	Zane Bitter <zbitter@redhat.com>	2018-07-30 20:48:25 -0400
committer	Zane Bitter <zbitter@redhat.com>	2018-07-30 20:48:28 -0400
commit	e63778efc91869efbd41e2f96b0c554aef1c80e5 (patch)
tree	002dab60a9705d25f22548b5ce3edaee2ebb831c /heat/engine
parent	15554a2debda84e3f49d3c0b2e056a1372f3de13 (diff)
download	heat-e63778efc91869efbd41e2f96b0c554aef1c80e5.tar.gz