diff options
author | Matt Riedemann <mriedem@us.ibm.com> | 2014-06-27 06:46:10 -0700 |
---|---|---|
committer | Matt Riedemann <mriedem@us.ibm.com> | 2014-06-27 15:02:18 -0700 |
commit | 9a01e62693a28a73120544b27ee2104558e0250e (patch) | |
tree | 8f1278e568deebec924a4af5afb46348c011a1b6 /nova/api/ec2 | |
parent | 2bda6b724c73d6ff9c80ea6b724d214d51daf56f (diff) | |
download | nova-9a01e62693a28a73120544b27ee2104558e0250e.tar.gz |
Enforce task_state is None in ec2 create_image stop instance wait loop
We're hitting races in the gate where the instance.vm_state is STOPPED
but the task_state is POWERING_OFF so when the compute_api.start method
is called, we're in an invalid task state and fail.
The compute manager's stop_instance method is correctly setting the
vm_state to STOPPED and the task_state to None when the instance is
powered off via the virt driver, so we must be hitting this from races
in the ec2 API as noted in the TODO above the method definition.
This change simply checks the task_state in addition to the vm_state
in the wait loop before continuing. The error message is also updated
for context by including the instance uuid, vm_state and task_state,
and removes the timeout value in the message since it was in
milliseconds, not seconds, to begin with.
There is already a unit test that covers this change (which was racing,
hence the bug). There are no changes to that unit test since it's really
an integration test that's running through the compute API and compute
manager code, so the fix tests itself.
Closes-Bug: #1334345
Change-Id: I13f0c743cadda6439ae15607a9ef6e4e4985626d
Diffstat (limited to 'nova/api/ec2')
-rw-r--r-- | nova/api/ec2/cloud.py | 23 |
1 file changed, 17 insertions, 6 deletions
diff --git a/nova/api/ec2/cloud.py b/nova/api/ec2/cloud.py index 1b22d280b0..1394712d6c 100644 --- a/nova/api/ec2/cloud.py +++ b/nova/api/ec2/cloud.py @@ -1670,6 +1670,10 @@ class CloudController(object): # manipulating instances/volumes/snapshots. # As other code doesn't take it into consideration, here we don't # care of it for now. Ostrich algorithm + # TODO(mriedem): Consider auto-locking the instance when stopping it and + # doing the snapshot, then unlock it when that is done. Locking the + # instance in the database would prevent other APIs from changing the state + # of the instance during this operation for non-admin users. def create_image(self, context, instance_id, **kwargs): # NOTE(yamahata): name/description are ignored by register_image(), # do so here @@ -1700,21 +1704,28 @@ class CloudController(object): if vm_state == vm_states.ACTIVE: restart_instance = True - self.compute_api.stop(context, instance) + # NOTE(mriedem): We do a call here so that we're sure the + # stop request is complete before we begin polling the state. + self.compute_api.stop(context, instance, do_cast=False) - # wait instance for really stopped + # wait instance for really stopped (and not transitioning tasks) start_time = time.time() - while vm_state != vm_states.STOPPED: + while (vm_state != vm_states.STOPPED and + instance.task_state is not None): time.sleep(1) - instance = self.compute_api.get(context, instance_uuid, - want_objects=True) + instance.refresh() vm_state = instance.vm_state # NOTE(yamahata): timeout and error. 1 hour for now for safety. # Is it too short/long? # Or is there any better way? timeout = 1 * 60 * 60 if time.time() > start_time + timeout: - err = _("Couldn't stop instance within %d sec") % timeout + err = (_("Couldn't stop instance %(instance)s within " + "1 hour. Current vm_state: %(vm_state)s, " + "current task_state: %(task_state)s") % + {'instance': instance_uuid, + 'vm_state': vm_state, + 'task_state': instance.task_state}) raise exception.InternalError(message=err) glance_uuid = instance.image_ref |