summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2020-10-06 02:28:17 +0000
committerGerrit Code Review <review@openstack.org>2020-10-06 02:28:17 +0000
commit8f0c3e999f72a78b9bff0d1446c80d5929cbaab8 (patch)
tree6536967263f6ea32463f0e885a6687eece032fee
parent4cf72ea6bfc58d33da894f248184c08c36055884 (diff)
parentefc35b1c5293c7c6c85f8cf9fd9d8cd8de71d1d5 (diff)
downloadnova-8f0c3e999f72a78b9bff0d1446c80d5929cbaab8.tar.gz
Merge "Sanity check instance mapping during scheduling" into stable/train
-rw-r--r--nova/conductor/manager.py79
-rw-r--r--nova/tests/unit/conductor/test_conductor.py58
2 files changed, 120 insertions, 17 deletions
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index dc2b844496..1c644cf299 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -1307,8 +1307,37 @@ class ComputeTaskManager(base.Base):
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
for instance in instances_by_uuid.values():
+
+ inst_mapping = None
+ try:
+ # We don't need the cell0-targeted context here because the
+ # instance mapping is in the API DB.
+ inst_mapping = \
+ objects.InstanceMapping.get_by_instance_uuid(
+ context, instance.uuid)
+ except exception.InstanceMappingNotFound:
+ # The API created the instance mapping record so it should
+ # definitely be here. Log an error but continue to create the
+ # instance in the cell0 database.
+ LOG.error('While burying instance in cell0, no instance '
+ 'mapping was found.', instance=instance)
+
+ # Perform a final sanity check that the instance is not mapped
+ # to some other cell already because of maybe some crazy
+ # clustered message queue weirdness.
+ if inst_mapping and inst_mapping.cell_mapping is not None:
+ LOG.error('When attempting to bury instance in cell0, the '
+ 'instance is already mapped to cell %s. Ignoring '
+ 'bury in cell0 attempt.',
+ inst_mapping.cell_mapping.identity,
+ instance=instance)
+ continue
+
with obj_target_cell(instance, cell0) as cctxt:
instance.create()
+ if inst_mapping:
+ inst_mapping.cell_mapping = cell0
+ inst_mapping.save()
# Record an instance action with a failed event.
self._create_instance_action_for_cell0(
@@ -1328,16 +1357,6 @@ class ComputeTaskManager(base.Base):
self._set_vm_state_and_notify(
cctxt, instance.uuid, 'build_instances', updates,
exc, request_spec)
- try:
- # We don't need the cell0-targeted context here because the
- # instance mapping is in the API DB.
- inst_mapping = \
- objects.InstanceMapping.get_by_instance_uuid(
- context, instance.uuid)
- inst_mapping.cell_mapping = cell0
- inst_mapping.save()
- except exception.InstanceMappingNotFound:
- pass
for build_request in build_requests:
try:
@@ -1506,13 +1525,8 @@ class ComputeTaskManager(base.Base):
instance.tags = instance_tags if instance_tags \
else objects.TagList()
- # Update mapping for instance. Normally this check is guarded by
- # a try/except but if we're here we know that a newer nova-api
- # handled the build process and would have created the mapping
- inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
- context, instance.uuid)
- inst_mapping.cell_mapping = cell
- inst_mapping.save()
+ # Update mapping for instance.
+ self._map_instance_to_cell(context, instance, cell)
if not self._delete_build_request(
context, build_request, instance, cell, instance_bdms,
@@ -1540,6 +1554,37 @@ class ComputeTaskManager(base.Base):
host=host.service_host, node=host.nodename,
limits=host.limits, host_list=host_list)
+ @staticmethod
+ def _map_instance_to_cell(context, instance, cell):
+ """Update the instance mapping to point at the given cell.
+
+ During initial scheduling once a host and cell is selected in which
+ to build the instance this method is used to update the instance
+ mapping to point at that cell.
+
+ :param context: nova auth RequestContext
+ :param instance: Instance object being built
+ :param cell: CellMapping representing the cell in which the instance
+ was created and is being built.
+ :returns: InstanceMapping object that was updated.
+ """
+ inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+ context, instance.uuid)
+ # Perform a final sanity check that the instance is not mapped
+ # to some other cell already because of maybe some crazy
+ # clustered message queue weirdness.
+ if inst_mapping.cell_mapping is not None:
+ LOG.error('During scheduling instance is already mapped to '
+ 'another cell: %s. This should not happen and is an '
+ 'indication of bigger problems. If you see this you '
+ 'should report it to the nova team. Overwriting '
+ 'the mapping to point at cell %s.',
+ inst_mapping.cell_mapping.identity, cell.identity,
+ instance=instance)
+ inst_mapping.cell_mapping = cell
+ inst_mapping.save()
+ return inst_mapping
+
def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
request_specs, block_device_mappings, tags,
cell_mapping_cache):
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 0f6689eba2..525d673f11 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -2281,6 +2281,27 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['request_specs'][0].requested_resources = []
self._do_schedule_and_build_instances_test(self.params)
+ def test_map_instance_to_cell_already_mapped(self):
+ """Tests a scenario where an instance is already mapped to a cell
+ during scheduling.
+ """
+ build_request = self.params['build_requests'][0]
+ instance = build_request.get_new_instance(self.ctxt)
+ # Simulate MQ split brain craziness by updating the instance mapping
+ # to point at cell0.
+ inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+ self.ctxt, instance.uuid)
+ inst_mapping.cell_mapping = self.cell_mappings['cell0']
+ inst_mapping.save()
+ cell1 = self.cell_mappings['cell1']
+ inst_mapping = self.conductor._map_instance_to_cell(
+ self.ctxt, instance, cell1)
+ # Assert that the instance mapping was updated to point at cell1 but
+ # also that an error was logged.
+ self.assertEqual(cell1.uuid, inst_mapping.cell_mapping.uuid)
+ self.assertIn('During scheduling instance is already mapped to '
+ 'another cell', self.stdlog.logger.output)
+
@mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
def test_cleanup_build_artifacts(self, inst_map_get):
"""Simple test to ensure the order of operations in the cleanup method
@@ -2378,6 +2399,17 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
build_requests = objects.BuildRequestList.get_all(self.ctxt)
instances = objects.InstanceList.get_all(self.ctxt)
+ # Verify instance mappings.
+ inst_mappings = objects.InstanceMappingList.get_by_cell_id(
+ self.ctxt, self.cell_mappings['cell0'].id)
+ # bare_br is the only instance that has a mapping from setUp.
+ self.assertEqual(1, len(inst_mappings))
+ # Since we did not setup instance mappings for the other fake build
+ # requests used in this test, we should see a message logged about
+ # there being no instance mappings.
+ self.assertIn('While burying instance in cell0, no instance mapping '
+ 'was found', self.stdlog.logger.output)
+
self.assertEqual(0, len(build_requests))
self.assertEqual(4, len(instances))
inst_states = {inst.uuid: (inst.deleted, inst.vm_state)
@@ -2475,6 +2507,32 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
test.MatchType(context.RequestContext), inst.uuid,
self.params['tags'])
+ @mock.patch('nova.objects.Instance.create')
+ def test_bury_in_cell0_already_mapped(self, mock_inst_create):
+ """Tests a scenario where the instance mapping is already mapped to a
+ cell when we attempt to bury the instance in cell0.
+ """
+ build_request = self.params['build_requests'][0]
+ # Simulate MQ split brain craziness by updating the instance mapping
+ # to point at cell1.
+ inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+ self.ctxt, build_request.instance_uuid)
+ inst_mapping.cell_mapping = self.cell_mappings['cell1']
+ inst_mapping.save()
+ # Now attempt to bury the instance in cell0.
+ with mock.patch.object(inst_mapping, 'save') as mock_inst_map_save:
+ self.conductor._bury_in_cell0(
+ self.ctxt, self.params['request_specs'][0],
+ exc.NoValidHost(reason='idk'), build_requests=[build_request])
+ # We should have exited without creating the instance in cell0 nor
+ # should the instance mapping have been updated to point at cell0.
+ mock_inst_create.assert_not_called()
+ mock_inst_map_save.assert_not_called()
+ # And we should have logged an error.
+ self.assertIn('When attempting to bury instance in cell0, the '
+ 'instance is already mapped to cell',
+ self.stdlog.logger.output)
+
def test_reset(self):
with mock.patch('nova.compute.rpcapi.ComputeAPI') as mock_rpc:
old_rpcapi = self.conductor_manager.compute_rpcapi