diff options
author | Zuul <zuul@review.opendev.org> | 2020-10-06 02:28:17 +0000 |
---|---|---|
committer | Gerrit Code Review <review@openstack.org> | 2020-10-06 02:28:17 +0000 |
commit | 8f0c3e999f72a78b9bff0d1446c80d5929cbaab8 (patch) | |
tree | 6536967263f6ea32463f0e885a6687eece032fee | |
parent | 4cf72ea6bfc58d33da894f248184c08c36055884 (diff) | |
parent | efc35b1c5293c7c6c85f8cf9fd9d8cd8de71d1d5 (diff) | |
download | nova-8f0c3e999f72a78b9bff0d1446c80d5929cbaab8.tar.gz |
Merge "Sanity check instance mapping during scheduling" into stable/train
-rw-r--r-- | nova/conductor/manager.py | 79 | ||||
-rw-r--r-- | nova/tests/unit/conductor/test_conductor.py | 58 |
2 files changed, 120 insertions, 17 deletions
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index dc2b844496..1c644cf299 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -1307,8 +1307,37 @@ class ComputeTaskManager(base.Base): updates = {'vm_state': vm_states.ERROR, 'task_state': None} for instance in instances_by_uuid.values(): + + inst_mapping = None + try: + # We don't need the cell0-targeted context here because the + # instance mapping is in the API DB. + inst_mapping = \ + objects.InstanceMapping.get_by_instance_uuid( + context, instance.uuid) + except exception.InstanceMappingNotFound: + # The API created the instance mapping record so it should + # definitely be here. Log an error but continue to create the + # instance in the cell0 database. + LOG.error('While burying instance in cell0, no instance ' + 'mapping was found.', instance=instance) + + # Perform a final sanity check that the instance is not mapped + # to some other cell already because of maybe some crazy + # clustered message queue weirdness. + if inst_mapping and inst_mapping.cell_mapping is not None: + LOG.error('When attempting to bury instance in cell0, the ' + 'instance is already mapped to cell %s. Ignoring ' + 'bury in cell0 attempt.', + inst_mapping.cell_mapping.identity, + instance=instance) + continue + with obj_target_cell(instance, cell0) as cctxt: instance.create() + if inst_mapping: + inst_mapping.cell_mapping = cell0 + inst_mapping.save() # Record an instance action with a failed event. self._create_instance_action_for_cell0( @@ -1328,16 +1357,6 @@ class ComputeTaskManager(base.Base): self._set_vm_state_and_notify( cctxt, instance.uuid, 'build_instances', updates, exc, request_spec) - try: - # We don't need the cell0-targeted context here because the - # instance mapping is in the API DB. - inst_mapping = \ - objects.InstanceMapping.get_by_instance_uuid( - context, instance.uuid) - inst_mapping.cell_mapping = cell0 - inst_mapping.save() - except exception.InstanceMappingNotFound: - pass for build_request in build_requests: try: @@ -1506,13 +1525,8 @@ class ComputeTaskManager(base.Base): instance.tags = instance_tags if instance_tags \ else objects.TagList() - # Update mapping for instance. Normally this check is guarded by - # a try/except but if we're here we know that a newer nova-api - # handled the build process and would have created the mapping - inst_mapping = objects.InstanceMapping.get_by_instance_uuid( - context, instance.uuid) - inst_mapping.cell_mapping = cell - inst_mapping.save() + # Update mapping for instance. + self._map_instance_to_cell(context, instance, cell) if not self._delete_build_request( context, build_request, instance, cell, instance_bdms, @@ -1540,6 +1554,37 @@ class ComputeTaskManager(base.Base): host=host.service_host, node=host.nodename, limits=host.limits, host_list=host_list) + @staticmethod + def _map_instance_to_cell(context, instance, cell): + """Update the instance mapping to point at the given cell. + + During initial scheduling once a host and cell is selected in which + to build the instance this method is used to update the instance + mapping to point at that cell. + + :param context: nova auth RequestContext + :param instance: Instance object being built + :param cell: CellMapping representing the cell in which the instance + was created and is being built. + :returns: InstanceMapping object that was updated. + """ + inst_mapping = objects.InstanceMapping.get_by_instance_uuid( + context, instance.uuid) + # Perform a final sanity check that the instance is not mapped + # to some other cell already because of maybe some crazy + # clustered message queue weirdness. + if inst_mapping.cell_mapping is not None: + LOG.error('During scheduling instance is already mapped to ' + 'another cell: %s. This should not happen and is an ' + 'indication of bigger problems. If you see this you ' + 'should report it to the nova team. Overwriting ' + 'the mapping to point at cell %s.', + inst_mapping.cell_mapping.identity, cell.identity, + instance=instance) + inst_mapping.cell_mapping = cell + inst_mapping.save() + return inst_mapping + def _cleanup_build_artifacts(self, context, exc, instances, build_requests, request_specs, block_device_mappings, tags, cell_mapping_cache): diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 0f6689eba2..525d673f11 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -2281,6 +2281,27 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): self.params['request_specs'][0].requested_resources = [] self._do_schedule_and_build_instances_test(self.params) + def test_map_instance_to_cell_already_mapped(self): + """Tests a scenario where an instance is already mapped to a cell + during scheduling. + """ + build_request = self.params['build_requests'][0] + instance = build_request.get_new_instance(self.ctxt) + # Simulate MQ split brain craziness by updating the instance mapping + # to point at cell0. + inst_mapping = objects.InstanceMapping.get_by_instance_uuid( + self.ctxt, instance.uuid) + inst_mapping.cell_mapping = self.cell_mappings['cell0'] + inst_mapping.save() + cell1 = self.cell_mappings['cell1'] + inst_mapping = self.conductor._map_instance_to_cell( + self.ctxt, instance, cell1) + # Assert that the instance mapping was updated to point at cell1 but + # also that an error was logged. + self.assertEqual(cell1.uuid, inst_mapping.cell_mapping.uuid) + self.assertIn('During scheduling instance is already mapped to ' + 'another cell', self.stdlog.logger.output) + @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid') def test_cleanup_build_artifacts(self, inst_map_get): """Simple test to ensure the order of operations in the cleanup method @@ -2378,6 +2399,17 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): build_requests = objects.BuildRequestList.get_all(self.ctxt) instances = objects.InstanceList.get_all(self.ctxt) + # Verify instance mappings. + inst_mappings = objects.InstanceMappingList.get_by_cell_id( + self.ctxt, self.cell_mappings['cell0'].id) + # bare_br is the only instance that has a mapping from setUp. + self.assertEqual(1, len(inst_mappings)) + # Since we did not setup instance mappings for the other fake build + # requests used in this test, we should see a message logged about + # there being no instance mappings. + self.assertIn('While burying instance in cell0, no instance mapping ' + 'was found', self.stdlog.logger.output) + self.assertEqual(0, len(build_requests)) self.assertEqual(4, len(instances)) inst_states = {inst.uuid: (inst.deleted, inst.vm_state) @@ -2475,6 +2507,32 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): test.MatchType(context.RequestContext), inst.uuid, self.params['tags']) + @mock.patch('nova.objects.Instance.create') + def test_bury_in_cell0_already_mapped(self, mock_inst_create): + """Tests a scenario where the instance mapping is already mapped to a + cell when we attempt to bury the instance in cell0. + """ + build_request = self.params['build_requests'][0] + # Simulate MQ split brain craziness by updating the instance mapping + # to point at cell1. + inst_mapping = objects.InstanceMapping.get_by_instance_uuid( + self.ctxt, build_request.instance_uuid) + inst_mapping.cell_mapping = self.cell_mappings['cell1'] + inst_mapping.save() + # Now attempt to bury the instance in cell0. + with mock.patch.object(inst_mapping, 'save') as mock_inst_map_save: + self.conductor._bury_in_cell0( + self.ctxt, self.params['request_specs'][0], + exc.NoValidHost(reason='idk'), build_requests=[build_request]) + # We should have exited without creating the instance in cell0 nor + # should the instance mapping have been updated to point at cell0. + mock_inst_create.assert_not_called() + mock_inst_map_save.assert_not_called() + # And we should have logged an error. + self.assertIn('When attempting to bury instance in cell0, the ' + 'instance is already mapped to cell', + self.stdlog.logger.output) + def test_reset(self): with mock.patch('nova.compute.rpcapi.ComputeAPI') as mock_rpc: old_rpcapi = self.conductor_manager.compute_rpcapi |