author     Zuul <zuul@review.openstack.org>            2018-06-12 11:21:55 +0000
committer  Gerrit Code Review <review@openstack.org>   2018-06-12 11:21:55 +0000
commit     710be2a04cd6ef0e8ddad0d957873133e1f60ebc (patch)
tree       b97c2013649b59aec8398d14a0e084e86b62d42f /nova
parent     76d83fd620ea815e643f075af6a3aab683603c10 (diff)
parent     43a84dbc1ebf147d43451610b76c700a31e08f4b (diff)
download   nova-710be2a04cd6ef0e8ddad0d957873133e1f60ebc.tar.gz
Merge "Change consecutive build failure limit to a weigher" into stable/queens
Diffstat (limited to 'nova')
-rw-r--r--  nova/compute/manager.py                                    |  35
-rw-r--r--  nova/compute/resource_tracker.py                           |   6
-rw-r--r--  nova/compute/stats.py                                      |   8
-rw-r--r--  nova/conf/compute.py                                       |  20
-rw-r--r--  nova/conf/scheduler.py                                     |  28
-rw-r--r--  nova/scheduler/host_manager.py                             |   3
-rw-r--r--  nova/scheduler/weights/compute.py                          |  33
-rw-r--r--  nova/test.py                                               |   5
-rw-r--r--  nova/tests/functional/test_servers.py                      |  38
-rw-r--r--  nova/tests/unit/compute/test_compute_mgr.py                |  70
-rw-r--r--  nova/tests/unit/compute/test_resource_tracker.py           |   2
-rw-r--r--  nova/tests/unit/compute/test_stats.py                      |  16
-rw-r--r--  nova/tests/unit/scheduler/test_caching_scheduler.py        |   1
-rw-r--r--  nova/tests/unit/scheduler/weights/test_weights_compute.py  |  57
14 files changed, 249 insertions, 73 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 5ff1b5ffa2..653bcc1082 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -528,7 +528,6 @@ class ComputeManager(manager.Manager):
                 CONF.max_concurrent_live_migrations)
         else:
             self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
-        self._failed_builds = 0

         super(ComputeManager, self).__init__(service_name="compute",
                                              *args, **kwargs)
@@ -1692,29 +1691,15 @@ class ComputeManager(manager.Manager):
         return block_device_info

     def _build_failed(self):
-        self._failed_builds += 1
-        limit = CONF.compute.consecutive_build_service_disable_threshold
-        if limit and self._failed_builds >= limit:
-            # NOTE(danms): If we're doing a bunch of parallel builds,
-            # it is possible (although not likely) that we have already
-            # failed N-1 builds before this and we race with a successful
-            # build and disable ourselves here when we might've otherwise
-            # not.
-            LOG.error('Disabling service due to %(fails)i '
-                      'consecutive build failures',
-                      {'fails': self._failed_builds})
-            ctx = nova.context.get_admin_context()
-            service = objects.Service.get_by_compute_host(ctx, CONF.host)
-            service.disabled = True
-            service.disabled_reason = (
-                'Auto-disabled due to %i build failures' % self._failed_builds)
-            service.save()
-            # NOTE(danms): Reset our counter now so that when the admin
-            # re-enables us we can start fresh
-            self._failed_builds = 0
-        elif self._failed_builds > 1:
-            LOG.warning('%(fails)i consecutive build failures',
-                        {'fails': self._failed_builds})
+        if CONF.compute.consecutive_build_service_disable_threshold:
+            rt = self._get_resource_tracker()
+            # NOTE(danms): Update our counter, but wait for the next
+            # update_available_resource() periodic to flush it to the DB
+            rt.stats.build_failed()
+
+    def _build_succeeded(self):
+        rt = self._get_resource_tracker()
+        rt.stats.build_succeeded()

     @wrap_exception()
     @reverts_task_state
@@ -1766,7 +1751,7 @@ class ComputeManager(manager.Manager):
                               build_results.RESCHEDULED):
                     self._build_failed()
                 else:
-                    self._failed_builds = 0
+                    self._build_succeeded()

         # NOTE(danms): We spawn here to return the RPC worker thread back to
         # the pool. Since what follows could take a really long time, we don't
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 4f32a9eff2..c8c4f85dd3 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -604,7 +604,13 @@ class ResourceTracker(object):
     def _copy_resources(self, compute_node, resources):
         """Copy resource values to supplied compute_node."""
         # purge old stats and init with anything passed in by the driver
+        # NOTE(danms): Preserve 'failed_builds' across the stats clearing,
+        # as that is not part of resources
+        # TODO(danms): Stop doing this when we get a column to store this
+        # directly
+        prev_failed_builds = self.stats.get('failed_builds', 0)
         self.stats.clear()
+        self.stats['failed_builds'] = prev_failed_builds
         self.stats.digest_stats(resources.get('stats'))
         compute_node.stats = copy.deepcopy(self.stats)

diff --git a/nova/compute/stats.py b/nova/compute/stats.py
index 9509c908ed..cfbee2e6bc 100644
--- a/nova/compute/stats.py
+++ b/nova/compute/stats.py
@@ -138,3 +138,11 @@ class Stats(dict):
                           os_type=os_type,
                           project_id=project_id)
         return (vm_state, task_state, os_type, project_id)
+
+    def build_failed(self):
+        self['failed_builds'] = self.get('failed_builds', 0) + 1
+
+    def build_succeeded(self):
+        # FIXME(danms): Make this more graceful, either by time-based aging or
+        # a fixed decline upon success
+        self['failed_builds'] = 0
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index ecf39b721c..38921f54da 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -628,20 +628,20 @@ compute_group_opts = [
     cfg.IntOpt('consecutive_build_service_disable_threshold',
         default=10,
         help="""
-Number of consecutive failed builds that result in disabling a compute service.
+Enables reporting of build failures to the scheduler.

-This option will cause nova-compute to set itself to a disabled state
-if a certain number of consecutive build failures occur. This will
-prevent the scheduler from continuing to send builds to a compute node that is
-consistently failing. Note that all failures qualify and count towards this
-score, including reschedules that may have been due to racy scheduler behavior.
-Since the failures must be consecutive, it is unlikely that occasional expected
-reschedules will actually disable a compute node.
+Any nonzero value will enable sending build failure statistics to the
+scheduler for use by the BuildFailureWeigher.

 Possible values:

-* Any positive integer representing a build failure count.
-* Zero to never auto-disable.
+* Any positive integer enables reporting build failures.
+* Zero to disable reporting build failures.
+
+Related options:
+
+* [filter_scheduler]/build_failure_weight_multiplier
+
 """),
     cfg.IntOpt('resource_provider_association_refresh',
         default=300,
diff --git a/nova/conf/scheduler.py b/nova/conf/scheduler.py
index 84eb790566..3bbfdcd867 100644
--- a/nova/conf/scheduler.py
+++ b/nova/conf/scheduler.py
@@ -503,6 +503,34 @@ Possible values:
 meaningful, as negative values would make this behave as a soft affinity
 weigher.
 """),
+    cfg.FloatOpt(
+        "build_failure_weight_multiplier",
+        default=1000000.0,
+        help="""
+Multiplier used for weighing hosts that have had recent build failures.
+
+This option determines how much weight is placed on a compute node with
+recent build failures. Build failures may indicate a failing, misconfigured,
+or otherwise ailing compute node, and avoiding it during scheduling may be
+beneficial. The weight is inversely proportional to the number of recent
+build failures the compute node has experienced. This value should be
+set to some high value to offset weight given by other enabled weighers
+due to available resources. To disable weighing compute hosts by the
+number of recent failures, set this to zero.
+
+This option is only used by the FilterScheduler and its subclasses; if you use
+a different scheduler, this option has no effect.
+
+Possible values:
+
+* An integer or float value, where the value corresponds to the multiplier
+  ratio for this weigher.
+
+Related options:
+
+* [compute]/consecutive_build_service_disable_threshold - Must be nonzero
+  for a compute to report data considered by this weigher.
+"""),
     cfg.BoolOpt(
         "shuffle_best_same_weighed_hosts",
         default=False,
diff --git a/nova/scheduler/host_manager.py b/nova/scheduler/host_manager.py
index f1cd505d6d..5a419a18b2 100644
--- a/nova/scheduler/host_manager.py
+++ b/nova/scheduler/host_manager.py
@@ -257,6 +257,9 @@ class HostState(object):
         self.ram_allocation_ratio = compute.ram_allocation_ratio
         self.disk_allocation_ratio = compute.disk_allocation_ratio

+        # update failed_builds counter reported by the compute
+        self.failed_builds = int(self.stats.get('failed_builds', 0))
+
     def consume_from_request(self, spec_obj):
         """Incrementally update host state from a RequestSpec object."""

diff --git a/nova/scheduler/weights/compute.py b/nova/scheduler/weights/compute.py
new file mode 100644
index 0000000000..39e7813e05
--- /dev/null
+++ b/nova/scheduler/weights/compute.py
@@ -0,0 +1,33 @@
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+"""
+BuildFailure Weigher. Weigh hosts by the number of recent failed boot attempts.
+
+"""
+
+import nova.conf
+from nova.scheduler import weights
+
+CONF = nova.conf.CONF
+
+
+class BuildFailureWeigher(weights.BaseHostWeigher):
+    def weight_multiplier(self):
+        """Override the weight multiplier. Note this is negated."""
+        return -1 * CONF.filter_scheduler.build_failure_weight_multiplier
+
+    def _weigh_object(self, host_state, weight_properties):
+        """Higher weights win. Our multiplier is negative, so reduce our
+        weight by number of failed builds.
+        """
+        return host_state.failed_builds
diff --git a/nova/test.py b/nova/test.py
index 1294d90198..031618a71d 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -330,6 +330,11 @@ class TestCase(testtools.TestCase):
         # NOTE(mikal): make sure we don't load a privsep helper accidentally
         self.useFixture(nova_fixtures.PrivsepNoHelperFixture())

+        # FIXME(danms): Disable this for all tests by default to avoid breaking
+        # any that depend on default/previous ordering
+        self.flags(build_failure_weight_multiplier=0.0,
+                   group='filter_scheduler')
+
     def _setup_cells(self):
         """Setup a normal cellsv2 environment.

diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index b4d91b0bf1..8f522a4e1d 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -73,6 +73,7 @@ class ServersTestBase(integrated_helpers._IntegratedTestBase):
     _min_count_parameter = 'min_count'

     def setUp(self):
+        self.computes = {}
         super(ServersTestBase, self).setUp()
         # The network service is called as part of server creates but no
         # networks have been populated in the db, so stub the methods.
@@ -133,6 +134,30 @@ class ServersTest(ServersTestBase):
         for server in servers:
             LOG.debug("server: %s", server)

+    def _get_node_build_failures(self):
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+        return {
+            node.hypervisor_hostname: int(node.stats.get('failed_builds', 0))
+            for node in computes}
+
+    def _run_periodics(self):
+        """Run the update_available_resource task on every compute manager
+
+        This runs periodics on the computes in an undefined order; some child
+        class redefined this function to force a specific order.
+        """
+
+        if self.compute.host not in self.computes:
+            self.computes[self.compute.host] = self.compute
+
+        ctx = context.get_admin_context()
+        for compute in self.computes.values():
+            LOG.info('Running periodic for compute (%s)',
+                     compute.manager.host)
+            compute.manager.update_available_resource(ctx)
+        LOG.info('Finished with periodics')
+
     def test_create_server_with_error(self):
         # Create a server which will enter error state.

@@ -154,6 +179,12 @@ class ServersTest(ServersTestBase):
         self.assertEqual('ERROR', found_server['status'])
         self._delete_server(created_server_id)

+        # We should have no (persisted) build failures until we update
+        # resources, after which we should have one
+        self.assertEqual([0], list(self._get_node_build_failures().values()))
+        self._run_periodics()
+        self.assertEqual([1], list(self._get_node_build_failures().values()))
+
     def _test_create_server_with_error_with_retries(self):
         # Create a server which will enter error state.

@@ -161,6 +192,7 @@ class ServersTest(ServersTestBase):
         self.addCleanup(fake.restore_nodes)
         self.flags(host='host2')
         self.compute2 = self.start_service('compute', host='host2')
+        self.computes['compute2'] = self.compute2

         fails = []

@@ -188,11 +220,17 @@ class ServersTest(ServersTestBase):
         self.flags(max_attempts=2, group='scheduler')
         fails = self._test_create_server_with_error_with_retries()
         self.assertEqual(2, fails)
+        self._run_periodics()
+        self.assertEqual(
+            [1, 1], list(self._get_node_build_failures().values()))

     def test_create_server_with_error_with_no_retries(self):
         self.flags(max_attempts=1, group='scheduler')
         fails = self._test_create_server_with_error_with_retries()
         self.assertEqual(1, fails)
+        self._run_periodics()
+        self.assertEqual(
+            [0, 1], list(sorted(self._get_node_build_failures().values())))

     def test_create_and_delete_server(self):
         # Creates and deletes a server.
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index f5861cd64d..ada2c52527 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5178,24 +5178,19 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                 nil_out_host_and_node=True)

     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch('nova.objects.Service.get_by_compute_host')
-    def test_build_failures_disable_service(self, mock_service, mock_dbari):
+    @mock.patch('nova.compute.stats.Stats.build_failed')
+    def test_build_failures_reported(self, mock_failed, mock_dbari):
         mock_dbari.return_value = build_results.FAILED
         instance = objects.Instance(uuid=uuids.instance)
         for i in range(0, 10):
             self.compute.build_and_run_instance(self.context, instance, None,
                                                 None, None)
-        service = mock_service.return_value
-        self.assertTrue(service.disabled)
-        self.assertEqual('Auto-disabled due to 10 build failures',
-                         service.disabled_reason)
-        service.save.assert_called_once_with()
-        self.assertEqual(0, self.compute._failed_builds)
+
+        self.assertEqual(10, mock_failed.call_count)

     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch('nova.objects.Service.get_by_compute_host')
-    def test_build_failures_not_disable_service(self, mock_service,
-                                                mock_dbari):
+    @mock.patch('nova.compute.stats.Stats.build_failed')
+    def test_build_failures_not_reported(self, mock_failed, mock_dbari):
         self.flags(consecutive_build_service_disable_threshold=0,
                    group='compute')
         mock_dbari.return_value = build_results.FAILED
@@ -5203,14 +5198,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         for i in range(0, 10):
             self.compute.build_and_run_instance(self.context, instance, None,
                                                 None, None)
-        service = mock_service.return_value
-        self.assertFalse(service.save.called)
-        self.assertEqual(10, self.compute._failed_builds)
+
+        mock_failed.assert_not_called()

     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch('nova.objects.Service.get_by_compute_host')
-    def test_transient_build_failures_no_disable_service(self, mock_service,
-                                                          mock_dbari):
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    def test_transient_build_failures_no_report(self, mock_succeeded,
+                                                mock_failed,
+                                                mock_dbari):
         results = [build_results.FAILED,
                    build_results.ACTIVE,
                    build_results.RESCHEDULED]
@@ -5226,31 +5222,34 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         for i in range(0, 10):
             self.compute.build_and_run_instance(self.context, instance, None,
                                                 None, None)
-        service = mock_service.return_value
-        self.assertFalse(service.save.called)
-        self.assertEqual(0, self.compute._failed_builds)
+
+        self.assertEqual(2, mock_failed.call_count)
+        self.assertEqual(8, mock_succeeded.call_count)

     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch('nova.objects.Service.get_by_compute_host')
-    def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    def test_build_reschedules_reported(self, mock_succeeded,
+                                        mock_failed,
+                                        mock_dbari):
         mock_dbari.return_value = build_results.RESCHEDULED
         instance = objects.Instance(uuid=uuids.instance)
         for i in range(0, 10):
             self.compute.build_and_run_instance(self.context, instance, None,
                                                 None, None)
-        service = mock_service.return_value
-        self.assertTrue(service.disabled)
-        self.assertEqual('Auto-disabled due to 10 build failures',
-                         service.disabled_reason)
-        service.save.assert_called_once_with()
-        self.assertEqual(0, self.compute._failed_builds)
+
+        self.assertEqual(10, mock_failed.call_count)
+        mock_succeeded.assert_not_called()

     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch('nova.objects.Service.get_by_compute_host')
     @mock.patch('nova.exception_wrapper._emit_exception_notification')
     @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
-    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
-                                              mock_service, mock_dbari):
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    def test_build_exceptions_reported(self, mock_succeeded,
+                                       mock_failed,
+                                       mock_if, mock_notify,
+                                       mock_dbari):
         mock_dbari.side_effect = test.TestingException()
         instance = objects.Instance(uuid=uuids.instance,
                                     task_state=None)
@@ -5259,12 +5258,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                               self.compute.build_and_run_instance,
                               self.context, instance, None,
                               None, None)
-        service = mock_service.return_value
-        self.assertTrue(service.disabled)
-        self.assertEqual('Auto-disabled due to 10 build failures',
-                         service.disabled_reason)
-        service.save.assert_called_once_with()
-        self.assertEqual(0, self.compute._failed_builds)
+
+        self.assertEqual(10, mock_failed.call_count)
+        mock_succeeded.assert_not_called()

     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index f57d4c10a1..232c9120eb 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -1149,7 +1149,7 @@ class TestInitComputeNode(BaseTestCase):
             ram_allocation_ratio=1.0,
             cpu_allocation_ratio=1.0,
             disk_allocation_ratio=1.0,
-            stats={},
+            stats={'failed_builds': 0},
             pci_device_pools=objects.PciDevicePoolList(objects=[]),
             uuid=uuids.compute_node_uuid
         )
diff --git a/nova/tests/unit/compute/test_stats.py b/nova/tests/unit/compute/test_stats.py
index 3010a5d5cb..f58d8bdad4 100644
--- a/nova/tests/unit/compute/test_stats.py
+++ b/nova/tests/unit/compute/test_stats.py
@@ -238,3 +238,19 @@ class StatsTestCase(test.NoDBTestCase):

         self.assertEqual(0, len(self.stats))
         self.assertEqual(0, len(self.stats.states))
+
+    def test_build_failed_succeded(self):
+        self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
+        self.stats.build_failed()
+        self.assertEqual(1, self.stats['failed_builds'])
+        self.stats.build_failed()
+        self.assertEqual(2, self.stats['failed_builds'])
+        self.stats.build_succeeded()
+        self.assertEqual(0, self.stats['failed_builds'])
+        self.stats.build_succeeded()
+        self.assertEqual(0, self.stats['failed_builds'])
+
+    def test_build_succeeded_first(self):
+        self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
+        self.stats.build_succeeded()
+        self.assertEqual(0, self.stats['failed_builds'])
diff --git a/nova/tests/unit/scheduler/test_caching_scheduler.py b/nova/tests/unit/scheduler/test_caching_scheduler.py
index dd72e73a26..2760f92629 100644
--- a/nova/tests/unit/scheduler/test_caching_scheduler.py
+++ b/nova/tests/unit/scheduler/test_caching_scheduler.py
@@ -163,6 +163,7 @@ class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase):
         host_state.ram_allocation_ratio = 1.5
         host_state.disk_allocation_ratio = 1.0
         host_state.metrics = objects.MonitorMetricList(objects=[])
+        host_state.failed_builds = 0
         return host_state

     @mock.patch('nova.db.instance_extra_get_by_instance_uuid',
diff --git a/nova/tests/unit/scheduler/weights/test_weights_compute.py b/nova/tests/unit/scheduler/weights/test_weights_compute.py
new file mode 100644
index 0000000000..7e6e3814c9
--- /dev/null
+++ b/nova/tests/unit/scheduler/weights/test_weights_compute.py
@@ -0,0 +1,57 @@
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+"""
+Tests For Scheduler build failure weights.
+"""
+
+from nova.scheduler import weights
+from nova.scheduler.weights import compute
+from nova import test
+from nova.tests.unit.scheduler import fakes
+
+
+class BuildFailureWeigherTestCase(test.NoDBTestCase):
+    def setUp(self):
+        super(BuildFailureWeigherTestCase, self).setUp()
+        self.weight_handler = weights.HostWeightHandler()
+        self.weighers = [compute.BuildFailureWeigher()]
+
+    def _get_weighed_host(self, hosts):
+        return self.weight_handler.get_weighed_objects(self.weighers,
+                                                        hosts, {})
+
+    def _get_all_hosts(self):
+        host_values = [
+            ('host1', 'node1', {'failed_builds': 0}),
+            ('host2', 'node2', {'failed_builds': 1}),
+            ('host3', 'node3', {'failed_builds': 10}),
+            ('host4', 'node4', {'failed_builds': 100})
+        ]
+        return [fakes.FakeHostState(host, node, values)
+                for host, node, values in host_values]
+
+    def test_build_failure_weigher_disabled(self):
+        self.flags(build_failure_weight_multiplier=0.0,
+                   group='filter_scheduler')
+        hosts = self._get_all_hosts()
+        weighed_hosts = self._get_weighed_host(hosts)
+        self.assertTrue(all([wh.weight == 0.0
+                             for wh in weighed_hosts]))
+
+    def test_build_failure_weigher_scaled(self):
+        self.flags(build_failure_weight_multiplier=1000.0,
+                   group='filter_scheduler')
+        hosts = self._get_all_hosts()
+        weighed_hosts = self._get_weighed_host(hosts)
+        self.assertEqual([0, -10, -100, -1000],
+                         [wh.weight for wh in weighed_hosts])
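For reviewers who want to sanity-check the new weigher's arithmetic, here is a rough standalone sketch (plain Python, not Nova code; the helper name and the min-max normalization step are assumptions inferred from the expectations in test_build_failure_weigher_scaled above) of how a host's failed_builds count turns into a scheduling weight: per-host counts are normalized to the 0..1 range across the candidate hosts and then scaled by the negated build_failure_weight_multiplier, so the host with the most recent failures is pushed furthest down the list.

    # Standalone illustration of the BuildFailureWeigher arithmetic exercised
    # by test_build_failure_weigher_scaled. This is NOT Nova code; the helper
    # name is made up for the example.

    def weigh_hosts(failed_builds_per_host, multiplier=1000.0):
        """Return one weight per host: normalized failed_builds scaled by the
        negated multiplier, mirroring weight_multiplier() returning
        -1 * build_failure_weight_multiplier.
        """
        raw = list(failed_builds_per_host)
        lo, hi = min(raw), max(raw)
        if hi == lo:
            # All hosts tied: normalization yields 0.0 for every host.
            normalized = [0.0 for _ in raw]
        else:
            normalized = [(value - lo) / float(hi - lo) for value in raw]
        return [-1 * multiplier * norm for norm in normalized]


    if __name__ == '__main__':
        # Matches the expectation in the new unit test: hosts with 0, 1, 10
        # and 100 recent build failures weigh in at 0, -10, -100 and -1000.
        print(weigh_hosts([0, 1, 10, 100], multiplier=1000.0))
        # -> [-0.0, -10.0, -100.0, -1000.0]

In a deployment this only takes effect when [compute]/consecutive_build_service_disable_threshold stays nonzero on the computes (so failed_builds is reported at all) and [filter_scheduler]/build_failure_weight_multiplier is nonzero on the scheduler; the patch keeps the old option name but repurposes it as an on/off switch for the reporting.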