summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2022-08-17 20:27:17 +0000
committerGerrit Code Review <review@openstack.org>2022-08-17 20:27:17 +0000
commit4b9eba6a9d4bad169ecf3ebfbef06ccfce763f35 (patch)
treed2dff4ace9ebfdbcf8d19fcb7e3f1108cc7b6e9f
parent8097c2b2153ff952a266395d4e351fc39f914c6b (diff)
parentbc5fc2bc688056bc18cf3ae581d8e23592d110da (diff)
downloadnova-4b9eba6a9d4bad169ecf3ebfbef06ccfce763f35.tar.gz
Merge "[ironic] Minimize window for a resource provider to be lost" into stable/victoria
-rw-r--r--nova/tests/unit/virt/ironic/test_driver.py12
-rw-r--r--nova/virt/ironic/driver.py16
-rw-r--r--releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml13
3 files changed, 40 insertions, 1 deletions
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 9b8a6a0472..801846c4dd 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -3441,6 +3441,9 @@ class NodeCacheTestCase(test.NoDBTestCase):
mock_instances.return_value = instances
mock_nodes.return_value = nodes
mock_hosts.side_effect = hosts
+ parent_mock = mock.MagicMock()
+ parent_mock.attach_mock(mock_nodes, 'get_node_list')
+ parent_mock.attach_mock(mock_instances, 'get_uuids_by_host')
if not can_send_146:
mock_can_send.side_effect = (
exception.IronicAPIVersionNotAvailable(version='1.46'))
@@ -3453,6 +3456,15 @@ class NodeCacheTestCase(test.NoDBTestCase):
self.driver._refresh_cache()
+ # assert if get_node_list() is called before get_uuids_by_host()
+ parent_mock.assert_has_calls(
+ [
+ mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS,
+ **kwargs),
+ mock.call.get_uuids_by_host(mock.ANY, self.host)
+ ]
+ )
+
mock_hash_ring.assert_called_once_with(mock.ANY)
mock_instances.assert_called_once_with(mock.ANY, self.host)
mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS,
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index aa00ef6839..3bd4d18ecc 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -788,10 +788,15 @@ class IronicDriver(virt_driver.ComputeDriver):
def _refresh_cache(self):
ctxt = nova_context.get_admin_context()
self._refresh_hash_ring(ctxt)
- instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
node_cache = {}
def _get_node_list(**kwargs):
+ # NOTE(TheJulia): This call can take a substantial amount
+ # of time as it may be attempting to retrieve thousands of
+ # baremetal nodes. Depending on the version of Ironic,
+ # this can be as long as 2-10 seconds per every thousand
+ # nodes, and this call may retrieve all nodes in a deployment,
+ # depending on if any filter paramters are applied.
return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
# NOTE(jroll) if partition_key is set, we need to limit nodes that
@@ -815,6 +820,15 @@ class IronicDriver(virt_driver.ComputeDriver):
else:
nodes = _get_node_list()
+ # NOTE(saga): As _get_node_list() will take a long
+ # time to return in large clusters we need to call it before
+ # get_uuids_by_host() method. Otherwise the instances list we get from
+ # get_uuids_by_host() method will become stale.
+ # A stale instances list can cause a node that is managed by this
+ # compute host to be excluded in error and cause the compute node
+ # to be orphaned and associated resource provider to be deleted.
+ instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
+
for node in nodes:
# NOTE(jroll): we always manage the nodes for instances we manage
if node.instance_uuid in instances:
diff --git a/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml
new file mode 100644
index 0000000000..a49d64c6ac
--- /dev/null
+++ b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+ - |
+ Minimizes a race condition window when using the ``ironic`` virt driver
+ where the data generated for the Resource Tracker may attempt to compare
+ potentially stale instance information with the latest known baremetal
+ node information. While this doesn't completely prevent nor resolve the
+ underlying race condition identified in
+ `bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
+ this change allows Nova to have the latest state information, as opposed
+ to state information which may be out of date due to the time which it may
+ take to retrieve the status from Ironic. This issue was most observable
+ on baremetal clusters with several thousand physical nodes.