summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHarald Jensås <hjensas@redhat.com>2022-04-28 09:22:14 +0200
committerHarald Jensås <hjensas@redhat.com>2022-04-29 13:50:26 +0000
commit86e326a59126a24024649e8454b9c9253206c58b (patch)
treee065cbb844edd872cb998ad1e509fff2fcda2ff0
parente5626ab9c7d1b71f078b90e4c785a4aaff6d05b3 (diff)
downloadironic-86e326a59126a24024649e8454b9c9253206c58b.tar.gz
Exclude current conductor from offline_conductors
In some cases the current conductor may have failed to updated the heartbeat timestamp due to failure of resource starvation. When this occurs the dbapi get_offline_conductors method will include the current conductor in its return value. In this scenario the conductor may end up forcefully remove node reservations or allocations from itself, triggering takeover which fail on-going operations. This change adds a wrapper to exclude the current conductor. The wrapper will log a warning to raise the issue. Related-Bug: #1970484 Stroy: 2010016 Task: 45204 Depends-On: https://review.opendev.org/839910 Change-Id: I6a8f38934b475f792433be6f0882540b82ca26c1 (cherry picked from commit 4cf0147e86039e8fa2232c3de6d45f1405a515d0)
-rw-r--r--ironic/conductor/manager.py6
-rw-r--r--ironic/conductor/utils.py21
-rw-r--r--ironic/tests/unit/conductor/test_utils.py10
-rw-r--r--releasenotes/notes/exclude-current-conductor-from-offline-conductors-2e2ef401a8b7d7e8.yaml12
4 files changed, 47 insertions, 2 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index d11224852..7f6470e38 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1604,7 +1604,8 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
- offline_conductors = self.dbapi.get_offline_conductors()
+ offline_conductors = utils.exclude_current_conductor(
+ self.host, self.dbapi.get_offline_conductors())
if not offline_conductors:
return
@@ -3436,7 +3437,8 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
- offline_conductors = self.dbapi.get_offline_conductors(field='id')
+ offline_conductors = utils.exclude_current_conductor(
+ self.conductor.id, self.dbapi.get_offline_conductors(field='id'))
for conductor_id in offline_conductors:
filters = {'state': states.ALLOCATING,
'conductor_affinity': conductor_id}
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 4a0d68a5d..b418d9d0a 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -1671,3 +1671,24 @@ def update_image_type(context, node):
'image_type',
images.IMAGE_TYPE_WHOLE_DISK if iwdi else images.IMAGE_TYPE_PARTITION)
return True
+
+
+def exclude_current_conductor(current_conductor, offline_conductors):
+ """Wrapper to exclude current conductor from offline_conductors
+
+ In some cases the current conductor may have failed to update
+ the heartbeat timestamp due to failure or resource starvation.
+ When this occurs the dbapi get_offline_conductors method will
+ include the current conductor in its return value.
+
+ :param current_conductor: id or hostname of the current conductor
+ :param offline_conductors: List of offline conductors.
+ :return: List of offline conductors, excluding current conductor
+ """
+ if current_conductor in offline_conductors:
+ LOG.warning('Current conductor %s will be excluded from offline '
+ 'conductors. Conductor heartbeat has failed to update the '
+ 'database timestamp. This is sign of resource starvation.',
+ current_conductor)
+
+ return [x for x in offline_conductors if x != current_conductor]
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index 5363fe801..7d8e70a1f 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -1921,6 +1921,16 @@ class MiscTestCase(db_base.DbTestCase):
conductor_utils.restore_power_state_if_needed(task, power_state)
self.assertEqual(0, power_action_mock.call_count)
+ @mock.patch.object(conductor_utils.LOG, 'warning', autospec=True)
+ def test_exclude_current_conductor(self, mock_log):
+ current_conductor = 'foo'
+ offline_conductos = ['foo', 'bar']
+ result = conductor_utils.exclude_current_conductor(current_conductor,
+ offline_conductos)
+ self.assertTrue(mock_log.called)
+ self.assertIn('bar', result)
+ self.assertNotIn('foo', result)
+
class ValidateInstanceInfoTraitsTestCase(tests_base.TestCase):
diff --git a/releasenotes/notes/exclude-current-conductor-from-offline-conductors-2e2ef401a8b7d7e8.yaml b/releasenotes/notes/exclude-current-conductor-from-offline-conductors-2e2ef401a8b7d7e8.yaml
new file mode 100644
index 000000000..c7f3e4acc
--- /dev/null
+++ b/releasenotes/notes/exclude-current-conductor-from-offline-conductors-2e2ef401a8b7d7e8.yaml
@@ -0,0 +1,12 @@
+---
+fixes:
+ - |
+ Fixes an issue where a conductor would attempt local takeover. In case of
+ heartbeat failure due to resource starvation, the current conductor was
+ detected as offline when querying the database. In this scenario the
+ conductor would forcibly remove reservations of it's own and initiate
+ takeover. Current conductor is now excluded from the list of offline
+ conductors, so that local takeover does not occur for this case. A warning
+ is logged to highlight the potential resource starvation issue.
+ See bug: `2010016 <https://storyboard.openstack.org/#!/story/2010016>`_.
+