summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLucas Alvares Gomes <lucasagomes@gmail.com>2014-06-18 17:19:37 +0100
committerLucas Alvares Gomes <lucasagomes@gmail.com>2014-07-24 17:44:10 +0100
commit8b554deed62d86d19d7459ab1ac36cf2dc27745a (patch)
tree086f958d21a7668c7f10cc97194413c520d9369c
parent401eae8a89cee131198325b89a2c7208d74d94c2 (diff)
downloadironic-8b554deed62d86d19d7459ab1ac36cf2dc27745a.tar.gz
Fix nodes left in an incosistent state if no workers
This patch is fixing the problem of leaving the nodes in an inconsistent state if there's no free conductor workers available to deploy or the tear down a node, the patch is using the set_spawn_error_hook() method of TaskManager to run some custom code that will rollback the nodes to the previous provision_state and target_provision_state in case NoFreeConductorWorker is raised. Closes-Bug: #1331494 Change-Id: I5d6e8e2c69cbdf1f9abe169afe617aa79783e57d
-rw-r--r--ironic/conductor/manager.py54
-rw-r--r--ironic/tests/conductor/test_manager.py27
2 files changed, 73 insertions, 8 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index d8162525d..ad81df4a9 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -53,6 +53,7 @@ from oslo import messaging
from ironic.common import driver_factory
from ironic.common import exception
from ironic.common import hash_ring as hash
+from ironic.common import i18n
from ironic.common import neutron
from ironic.common import states
from ironic.conductor import task_manager
@@ -60,7 +61,6 @@ from ironic.conductor import utils
from ironic.db import api as dbapi
from ironic import objects
from ironic.openstack.common import excutils
-from ironic.openstack.common.gettextutils import _LI
from ironic.openstack.common import lockutils
from ironic.openstack.common import log
from ironic.openstack.common import periodic_task
@@ -68,6 +68,9 @@ from ironic.openstack.common import periodic_task
MANAGER_TOPIC = 'ironic.conductor_manager'
WORKER_SPAWN_lOCK = "conductor_worker_spawn"
+_LW = i18n._LW
+_LI = i18n._LI
+
LOG = log.getLogger(__name__)
conductor_opts = [
@@ -340,6 +343,34 @@ class ConductorManager(periodic_task.PeriodicTasks):
method=driver_method,
**info)
+ def _provisioning_error_handler(self, e, node, context, provision_state,
+ target_provision_state):
+ """Set the node's provisioning states if error occurs.
+
+ This hook gets called upon an exception being raised when spawning
+ the worker to do the deployment or tear down of a node.
+
+ :param e: the exception object that was raised.
+ :param node: an Ironic node object.
+ :param context: security context.
+ :param provision_state: the provision state to be set on
+ the node.
+ :param target_provision_state: the target provision state to be
+ set on the node.
+
+ """
+ if isinstance(e, exception.NoFreeConductorWorker):
+ node.provision_state = provision_state
+ node.target_provision_state = target_provision_state
+ node.last_error = (_("No free conductor workers available"))
+ node.save(context)
+ LOG.warning(_LW("No free conductor workers available to perform "
+ "an action on node %(node)s, setting node's "
+ "provision_state back to %(prov_state)s and "
+ "target_provision_state to %(tgt_prov_state)s."),
+ {'node': node.uuid, 'prov_state': provision_state,
+ 'tgt_prov_state': target_provision_state})
+
@messaging.expected_exceptions(exception.NoFreeConductorWorker,
exception.NodeLocked,
exception.NodeInMaintenance,
@@ -395,11 +426,21 @@ class ConductorManager(periodic_task.PeriodicTasks):
"RPC do_node_deploy failed to validate deploy info. "
"Error: %(msg)s") % {'msg': e})
+ # Save the previous states so we can rollback the node to a
+ # consistent state in case there's no free workers to do the
+ # deploy work
+ previous_prov_state = node.provision_state
+ previous_tgt_provision_state = node.target_provision_state
+
# Set target state to expose that work is in progress
node.provision_state = states.DEPLOYING
node.target_provision_state = states.DEPLOYDONE
node.last_error = None
node.save(context)
+
+ task.set_spawn_error_hook(self._provisioning_error_handler, node,
+ context, previous_prov_state,
+ previous_tgt_provision_state)
task.spawn_after(self._spawn_worker, self._do_node_deploy,
context, task)
@@ -466,10 +507,21 @@ class ConductorManager(periodic_task.PeriodicTasks):
"RPC do_node_tear_down failed to validate deploy info. "
"Error: %(msg)s") % {'msg': e})
+ # save the previous states so we can rollback the node to a
+ # consistent state in case there's no free workers to do the
+ # tear down work
+ previous_prov_state = node.provision_state
+ previous_tgt_provision_state = node.target_provision_state
+
+ # set target state to expose that work is in progress
node.provision_state = states.DELETING
node.target_provision_state = states.DELETED
node.last_error = None
node.save(context)
+
+ task.set_spawn_error_hook(self._provisioning_error_handler, node,
+ context, previous_prov_state,
+ previous_tgt_provision_state)
task.spawn_after(self._spawn_worker, self._do_node_tear_down,
context, task)
diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py
index 2e3c20e3f..76dfeb8dd 100644
--- a/ironic/tests/conductor/test_manager.py
+++ b/ironic/tests/conductor/test_manager.py
@@ -720,7 +720,12 @@ class ManagerTestCase(tests_db_base.DbTestCase):
mock_spawn.assert_called_once_with(mock.ANY, mock.ANY, mock.ANY)
def test_do_node_deploy_worker_pool_full(self):
- node = obj_utils.create_test_node(self.context, driver='fake')
+ prv_state = states.NOSTATE
+ tgt_prv_state = states.NOSTATE
+ node = obj_utils.create_test_node(self.context,
+ provision_state=prv_state,
+ target_provision_state=tgt_prv_state,
+ last_error=None, driver='fake')
self._start_service()
with mock.patch.object(self.service, '_spawn_worker') as mock_spawn:
@@ -733,8 +738,10 @@ class ManagerTestCase(tests_db_base.DbTestCase):
self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0])
self.service._worker_pool.waitall()
node.refresh()
- # This is a sync operation last_error should be None.
- self.assertIsNone(node.last_error)
+ # Make sure things were rolled back
+ self.assertEqual(prv_state, node.provision_state)
+ self.assertEqual(tgt_prv_state, node.target_provision_state)
+ self.assertIsNotNone(node.last_error)
# Verify reservation has been cleared.
self.assertIsNone(node.reservation)
@@ -818,10 +825,14 @@ class ManagerTestCase(tests_db_base.DbTestCase):
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
def test_do_node_tear_down_worker_pool_full(self, mock_spawn):
+ prv_state = states.ACTIVE
+ tgt_prv_state = states.NOSTATE
fake_instance_info = {'foo': 'bar'}
node = obj_utils.create_test_node(self.context, driver='fake',
- provision_state=states.ACTIVE,
- instance_info=fake_instance_info)
+ provision_state=prv_state,
+ target_provision_state=tgt_prv_state,
+ instance_info=fake_instance_info,
+ last_error=None)
self._start_service()
mock_spawn.side_effect = exception.NoFreeConductorWorker()
@@ -833,10 +844,12 @@ class ManagerTestCase(tests_db_base.DbTestCase):
self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0])
self.service._worker_pool.waitall()
node.refresh()
- # This is a sync operation last_error should be None.
- self.assertIsNone(node.last_error)
# Assert instance_info was not touched
self.assertEqual(fake_instance_info, node.instance_info)
+ # Make sure things were rolled back
+ self.assertEqual(prv_state, node.provision_state)
+ self.assertEqual(tgt_prv_state, node.target_provision_state)
+ self.assertIsNotNone(node.last_error)
# Verify reservation has been cleared.
self.assertIsNone(node.reservation)