-rw-r--r--ironic/common/states.py19
-rw-r--r--ironic/conductor/base_manager.py10
-rw-r--r--ironic/conductor/manager.py128
-rw-r--r--ironic/conductor/utils.py104
-rw-r--r--ironic/conductor/verify.py6
-rw-r--r--ironic/conf/conductor.py51
-rw-r--r--ironic/db/api.py18
-rw-r--r--ironic/db/sqlalchemy/api.py71
-rw-r--r--ironic/tests/unit/conductor/test_base_manager.py12
-rw-r--r--ironic/tests/unit/conductor/test_manager.py165
-rw-r--r--ironic/tests/unit/conductor/test_utils.py71
-rw-r--r--ironic/tests/unit/conductor/test_verify.py6
12 files changed, 622 insertions, 39 deletions
diff --git a/ironic/common/states.py b/ironic/common/states.py
index 3b0cbe18a..89b710189 100644
--- a/ironic/common/states.py
+++ b/ironic/common/states.py
@@ -297,6 +297,25 @@ ALLOCATING = 'allocating'
# States ERROR and ACTIVE are reused.
+###########################
+# History Event State Types
+###########################
+
+PROVISIONING = "provisioning"
+CLEANING = "cleaning"
+DEPLOYING = "deploying"
+TAKEOVER = "takeover"
+INTROSPECTION = "introspection"
+RESCUE = "rescue"
+CONDUCTOR = "conductor"
+TRANSITION = "transition"
+STARTFAIL = "startup failure"
+UNPROVISION = "unprovision"
+ADOPTION = "adoption"
+CONSOLE = "console"
+MONITORING = "monitoring"
+VERIFY = "verify"
+
#####################
# State machine model
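For context on how these event type constants are consumed, here is a minimal usage sketch (not part of the change itself) of the node_history_record helper this series adds to ironic/conductor/utils.py below; the task object and error text are illustrative assumptions:

    # Hypothetical example: recording a cleaning failure against a node.
    from ironic.common import states
    from ironic.conductor import utils

    def record_cleaning_failure(task, error_message):
        # The event type gives API consumers context for what activity
        # was underway; error=True also sets node.last_error.
        utils.node_history_record(task.node, event=error_message,
                                  event_type=states.CLEANING,
                                  error=True,
                                  user=task.context.user_id)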
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 53b2b48ae..43a3ff51e 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -37,6 +37,7 @@ from ironic.common import states
from ironic.conductor import allocations
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
+from ironic.conductor import utils
from ironic.conf import CONF
from ironic.db import api as dbapi
from ironic.drivers import base as driver_base
@@ -536,7 +537,10 @@ class BaseConductorManager(object):
err_handler=err_handler,
target_state=target_state)
else:
- task.node.last_error = last_error
+ utils.node_history_record(
+ task.node, event=last_error,
+ error=True,
+ event_type=states.TRANSITION)
task.process_event('fail', target_state=target_state)
except exception.NoFreeConductorWorker:
break
@@ -580,7 +584,9 @@ class BaseConductorManager(object):
LOG.error(msg)
# If starting console failed, set node console_enabled
# back to False and set node's last error.
- task.node.last_error = msg
+ utils.node_history_record(task.node, event=msg,
+ error=True,
+ event_type=states.STARTFAIL)
task.node.console_enabled = False
task.node.save()
notify_utils.emit_console_notification(
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 1abb75fe1..7fe4c5959 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager):
def handle_failure(e, errmsg, log_func=LOG.error):
utils.remove_node_rescue_password(node, save=False)
- node.last_error = errmsg % e
+ error = errmsg % e
+ utils.node_history_record(task.node, event=error,
+ event_type=states.RESCUE,
+ error=True,
+ user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing rescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager):
node = task.node
def handle_failure(e, errmsg, log_func=LOG.error):
- node.last_error = errmsg % e
+ error = errmsg % e
+ utils.node_history_record(task.node, event=error,
+ event_type=states.RESCUE,
+ error=True,
+ user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing unrescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager):
error_msg = _('Failed to clean up rescue after aborting '
'the operation')
node.refresh()
- node.last_error = error_msg
+ utils.node_history_record(node, event=error_msg,
+ event_type=states.RESCUE, error=True,
+ user=task.context.user_id)
node.maintenance = True
node.maintenance_reason = error_msg
node.fault = faults.RESCUE_ABORT_FAILURE
@@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager):
return
info_message = _('Rescue operation aborted for node %s.') % node.uuid
- last_error = _('By request, the rescue operation was aborted.')
+        # NOTE(TheJulia): This "error" is not an actual error; the operation
+        # has been aborted and the node returned to normal operation.
+ error = _('By request, the rescue operation was aborted.')
+ utils.node_history_record(task.node, event=error,
+ event_type=states.RESCUE,
+ error=False,
+ user=task.context.user_id)
node.refresh()
utils.remove_agent_url(node)
- node.last_error = last_error
node.save()
LOG.info(info_message)
@@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in tear_down of node %(node)s: %(err)s',
{'node': node.uuid, 'err': e})
- node.last_error = _("Failed to tear down. Error: %s") % e
+ error = _("Failed to tear down. Error: %s") % e
+ utils.node_history_record(task.node, event=error,
+ event_type=states.UNPROVISION,
+ error=True,
+ user=task.context.user_id)
task.process_event('fail')
else:
# NOTE(tenbrae): When tear_down finishes, the deletion is done,
@@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in aborting the inspection of '
'node %(node)s', {'node': node.uuid})
- node.last_error = _('Failed to abort inspection. '
- 'Error: %s') % e
+ error = _('Failed to abort inspection. '
+ 'Error: %s') % e
+ utils.node_history_record(task.node, event=error,
+ event_type=states.INTROSPECTION,
+ error=True,
+ user=task.context.user_id)
node.save()
- node.last_error = _('Inspection was aborted by request.')
+ error = _('Inspection was aborted by request.')
+ utils.node_history_record(task.node, event=error,
+ event_type=states.INTROSPECTION,
+ error=True,
+ user=task.context.user_id)
utils.wipe_token_and_url(task)
task.process_event('abort')
LOG.info('Successfully aborted inspection of node %(node)s',
@@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager):
if not task.node.maintenance and task.node.target_power_state:
old_state = task.node.target_power_state
task.node.target_power_state = None
- task.node.last_error = _('Pending power operation was '
- 'aborted due to conductor take '
- 'over')
+ error = _('Pending power operation was '
+ 'aborted due to conductor take '
+ 'over')
+ utils.node_history_record(task.node, event=error,
+ event_type=states.TAKEOVER,
+ error=True,
+ user=task.context.user_id)
+
task.node.save()
LOG.warning('Aborted pending power operation %(op)s '
'on node %(node)s due to conductor take over',
@@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# Wipe power state from being preserved as it is likely invalid.
node.power_state = states.NOSTATE
- node.last_error = msg
+ utils.node_history_record(task.node, event=msg,
+ event_type=states.ADOPTION,
+ error=True,
+ user=task.context.user_id)
task.process_event('fail')
@METRICS.timer('ConductorManager._do_takeover')
@@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# If taking over console failed, set node's console_enabled
# back to False and set node's last error.
- task.node.last_error = msg
+ utils.node_history_record(task.node, event=msg,
+ event_type=states.TAKEOVER,
+ error=True,
+ user=task.context.user_id)
task.node.console_enabled = False
console_error = True
else:
@@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager):
'Reason: %(error)s') % {'op': op,
'node': node.uuid,
'error': e})
- node.last_error = msg
+ utils.node_history_record(task.node, event=msg,
+ event_type=states.CONSOLE,
+ error=True,
+ user=task.context.user_id)
LOG.error(msg)
node.save()
notify_utils.emit_console_notification(
@@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager):
task.node.save()
return task.node
+ @METRICS.timer('ConductorManager.manage_node_history')
+ @periodics.periodic(
+ spacing=CONF.conductor.node_history_cleanup_interval,
+ enabled=(
+ CONF.conductor.node_history_cleanup_batch_count > 0
+ and CONF.conductor.node_history_max_entries != 0
+ )
+ )
+ def manage_node_history(self, context):
+ try:
+ self._manage_node_history(context)
+ except Exception as e:
+ LOG.error('Encountered error while cleaning node '
+ 'history records: %s', e)
+
+ def _manage_node_history(self, context):
+ """Periodic task to keep the node history tidy."""
+ max_batch = CONF.conductor.node_history_cleanup_batch_count
+        # NOTE(TheJulia): Asks the db for the list. Presently this just
+        # gets the node id and the list of record ids. If we incorporate
+        # a date constraint or similar handling, then the method will
+        # need to identify the explicit ID values to delete, and the
+        # deletion process will need to erase them in logical chunks.
+ entries_to_clean = self.dbapi.query_node_history_records_for_purge(
+ conductor_id=self.conductor.id)
+ count = 0
+ for node_id in entries_to_clean:
+ if count < max_batch:
+ # If we have not hit our total limit, proceed
+ if entries_to_clean[node_id]:
+ # if we have work to do on this node, proceed.
+ self.dbapi.bulk_delete_node_history_records(
+ entries_to_clean[node_id])
+ else:
+                LOG.warning('While cleaning up node history records, '
+                            'we reached the maximum number of records '
+                            'permitted in a single batch. If this '
+                            'warning is repeated, consider tuning the '
+                            'node history configuration options to be '
+                            'more aggressive by increasing the cleanup '
+                            'frequency or lowering the number of '
+                            'entries deleted per batch.')
+ break
+ count = count + len(entries_to_clean[node_id])
+ # Yield to other threads, since we also don't want to be
+ # looping tightly deleting rows as that will negatively
+ # impact DB access if done in excess.
+ eventlet.sleep(0)
+
@METRICS.timer('get_vendor_passthru_metadata')
def get_vendor_passthru_metadata(route_dict):
@@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,
old_power_state = node.power_state
node.power_state = actual_power_state
- node.last_error = msg
+ utils.node_history_record(task.node, event=msg,
+ event_type=states.MONITORING,
+ error=True)
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.POWER_FAILURE
@@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task):
node = task.node
def handle_failure(e, log_func=LOG.error):
- node.last_error = e
+ utils.node_history_record(task.node, event=e,
+ event_type=states.INTROSPECTION,
+ error=True, user=task.context.user_id)
task.process_event('fail')
log_func("Failed to inspect node %(node)s: %(err)s",
{'node': node.uuid, 'err': e})
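The batching rule in _manage_node_history above can be illustrated standalone. This sketch (with made-up record IDs, and assuming the purge query returns a dict of node ID to the list of history record IDs to delete) mirrors the conductor's check of the running count before each node's deletion:

    # Plan deletions under a batch cap, deferring overflow to the
    # next periodic run, as _manage_node_history does.
    def plan_deletions(entries_to_clean, max_batch):
        planned = []
        count = 0
        for node_id, record_ids in entries_to_clean.items():
            if count >= max_batch:
                break  # remaining nodes wait for the next pass
            if record_ids:
                planned.append(record_ids)
            count += len(record_ids)
        return planned

    # With max_batch=2, node 11's record is deferred to the next pass.
    assert plan_deletions({10: [1, 2], 11: [3]}, 2) == [[1, 2]]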
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index bbc489dfc..59b60a5da 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -39,6 +39,7 @@ from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.objects import fields
+from ironic.objects import node_history
LOG = log.getLogger(__name__)
CONF = cfg.CONF
@@ -245,9 +246,10 @@ def _can_skip_state_change(task, new_state):
curr_state = task.driver.power.get_power_state(task)
except Exception as e:
with excutils.save_and_reraise_exception():
- node['last_error'] = _(
+ error = _(
"Failed to change power state to '%(target)s'. "
"Error: %(error)s") % {'target': new_state, 'error': e}
+ node_history_record(node, event=error, error=True)
node['target_power_state'] = states.NOSTATE
node.save()
notify_utils.emit_power_set_notification(
@@ -329,12 +331,13 @@ def node_power_action(task, new_state, timeout=None):
except Exception as e:
with excutils.save_and_reraise_exception():
node['target_power_state'] = states.NOSTATE
- node['last_error'] = _(
+ error = _(
"Failed to change power state to '%(target_state)s' "
"by '%(new_state)s'. Error: %(error)s") % {
'target_state': target_state,
'new_state': new_state,
'error': e}
+ node_history_record(node, event=error, error=True)
node.save()
notify_utils.emit_power_set_notification(
task, fields.NotificationLevel.ERROR,
@@ -399,7 +402,9 @@ def provisioning_error_handler(e, node, provision_state,
# because it isn't updated on a failed deploy
node.provision_state = provision_state
node.target_provision_state = target_provision_state
- node.last_error = (_("No free conductor workers available"))
+ error = (_("No free conductor workers available"))
+ node_history_record(node, event=error, event_type=states.PROVISIONING,
+ error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@@ -483,7 +488,8 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
# For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
- node.last_error = errmsg
+ node_history_record(node, event=errmsg, event_type=states.CLEANING,
+ error=True)
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg
@@ -570,7 +576,8 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
errmsg = errmsg or logmsg
node = task.node
LOG.error(logmsg, exc_info=traceback)
- node.last_error = errmsg
+ node_history_record(node, event=errmsg, event_type=states.DEPLOYING,
+ error=True)
node.save()
cleanup_err = None
@@ -600,7 +607,9 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
wipe_deploy_internal_info(task)
if cleanup_err:
- node.last_error = cleanup_err
+ node_history_record(node, event=cleanup_err,
+ event_type=states.DEPLOYING,
+ error=True)
node.save()
# NOTE(tenbrae): there is no need to clear conductor_affinity
@@ -636,7 +645,8 @@ def abort_on_conductor_take_over(task):
else:
# For aborted deployment (and potentially other operations), just set
# the last_error accordingly.
- task.node.last_error = msg
+ node_history_record(task.node, event=msg, event_type=states.TAKEOVER,
+ error=True)
task.node.save()
LOG.warning('Aborted the current operation on node %s due to '
@@ -657,17 +667,22 @@ def rescuing_error_handler(task, msg, set_fail_state=True):
node_power_action(task, states.POWER_OFF)
task.driver.rescue.clean_up(task)
remove_agent_url(node)
- node.last_error = msg
+ node_history_record(task.node, event=msg, event_type=states.RESCUE,
+ error=True)
except exception.IronicException as e:
- node.last_error = (_('Rescue operation was unsuccessful, clean up '
- 'failed for node: %(error)s') % {'error': e})
+ error = (_('Rescue operation was unsuccessful, clean up '
+ 'failed for node: %(error)s') % {'error': e})
+ node_history_record(task.node, event=error, event_type=states.RESCUE,
+ error=True)
LOG.error(('Rescue operation was unsuccessful, clean up failed for '
'node %(node)s: %(error)s'),
{'node': node.uuid, 'error': e})
except Exception as e:
- node.last_error = (_('Rescue failed, but an unhandled exception was '
- 'encountered while aborting: %(error)s') %
- {'error': e})
+ error = (_('Rescue failed, but an unhandled exception was '
+ 'encountered while aborting: %(error)s') %
+ {'error': e})
+ node_history_record(task.node, event=error, event_type=states.RESCUE,
+ error=True)
LOG.exception('Rescue failed for node %(node)s, an exception was '
'encountered while aborting.', {'node': node.uuid})
finally:
@@ -708,7 +723,9 @@ def _spawn_error_handler(e, node, operation):
:param operation: the operation being performed on the node.
"""
if isinstance(e, exception.NoFreeConductorWorker):
- node.last_error = (_("No free conductor workers available"))
+ error = (_("No free conductor workers available"))
+ node_history_record(node, event=error, event_type=states.CONDUCTOR,
+ error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"%(operation)s on node %(node)s",
@@ -749,7 +766,9 @@ def power_state_error_handler(e, node, power_state):
if isinstance(e, exception.NoFreeConductorWorker):
node.power_state = power_state
node.target_power_state = states.NOSTATE
- node.last_error = (_("No free conductor workers available"))
+ error = (_("No free conductor workers available"))
+ node_history_record(node, event=error, event_type=states.CONDUCTOR,
+ error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@@ -1573,3 +1592,58 @@ def node_change_secure_boot(task, secure_boot_target):
{'state': secure_boot_target, 'node': task.node.uuid})
task.node.secure_boot = secure_boot_target
task.node.save()
+
+
+def node_history_record(node, conductor=None, event=None,
+ event_type=None, user=None,
+ error=False):
+ """Records a node history record
+
+ Adds an entry to the node history table with the appropriate fields
+ populated to ensure consistent experience by also updating the
+ node ``last_error`` field. Please note the event is only recorded
+ if the ``[conductor]node_history_max_size`` parameter is set to a
+ value greater than ``0``.
+
+ :param node: A node object from a task object. Required.
+    :param conductor: The hostname of the conductor. If not specified,
+                      this value is populated with the conductor FQDN.
+ :param event: The text to record to the node history table.
+ If no value is supplied, the method silently returns
+ to the caller.
+    :param event_type: The type of activity during which the event was
+                       encountered, either "provisioning", "monitoring",
+                       "cleaning", or whatever text a driver author
+                       wishes to supply based upon the activity. The
+                       purpose is to help guide an API consumer/operator
+                       to have a better contextual understanding of
+                       what was going on *when* the "event" occurred.
+ :param user: The user_id value which triggered the request,
+ if available.
+ :param error: Boolean value, default false, to signify if the event
+ is an error which should be recorded in the node
+ ``last_error`` field.
+ :returns: None. No value is returned by this method.
+ """
+    if not event:
+        # No event was supplied, so there is nothing to record.
+        return
+    if error:
+        # When the task exits out or is saved, the event
+        # or error is saved, but that is separate from creating an
+        # entry in the history table.
+        node.last_error = event
+ if not conductor:
+ conductor = CONF.host
+ if CONF.conductor.node_history:
+        # Node history recording is enabled, so we should
+        # record the entry.
+ # NOTE(TheJulia): DB API automatically adds in a uuid.
+ # TODO(TheJulia): At some point, we should allow custom severity.
+ node_history.NodeHistory(
+ node_id=node.id,
+            conductor=conductor,
+ user=user,
+ severity=error and "ERROR" or "INFO",
+ event=event,
+ event_type=event_type or "UNKNOWN").create()
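As a rough illustration of the defaulting behavior above (outside of the object layer, with a hypothetical helper name), the severity is derived from the error flag and the event type falls back to "UNKNOWN":

    # Sketch of the field derivation performed by node_history_record.
    def history_fields(event, event_type=None, error=False):
        return {
            'event': event,
            'severity': 'ERROR' if error else 'INFO',
            'event_type': event_type or 'UNKNOWN',
        }

    assert history_fields('agent timed out', 'deploying', error=True) == {
        'event': 'agent timed out',
        'severity': 'ERROR',
        'event_type': 'deploying',
    }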
diff --git a/ironic/conductor/verify.py b/ironic/conductor/verify.py
index 3c40c1e50..b611e7e4e 100644
--- a/ironic/conductor/verify.py
+++ b/ironic/conductor/verify.py
@@ -16,6 +16,7 @@ from oslo_log import log
from ironic.common import exception
from ironic.common.i18n import _
+from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.conductor import utils
@@ -67,5 +68,8 @@ def do_node_verify(task):
else:
task.process_event('done')
else:
- node.last_error = error
+ utils.node_history_record(task.node, event=error,
+ event_type=states.VERIFY,
+ error=True,
+ user=task.context.user_id)
task.process_event('fail')
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index cb52c45d4..61d6b3247 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -283,6 +283,57 @@ opts = [
'will not run during cleaning. If unset for an '
'inband clean step, will use the priority set in the '
'ramdisk.')),
+ cfg.BoolOpt('node_history',
+ default=True,
+ mutable=True,
+ help=_('Boolean value, default True, if node event history '
+ 'is to be recorded. Errors and other noteworthy '
+ 'events in relation to a node are journaled to a '
+ 'database table which incurs some additional load. '
+                       'A periodic task periodically removes old entries '
+ 'from the database. Please note, if this is disabled, '
+ 'the conductor will continue to purge entries as '
+ 'long as [conductor]node_history_cleanup_batch_count '
+ 'is not 0.')),
+ cfg.IntOpt('node_history_max_entries',
+ default=300,
+ min=0,
+ mutable=True,
+ help=_('Maximum number of history entries which will be stored '
+ 'in the database per node. Default is 300. This setting '
+ 'excludes the minimum number of days retained using the '
+ '[conductor]node_history_minimum_days setting.')),
+ cfg.IntOpt('node_history_cleanup_interval',
+ min=0,
+ default=86400,
+ mutable=False,
+ help=_('Interval in seconds at which node history entries '
+ 'can be cleaned up in the database. Setting to 0 '
+ 'disables the periodic task. Defaults to once a day, '
+ 'or 86400 seconds.')),
+ cfg.IntOpt('node_history_cleanup_batch_count',
+ min=0,
+ default=1000,
+ mutable=False,
+ help=_('The target number of node history records to purge '
+ 'from the database when performing clean-up. '
+                      'Deletes are performed by node, and a node with '
+                      'excess records will still have them deleted. '
+ 'Defaults to 1000. Operators who find node history '
+ 'building up may wish to '
+ 'lower this threshold and decrease the time between '
+ 'cleanup operations using the '
+ '``node_history_cleanup_interval`` setting.')),
+ cfg.IntOpt('node_history_minimum_days',
+ min=0,
+ default=0,
+ mutable=True,
+               help=_('The minimum number of days of node history '
+                      'entries to explicitly keep on hand in the '
+                      'database. This is independent of the [conductor]'
+                      'node_history_max_entries setting, as users of '
+                      'this setting are anticipated to need to retain '
+                      'history by policy.')),
]
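For operators experimenting with these options, the same oslo.config override pattern used by the unit tests later in this diff applies; the values here are purely illustrative:

    # Hypothetical tuning example, e.g. in a test or interactive session.
    from ironic.conf import CONF

    CONF.set_override('node_history_max_entries', 50, group='conductor')
    CONF.set_override('node_history_cleanup_interval', 3600,
                      group='conductor')
    CONF.set_override('node_history_cleanup_batch_count', 500,
                      group='conductor')
    CONF.set_override('node_history_minimum_days', 30, group='conductor')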
diff --git a/ironic/db/api.py b/ironic/db/api.py
index 5b71d32bc..77fa054e2 100644
--- a/ironic/db/api.py
+++ b/ironic/db/api.py
@@ -1380,3 +1380,21 @@ class Connection(object, metaclass=abc.ABCMeta):
(asc, desc)
:returns: A list of histories.
"""
+
+ @abc.abstractmethod
+ def query_node_history_records_for_purge(self, conductor_id):
+ """Utility method to identify nodes to clean history records for.
+
+ :param conductor_id: Id value for the conductor to perform this
+ query on behalf of.
+        :returns: A dictionary keyed by node database ID, with each
+                  value a list of history record IDs to delete.
+ """
+
+    @abc.abstractmethod
+    def bulk_delete_node_history_records(self, entries):
+        """Utility method to bulk delete node history entries.
+
+        :param entries: A list of node history record id's to be
+                        deleted.
+        """
diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py
index 716c422dd..8f654483f 100644
--- a/ironic/db/sqlalchemy/api.py
+++ b/ironic/db/sqlalchemy/api.py
@@ -2329,3 +2329,74 @@ class Connection(api.Connection):
query = query.filter_by(node_id=node_id)
return _paginate_query(models.NodeHistory, limit, marker,
sort_key, sort_dir, query)
+
+ def query_node_history_records_for_purge(self, conductor_id):
+ min_days = CONF.conductor.node_history_minimum_days
+ max_num = CONF.conductor.node_history_max_entries
+
+ with _session_for_read() as session:
+ # First, figure out our nodes.
+ nodes = session.query(
+ models.Node.id,
+ ).filter(
+ models.Node.conductor_affinity == conductor_id
+ )
+
+ # Build our query to get the node_id and record id.
+ query = session.query(
+ models.NodeHistory.node_id,
+ models.NodeHistory.id,
+ )
+
+ # Filter by the nodes
+ query = query.filter(
+ models.NodeHistory.node_id.in_(nodes)
+ ).order_by(
+ # Order in an ascending order as older is always first.
+ models.NodeHistory.created_at.asc()
+ )
+
+ # Filter by minimum days
+ if min_days > 0:
+ before = datetime.datetime.now() - datetime.timedelta(
+ days=min_days)
+ query = query.filter(
+ models.NodeHistory.created_at < before
+ )
+
+ # Build our result set
+ result_set = {}
+ for entry in query.all():
+ if entry[0] not in result_set:
+ result_set[entry[0]] = []
+ result_set[entry[0]].append(entry[1])
+
+ final_set = {}
+ # Generate our final set of entries which should be removed
+ # by accounting for the number of permitted entries.
+ for entry in result_set:
+ final_set[entry] = []
+ set_len = len(result_set[entry])
+ # Any count <= the maximum number is okay
+ if set_len > max_num:
+ # figure out how many entries need to be removed
+ num_to_remove = set_len - max_num
+ for i in range(0, num_to_remove):
+ final_set[entry].append(result_set[entry][i])
+                # Entries are selected from the front of the
+                # list, which holds the oldest records since we
+                # ordered ascending above, meaning the most
+                # recent entries for the node are retained.
+        return final_set
+
+ def bulk_delete_node_history_records(self, entries):
+ with _session_for_write() as session:
+ # Uses input entry list, selects entries matching those ids
+ # then deletes them and does not synchronize the session so
+ # sqlalchemy doesn't do extra un-necessary work.
+ session.query(
+ models.NodeHistory
+ ).filter(
+ models.NodeHistory.id.in_(entries)
+ ).delete(synchronize_session=False)
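A toy check of the retention arithmetic in query_node_history_records_for_purge, assuming records are ordered oldest-first as in the query above:

    # With max_num=2 and four records, the two oldest IDs are
    # selected for deletion and the two newest survive.
    records = [101, 102, 103, 104]  # ascending created_at order
    max_num = 2
    num_to_remove = max(len(records) - max_num, 0)
    assert records[:num_to_remove] == [101, 102]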
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index 886ab07ed..7ae2c74ee 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -542,6 +542,12 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
fields.NotificationStatus.START),
mock.call(mock.ANY, 'console_restore',
fields.NotificationStatus.ERROR)])
+ history = objects.NodeHistory.list_by_node_id(self.context,
+ test_node.id)
+ entry = history[0]
+ self.assertEqual('startup failure', entry['event_type'])
+ self.assertEqual('ERROR', entry['severity'])
+ self.assertIsNotNone(entry['event'])
@mock.patch.object(base_manager, 'LOG', autospec=True)
def test__start_consoles_node_locked(self, log_mock, mock_notify,
@@ -599,3 +605,9 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
node.refresh()
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+ history = objects.NodeHistory.list_by_node_id(self.context,
+ node.id)
+ entry = history[0]
+ self.assertEqual('transition', entry['event_type'])
+ self.assertEqual('ERROR', entry['severity'])
+ self.assertEqual('unknown err', entry['event'])
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 50d1d151d..e93b3cb78 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -3361,7 +3361,7 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
with task_manager.acquire(self.context, node.uuid) as task:
self.service._do_node_rescue_abort(task)
clean_up_mock.assert_called_once_with(task.driver.rescue, task)
- self.assertIsNotNone(task.node.last_error)
+ self.assertIsNone(task.node.last_error)
self.assertFalse(task.node.maintenance)
self.assertNotIn('agent_url', task.node.driver_internal_info)
@@ -8153,3 +8153,166 @@ class DoNodeInspectAbortTestCase(mgr_utils.CommonMixIn,
self.assertIn('Inspection was aborted', node.last_error)
self.assertNotIn('agent_url', node.driver_internal_info)
self.assertNotIn('agent_secret_token', node.driver_internal_info)
+
+
+class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin,
+ db_base.DbTestCase):
+
+ def setUp(self):
+ super(NodeHistoryRecordCleanupTestCase, self).setUp()
+ self.node1 = obj_utils.get_test_node(self.context,
+ driver='fake-hardware',
+ id=10,
+ uuid=uuidutils.generate_uuid(),
+ conductor_affinity=1)
+ self.node2 = obj_utils.get_test_node(self.context,
+ driver='fake-hardware',
+ id=11,
+ uuid=uuidutils.generate_uuid(),
+ conductor_affinity=1)
+ self.node3 = obj_utils.get_test_node(self.context,
+ driver='fake-hardware',
+ id=12,
+ uuid=uuidutils.generate_uuid(),
+ conductor_affinity=1)
+ self.nodes = [self.node1, self.node2, self.node3]
+ # Create the nodes, as the tasks need to operate across tables.
+ self.node1.create()
+ self.node2.create()
+ self.node3.create()
+ CONF.set_override('node_history_max_entries', 2, group='conductor')
+ CONF.set_override('node_history_cleanup_batch_count', 2,
+ group='conductor')
+ self._start_service()
+
+ def test_history_is_pruned_to_config(self):
+ for node in self.nodes:
+ for event in ['one', 'two', 'three']:
+ conductor_utils.node_history_record(node, event=event)
+ conductor_utils.node_history_record(self.node1, event="final")
+ self.service._manage_node_history(self.context)
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(8, len(events))
+ events = objects.NodeHistory.list_by_node_id(self.context, 10)
+ self.assertEqual('three', events[0].event)
+ self.assertEqual('final', events[1].event)
+ self.assertEqual(2, len(events))
+ events = objects.NodeHistory.list_by_node_id(self.context, 11)
+ self.assertEqual(3, len(events))
+ self.assertEqual('one', events[0].event)
+ self.assertEqual('two', events[1].event)
+ self.assertEqual('three', events[2].event)
+ events = objects.NodeHistory.list_by_node_id(self.context, 12)
+ self.assertEqual(3, len(events))
+ self.assertEqual('one', events[0].event)
+ self.assertEqual('two', events[1].event)
+ self.assertEqual('three', events[2].event)
+
+ def test_history_is_pruned_to_config_two_pass(self):
+ for node in self.nodes:
+ for event in ['one', 'two', 'three']:
+ conductor_utils.node_history_record(node, event=event)
+ conductor_utils.node_history_record(self.node1, event="final")
+ with mock.patch.object(manager.LOG, 'warning',
+ autospec=True) as mock_log:
+ self.service._manage_node_history(self.context)
+            # assert we did log a warning
+ self.assertEqual(1, mock_log.call_count)
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(8, len(events))
+ self.service._manage_node_history(self.context)
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(6, len(events))
+ events = objects.NodeHistory.list_by_node_id(self.context, 10)
+ self.assertEqual(2, len(events))
+ self.assertEqual('three', events[0].event)
+ self.assertEqual('final', events[1].event)
+ events = objects.NodeHistory.list_by_node_id(self.context, 11)
+ self.assertEqual(2, len(events))
+ self.assertEqual('two', events[0].event)
+ self.assertEqual('three', events[1].event)
+ events = objects.NodeHistory.list_by_node_id(self.context, 12)
+ self.assertEqual(2, len(events))
+ self.assertEqual('two', events[0].event)
+ self.assertEqual('three', events[1].event)
+
+ def test_history_is_pruned_from_all_nodes_one_pass(self):
+ CONF.set_override('node_history_cleanup_batch_count', 15,
+ group='conductor')
+ for node in self.nodes:
+ for event in ['one', 'two', 'three']:
+ conductor_utils.node_history_record(node, event=event)
+ self.service._manage_node_history(self.context)
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(6, len(events))
+
+ def test_history_pruning_no_work(self):
+ conductor_utils.node_history_record(self.node1, event='meow')
+ with mock.patch.object(self.dbapi,
+ 'bulk_delete_node_history_records',
+ autospec=True) as mock_delete:
+ self.service._manage_node_history(self.context)
+ mock_delete.assert_not_called()
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(1, len(events))
+
+ def test_history_pruning_not_other_conductor(self):
+ node = obj_utils.get_test_node(self.context,
+ driver='fake-hardware',
+ id=33,
+ uuid=uuidutils.generate_uuid(),
+ conductor_affinity=2)
+ # create node so it can be queried
+ node.create()
+ for i in range(0, 3):
+ conductor_utils.node_history_record(node, event='meow%s' % i)
+ with mock.patch.object(self.dbapi,
+ 'bulk_delete_node_history_records',
+ autospec=True) as mock_delete:
+ self.service._manage_node_history(self.context)
+ mock_delete.assert_not_called()
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(3, len(events))
+ self.assertEqual('meow0', events[0].event)
+
+ def test_history_is_pruned_to_config_with_days(self):
+ CONF.set_override('node_history_cleanup_batch_count', 15,
+ group='conductor')
+ CONF.set_override('node_history_minimum_days', 1,
+ group='conductor')
+ CONF.set_override('node_history_max_entries', 1, group='conductor')
+
+ old_date = datetime.datetime.now() - datetime.timedelta(days=7)
+ # Creates 18 entries
+ for node in self.nodes:
+ for event in ['oldone', 'oldtwo', 'oldthree']:
+ objects.NodeHistory(created_at=old_date,
+ event=event,
+ node_id=node.id).create()
+ for event in ['one', 'two', 'three']:
+ conductor_utils.node_history_record(node, event=event)
+
+ # 9 retained due to days, 3 to config
+ self.service._manage_node_history(self.context)
+ events = objects.NodeHistory.list(self.context)
+ self.assertEqual(12, len(events))
+ events = objects.NodeHistory.list_by_node_id(self.context, 10)
+ self.assertEqual(4, len(events))
+ self.assertEqual('oldthree', events[0].event)
+ self.assertEqual('one', events[1].event)
+ self.assertEqual('two', events[2].event)
+ self.assertEqual('three', events[3].event)
+ events = objects.NodeHistory.list_by_node_id(self.context, 11)
+ self.assertEqual(4, len(events))
+ self.assertEqual('oldthree', events[0].event)
+ self.assertEqual('one', events[1].event)
+ self.assertEqual('two', events[2].event)
+ self.assertEqual('three', events[3].event)
+ events = objects.NodeHistory.list_by_node_id(self.context, 12)
+ self.assertEqual(4, len(events))
+ self.assertEqual('oldthree', events[0].event)
+ self.assertEqual('one', events[1].event)
+ self.assertEqual('two', events[2].event)
+ self.assertEqual('three', events[3].event)
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index f7e16c618..1e563ec71 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -959,7 +959,7 @@ class NodeSoftPowerActionTestCase(db_base.DbTestCase):
self.assertIsNone(node['last_error'])
-class DeployingErrorHandlerTestCase(tests_base.TestCase):
+class DeployingErrorHandlerTestCase(db_base.DbTestCase):
def setUp(self):
super(DeployingErrorHandlerTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -972,6 +972,8 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.node.last_error = None
self.node.deploy_step = None
self.node.driver_internal_info = {}
+ self.node.id = obj_utils.create_test_node(self.context,
+ driver='fake-hardware').id
self.logmsg = "log message"
self.errmsg = "err message"
@@ -1023,6 +1025,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail')
+ self.assertIsNotNone(self.node.last_error)
def test_deploying_error_handler_cleanup_ironic_exception(self):
self._test_deploying_error_handler_cleanup(
@@ -1058,7 +1061,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.task.process_event.assert_called_once_with('fail')
-class ErrorHandlersTestCase(tests_base.TestCase):
+class ErrorHandlersTestCase(db_base.DbTestCase):
def setUp(self):
super(ErrorHandlersTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -1070,9 +1073,13 @@ class ErrorHandlersTestCase(tests_base.TestCase):
# strict typing of the node power state fields and would fail if passed
# a Mock object in constructors. A task context is also required for
# notifications.
+ fake_node = obj_utils.create_test_node(self.context,
+ driver='fake-hardware')
+
self.node.configure_mock(power_state=states.POWER_OFF,
target_power_state=states.POWER_ON,
- maintenance=False, maintenance_reason=None)
+ maintenance=False, maintenance_reason=None,
+ id=fake_node.id)
self.task.context = self.context
@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@@ -1401,6 +1408,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
'%(node)s, an exception was '
'encountered while aborting.',
{'node': self.node.uuid})
+ self.assertIsNotNone(self.node.last_error)
self.node.save.assert_called_once_with()
@mock.patch.object(conductor_utils.LOG, 'error', autospec=True)
@@ -2562,3 +2570,60 @@ class GetConfigDriveImageTestCase(db_base.DbTestCase):
network_data=None,
user_data=b'{"user": "data"}',
vendor_data=None)
+
+
+class NodeHistoryRecordTestCase(db_base.DbTestCase):
+
+ def setUp(self):
+ super(NodeHistoryRecordTestCase, self).setUp()
+ self.node = obj_utils.create_test_node(
+ self.context,
+ uuid=uuidutils.generate_uuid())
+
+ def test_record_node_history(self):
+ conductor_utils.node_history_record(self.node, event='meow')
+ entries = objects.NodeHistory.list_by_node_id(self.context,
+ self.node.id)
+ entry = entries[0]
+ self.assertEqual('meow', entry['event'])
+ self.assertEqual(CONF.host, entry['conductor'])
+ self.assertEqual('INFO', entry['severity'])
+ self.assertIsNone(entry['user'])
+
+ def test_record_node_history_with_user(self):
+ conductor_utils.node_history_record(self.node, event='meow',
+ user='peachesthecat')
+ entries = objects.NodeHistory.list_by_node_id(self.context,
+ self.node.id)
+ entry = entries[0]
+ self.assertEqual('meow', entry['event'])
+ self.assertEqual(CONF.host, entry['conductor'])
+ self.assertEqual('peachesthecat', entry['user'])
+
+ def test_record_node_history_with_error_severity(self):
+ conductor_utils.node_history_record(self.node, event='meowmeow',
+ error=True,
+ event_type='catwantfood')
+ entries = objects.NodeHistory.list_by_node_id(self.context,
+ self.node.id)
+ entry = entries[0]
+ self.assertEqual('meowmeow', entry['event'])
+ self.assertEqual(CONF.host, entry['conductor'])
+ self.assertEqual('ERROR', entry['severity'])
+ self.assertEqual('catwantfood', entry['event_type'])
+
+ @mock.patch.object(objects, 'NodeHistory', autospec=True)
+ def test_record_node_history_noop(self, mock_history):
+ CONF.set_override('node_history', False, group='conductor')
+ self.assertIsNone(conductor_utils.node_history_record(self.node))
+ mock_history.assert_not_called()
+
+    @mock.patch.object(objects, 'NodeHistory', autospec=True)
+    def test_record_node_history_disabled(self, mock_history):
+        CONF.set_override('node_history', False, group='conductor')
+        mock_create = mock.Mock()
+        mock_history.create = mock_create
+        conductor_utils.node_history_record(self.node, event='meow',
+                                            error=True)
+        self.assertEqual('meow', self.node.last_error)
+        mock_history.assert_not_called()
+        mock_create.assert_not_called()
diff --git a/ironic/tests/unit/conductor/test_verify.py b/ironic/tests/unit/conductor/test_verify.py
index 0e29f87f5..a898ba436 100644
--- a/ironic/tests/unit/conductor/test_verify.py
+++ b/ironic/tests/unit/conductor/test_verify.py
@@ -27,6 +27,7 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as conductor_utils
from ironic.conductor import verify
+from ironic import objects
from ironic.tests.unit.conductor import mgr_utils
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
@@ -138,6 +139,11 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual(states.ENROLL, node.provision_state)
self.assertIsNone(node.target_provision_state)
self.assertTrue(node.last_error)
+ history = objects.NodeHistory.list_by_node_id(self.context,
+ node.id)
+ entry = history[0]
+ self.assertEqual('verify', entry['event_type'])
+ self.assertEqual('ERROR', entry['severity'])
@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate',