Diffstat (limited to 'ironic')
-rw-r--r--  ironic/common/states.py                            |  19
-rw-r--r--  ironic/conductor/base_manager.py                   |  10
-rw-r--r--  ironic/conductor/manager.py                        | 128
-rw-r--r--  ironic/conductor/utils.py                          | 104
-rw-r--r--  ironic/conductor/verify.py                         |   6
-rw-r--r--  ironic/conf/conductor.py                           |  51
-rw-r--r--  ironic/db/api.py                                   |  18
-rw-r--r--  ironic/db/sqlalchemy/api.py                        |  71
-rw-r--r--  ironic/tests/unit/conductor/test_base_manager.py   |  12
-rw-r--r--  ironic/tests/unit/conductor/test_manager.py        | 165
-rw-r--r--  ironic/tests/unit/conductor/test_utils.py          |  71
-rw-r--r--  ironic/tests/unit/conductor/test_verify.py         |   6
12 files changed, 622 insertions(+), 39 deletions(-)
diff --git a/ironic/common/states.py b/ironic/common/states.py
index 3b0cbe18a..89b710189 100644
--- a/ironic/common/states.py
+++ b/ironic/common/states.py
@@ -297,6 +297,25 @@ ALLOCATING = 'allocating'
 # States ERROR and ACTIVE are reused.
 
+###########################
+# History Event State Types
+###########################
+
+PROVISIONING = "provisioning"
+CLEANING = "cleaning"
+DEPLOYING = "deploying"
+TAKEOVER = "takeover"
+INTROSPECTION = "introspection"
+RESCUE = "rescue"
+CONDUCTOR = "conductor"
+TRANSITION = "transition"
+STARTFAIL = "startup failure"
+UNPROVISION = "unprovision"
+ADOPTION = "adoption"
+CONSOLE = "console"
+MONITORING = "monitoring"
+VERIFY = "verify"
+
 #####################
 # State machine model
diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 53b2b48ae..43a3ff51e 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -37,6 +37,7 @@ from ironic.common import states
 from ironic.conductor import allocations
 from ironic.conductor import notification_utils as notify_utils
 from ironic.conductor import task_manager
+from ironic.conductor import utils
 from ironic.conf import CONF
 from ironic.db import api as dbapi
 from ironic.drivers import base as driver_base
@@ -536,7 +537,10 @@ class BaseConductorManager(object):
                         err_handler=err_handler,
                         target_state=target_state)
                 else:
-                    task.node.last_error = last_error
+                    utils.node_history_record(
+                        task.node, event=last_error,
+                        error=True,
+                        event_type=states.TRANSITION)
                     task.process_event('fail', target_state=target_state)
             except exception.NoFreeConductorWorker:
                 break
@@ -580,7 +584,9 @@ class BaseConductorManager(object):
                     LOG.error(msg)
                     # If starting console failed, set node console_enabled
                     # back to False and set node's last error.
-                    task.node.last_error = msg
+                    utils.node_history_record(task.node, event=msg,
+                                              error=True,
+                                              event_type=states.STARTFAIL)
                     task.node.console_enabled = False
                     task.node.save()
                     notify_utils.emit_console_notification(
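The constants above give history entries a stable vocabulary, and the base_manager.py hunks show the calling convention every converted site follows. As a minimal sketch (not part of the diff, and assuming a task acquired through task_manager), a handler that records a cleaning failure would look like the following; record_clean_failure() is a hypothetical name, while node_history_record() and states.CLEANING are real additions from this change:

    from ironic.common import states
    from ironic.conductor import utils


    def record_clean_failure(task, msg):
        # Writes a node_history row with severity ERROR, and mirrors the
        # message into node.last_error because error=True.
        utils.node_history_record(task.node, event=msg,
                                  event_type=states.CLEANING,
                                  error=True,
                                  user=task.context.user_id)
        task.node.save()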
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 1abb75fe1..7fe4c5959 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager):
         def handle_failure(e, errmsg, log_func=LOG.error):
             utils.remove_node_rescue_password(node, save=False)
-            node.last_error = errmsg % e
+            error = errmsg % e
+            utils.node_history_record(task.node, event=error,
+                                      event_type=states.RESCUE,
+                                      error=True,
+                                      user=task.context.user_id)
             task.process_event('fail')
             log_func('Error while performing rescue operation for node '
                      '%(node)s with instance %(instance)s: %(err)s',
@@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager):
         node = task.node
 
         def handle_failure(e, errmsg, log_func=LOG.error):
-            node.last_error = errmsg % e
+            error = errmsg % e
+            utils.node_history_record(task.node, event=error,
+                                      event_type=states.RESCUE,
+                                      error=True,
+                                      user=task.context.user_id)
             task.process_event('fail')
             log_func('Error while performing unrescue operation for node '
                      '%(node)s with instance %(instance)s: %(err)s',
@@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager):
                 error_msg = _('Failed to clean up rescue after aborting '
                               'the operation')
                 node.refresh()
-                node.last_error = error_msg
+                utils.node_history_record(node, event=error_msg,
+                                          event_type=states.RESCUE, error=True,
+                                          user=task.context.user_id)
                 node.maintenance = True
                 node.maintenance_reason = error_msg
                 node.fault = faults.RESCUE_ABORT_FAILURE
@@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager):
                 return
 
         info_message = _('Rescue operation aborted for node %s.') % node.uuid
-        last_error = _('By request, the rescue operation was aborted.')
+        # NOTE(TheJulia): This "error" is not an actual error; the operation
+        # has been aborted and the node returned to normal operation.
+        error = _('By request, the rescue operation was aborted.')
+        utils.node_history_record(task.node, event=error,
+                                  event_type=states.RESCUE,
+                                  error=False,
+                                  user=task.context.user_id)
         node.refresh()
         utils.remove_agent_url(node)
-        node.last_error = last_error
         node.save()
         LOG.info(info_message)
 
@@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager):
             with excutils.save_and_reraise_exception():
                 LOG.exception('Error in tear_down of node %(node)s: %(err)s',
                               {'node': node.uuid, 'err': e})
-                node.last_error = _("Failed to tear down. Error: %s") % e
+                error = _("Failed to tear down. Error: %s") % e
+                utils.node_history_record(task.node, event=error,
+                                          event_type=states.UNPROVISION,
+                                          error=True,
+                                          user=task.context.user_id)
                 task.process_event('fail')
         else:
             # NOTE(tenbrae): When tear_down finishes, the deletion is done,
@@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager):
             with excutils.save_and_reraise_exception():
                 LOG.exception('Error in aborting the inspection of '
                               'node %(node)s', {'node': node.uuid})
-                node.last_error = _('Failed to abort inspection. '
-                                    'Error: %s') % e
+                error = _('Failed to abort inspection. '
+                          'Error: %s') % e
+                utils.node_history_record(task.node, event=error,
+                                          event_type=states.INTROSPECTION,
+                                          error=True,
+                                          user=task.context.user_id)
                 node.save()
-        node.last_error = _('Inspection was aborted by request.')
+        error = _('Inspection was aborted by request.')
+        utils.node_history_record(task.node, event=error,
+                                  event_type=states.INTROSPECTION,
+                                  error=True,
+                                  user=task.context.user_id)
         utils.wipe_token_and_url(task)
         task.process_event('abort')
         LOG.info('Successfully aborted inspection of node %(node)s',
@@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager):
             if not task.node.maintenance and task.node.target_power_state:
                 old_state = task.node.target_power_state
                 task.node.target_power_state = None
-                task.node.last_error = _('Pending power operation was '
-                                         'aborted due to conductor take '
-                                         'over')
+                error = _('Pending power operation was '
+                          'aborted due to conductor take '
+                          'over')
+                utils.node_history_record(task.node, event=error,
+                                          event_type=states.TAKEOVER,
+                                          error=True,
+                                          user=task.context.user_id)
+                task.node.save()
                 LOG.warning('Aborted pending power operation %(op)s '
                             'on node %(node)s due to conductor take over',
@@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager):
                 LOG.error(msg)
                 # Wipe power state from being preserved as it is likely invalid.
                 node.power_state = states.NOSTATE
-                node.last_error = msg
+                utils.node_history_record(task.node, event=msg,
+                                          event_type=states.ADOPTION,
+                                          error=True,
+                                          user=task.context.user_id)
                 task.process_event('fail')
 
     @METRICS.timer('ConductorManager._do_takeover')
@@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager):
             LOG.error(msg)
             # If taking over console failed, set node's console_enabled
             # back to False and set node's last error.
-            task.node.last_error = msg
+            utils.node_history_record(task.node, event=msg,
+                                      event_type=states.TAKEOVER,
+                                      error=True,
+                                      user=task.context.user_id)
             task.node.console_enabled = False
             console_error = True
         else:
@@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager):
                            'Reason: %(error)s') % {'op': op,
                                                    'node': node.uuid,
                                                    'error': e})
-                node.last_error = msg
+                utils.node_history_record(task.node, event=msg,
+                                          event_type=states.CONSOLE,
+                                          error=True,
+                                          user=task.context.user_id)
                 LOG.error(msg)
                 node.save()
                 notify_utils.emit_console_notification(
@@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager):
         task.node.save()
         return task.node
 
+    @METRICS.timer('ConductorManager.manage_node_history')
+    @periodics.periodic(
+        spacing=CONF.conductor.node_history_cleanup_interval,
+        enabled=(
+            CONF.conductor.node_history_cleanup_batch_count > 0
+            and CONF.conductor.node_history_max_entries != 0
+        )
+    )
+    def manage_node_history(self, context):
+        try:
+            self._manage_node_history(context)
+        except Exception as e:
+            LOG.error('Encountered error while cleaning node '
+                      'history records: %s', e)
+
+    def _manage_node_history(self, context):
+        """Periodic task to keep the node history tidy."""
+        max_batch = CONF.conductor.node_history_cleanup_batch_count
+        # NOTE(TheJulia): Ask the db for the list. Presently this just
+        # gets each node id and its associated history record ids. If we
+        # later add date-based constraints or handling, this method will
+        # still need to identify the explicit ID values to delete, and
+        # the deletion process will need to erase them in logical chunks.
+        entries_to_clean = self.dbapi.query_node_history_records_for_purge(
+            conductor_id=self.conductor.id)
+        count = 0
+        for node_id in entries_to_clean:
+            if count < max_batch:
+                # If we have not hit our total limit, proceed.
+                if entries_to_clean[node_id]:
+                    # If we have work to do on this node, proceed.
+                    self.dbapi.bulk_delete_node_history_records(
+                        entries_to_clean[node_id])
+            else:
+                LOG.warning('While cleaning up node history records, '
+                            'we reached the maximum number of records '
+                            'permitted in a single batch. If this warning '
+                            'is repeated, consider tuning the node history '
+                            'configuration options to be more aggressive: '
+                            'increase the cleanup frequency and lower the '
+                            'number of entries to be deleted per batch so '
+                            'performance is not negatively impacted.')
+                break
+            count = count + len(entries_to_clean[node_id])
+            # Yield to other threads, since we also don't want to be
+            # looping tightly deleting rows, as that will negatively
+            # impact DB access if done in excess.
+            eventlet.sleep(0)
+
 
 @METRICS.timer('get_vendor_passthru_metadata')
 def get_vendor_passthru_metadata(route_dict):
@@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,
 
     old_power_state = node.power_state
     node.power_state = actual_power_state
-    node.last_error = msg
+    utils.node_history_record(task.node, event=msg,
+                              event_type=states.MONITORING,
+                              error=True)
     node.maintenance = True
     node.maintenance_reason = msg
     node.fault = faults.POWER_FAILURE
@@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task):
     node = task.node
 
     def handle_failure(e, log_func=LOG.error):
-        node.last_error = e
+        utils.node_history_record(task.node, event=e,
+                                  event_type=states.INTROSPECTION,
+                                  error=True, user=task.context.user_id)
         task.process_event('fail')
         log_func("Failed to inspect node %(node)s: %(err)s",
                  {'node': node.uuid, 'err': e})
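The batching behavior of _manage_node_history() above is worth spelling out: the limit is checked before each node and whole per-node lists are deleted, so a single pass can overshoot node_history_cleanup_batch_count by the size of the last list it processes. A standalone sketch of just that loop, in plain Python with a function standing in for the DB layer:

    def prune(entries_to_clean, max_batch, delete):
        count = 0
        for node_id, record_ids in entries_to_clean.items():
            if count >= max_batch:
                # Remaining nodes wait for the next periodic pass.
                break
            if record_ids:
                delete(record_ids)
            count += len(record_ids)


    deleted = []
    prune({1: [10, 11, 12], 2: [20, 21]}, max_batch=2, delete=deleted.extend)
    # Node 1 alone satisfies the batch target, so node 2 is left alone.
    assert deleted == [10, 11, 12]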
Error: %(error)s") % { 'target_state': target_state, 'new_state': new_state, 'error': e} + node_history_record(node, event=error, error=True) node.save() notify_utils.emit_power_set_notification( task, fields.NotificationLevel.ERROR, @@ -399,7 +402,9 @@ def provisioning_error_handler(e, node, provision_state, # because it isn't updated on a failed deploy node.provision_state = provision_state node.target_provision_state = target_provision_state - node.last_error = (_("No free conductor workers available")) + error = (_("No free conductor workers available")) + node_history_record(node, event=error, event_type=states.PROVISIONING, + error=True) node.save() LOG.warning("No free conductor workers available to perform " "an action on node %(node)s, setting node's " @@ -483,7 +488,8 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE - node.last_error = errmsg + node_history_record(node, event=errmsg, event_type=states.CLEANING, + error=True) # NOTE(dtantsur): avoid overwriting existing maintenance_reason if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg @@ -570,7 +576,8 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, errmsg = errmsg or logmsg node = task.node LOG.error(logmsg, exc_info=traceback) - node.last_error = errmsg + node_history_record(node, event=errmsg, event_type=states.DEPLOYING, + error=True) node.save() cleanup_err = None @@ -600,7 +607,9 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, wipe_deploy_internal_info(task) if cleanup_err: - node.last_error = cleanup_err + node_history_record(node, event=cleanup_err, + event_type=states.DEPLOYING, + error=True) node.save() # NOTE(tenbrae): there is no need to clear conductor_affinity @@ -636,7 +645,8 @@ def abort_on_conductor_take_over(task): else: # For aborted deployment (and potentially other operations), just set # the last_error accordingly. 
- task.node.last_error = msg + node_history_record(task.node, event=msg, event_type=states.TAKEOVER, + error=True) task.node.save() LOG.warning('Aborted the current operation on node %s due to ' @@ -657,17 +667,22 @@ def rescuing_error_handler(task, msg, set_fail_state=True): node_power_action(task, states.POWER_OFF) task.driver.rescue.clean_up(task) remove_agent_url(node) - node.last_error = msg + node_history_record(task.node, event=msg, event_type=states.RESCUE, + error=True) except exception.IronicException as e: - node.last_error = (_('Rescue operation was unsuccessful, clean up ' - 'failed for node: %(error)s') % {'error': e}) + error = (_('Rescue operation was unsuccessful, clean up ' + 'failed for node: %(error)s') % {'error': e}) + node_history_record(task.node, event=error, event_type=states.RESCUE, + error=True) LOG.error(('Rescue operation was unsuccessful, clean up failed for ' 'node %(node)s: %(error)s'), {'node': node.uuid, 'error': e}) except Exception as e: - node.last_error = (_('Rescue failed, but an unhandled exception was ' - 'encountered while aborting: %(error)s') % - {'error': e}) + error = (_('Rescue failed, but an unhandled exception was ' + 'encountered while aborting: %(error)s') % + {'error': e}) + node_history_record(task.node, event=error, event_type=states.RESCUE, + error=True) LOG.exception('Rescue failed for node %(node)s, an exception was ' 'encountered while aborting.', {'node': node.uuid}) finally: @@ -708,7 +723,9 @@ def _spawn_error_handler(e, node, operation): :param operation: the operation being performed on the node. """ if isinstance(e, exception.NoFreeConductorWorker): - node.last_error = (_("No free conductor workers available")) + error = (_("No free conductor workers available")) + node_history_record(node, event=error, event_type=states.CONDUCTOR, + error=True) node.save() LOG.warning("No free conductor workers available to perform " "%(operation)s on node %(node)s", @@ -749,7 +766,9 @@ def power_state_error_handler(e, node, power_state): if isinstance(e, exception.NoFreeConductorWorker): node.power_state = power_state node.target_power_state = states.NOSTATE - node.last_error = (_("No free conductor workers available")) + error = (_("No free conductor workers available")) + node_history_record(node, event=error, event_type=states.CONDUCTOR, + error=True) node.save() LOG.warning("No free conductor workers available to perform " "an action on node %(node)s, setting node's " @@ -1573,3 +1592,58 @@ def node_change_secure_boot(task, secure_boot_target): {'state': secure_boot_target, 'node': task.node.uuid}) task.node.secure_boot = secure_boot_target task.node.save() + + +def node_history_record(node, conductor=None, event=None, + event_type=None, user=None, + error=False): + """Records a node history record + + Adds an entry to the node history table with the appropriate fields + populated to ensure consistent experience by also updating the + node ``last_error`` field. Please note the event is only recorded + if the ``[conductor]node_history_max_size`` parameter is set to a + value greater than ``0``. + + :param node: A node object from a task object. Required. + :param conductor: The hostname of the conductor. If not specified + this value is populated with the conductor FQDN. + :param event: The text to record to the node history table. + If no value is supplied, the method silently returns + to the caller. 
+ :param event_type: The type activity where the event was encountered, + either "provisioning", "monitoring", "cleaning", + or whatever text the a driver author wishes to supply + based upon the activity. The purpose is to help guide + an API consumer/operator to have a better contextual + understanding of what was going on *when* the "event" + occured. + :param user: The user_id value which triggered the request, + if available. + :param error: Boolean value, default false, to signify if the event + is an error which should be recorded in the node + ``last_error`` field. + :returns: None. No value is returned by this method. + """ + if not event: + # No error has occured, apparently. + return + if error: + # When the task exits out or is saved, the event + # or error is saved, but that is outside of ceating an + # entry in the history table. + node.last_error = event + if not conductor: + conductor = CONF.host + if CONF.conductor.node_history: + # If the maximum number of entries is not set to zero, + # then we should record the entry. + # NOTE(TheJulia): DB API automatically adds in a uuid. + # TODO(TheJulia): At some point, we should allow custom severity. + node_history.NodeHistory( + node_id=node.id, + conductor=CONF.host, + user=user, + severity=error and "ERROR" or "INFO", + event=event, + event_type=event_type or "UNKNOWN").create() diff --git a/ironic/conductor/verify.py b/ironic/conductor/verify.py index 3c40c1e50..b611e7e4e 100644 --- a/ironic/conductor/verify.py +++ b/ironic/conductor/verify.py @@ -16,6 +16,7 @@ from oslo_log import log from ironic.common import exception from ironic.common.i18n import _ +from ironic.common import states from ironic.conductor import notification_utils as notify_utils from ironic.conductor import task_manager from ironic.conductor import utils @@ -67,5 +68,8 @@ def do_node_verify(task): else: task.process_event('done') else: - node.last_error = error + utils.node_history_record(task.node, event=error, + event_type=states.VERIFY, + error=True, + user=task.context.user_id) task.process_event('fail') diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index cb52c45d4..61d6b3247 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -283,6 +283,57 @@ opts = [ 'will not run during cleaning. If unset for an ' 'inband clean step, will use the priority set in the ' 'ramdisk.')), + cfg.BoolOpt('node_history', + default=True, + mutable=True, + help=_('Boolean value, default True, if node event history ' + 'is to be recorded. Errors and other noteworthy ' + 'events in relation to a node are journaled to a ' + 'database table which incurs some additional load. ' + 'A periodic task does periodically remove entries ' + 'from the database. Please note, if this is disabled, ' + 'the conductor will continue to purge entries as ' + 'long as [conductor]node_history_cleanup_batch_count ' + 'is not 0.')), + cfg.IntOpt('node_history_max_entries', + default=300, + min=0, + mutable=True, + help=_('Maximum number of history entries which will be stored ' + 'in the database per node. Default is 300. This setting ' + 'excludes the minimum number of days retained using the ' + '[conductor]node_history_minimum_days setting.')), + cfg.IntOpt('node_history_cleanup_interval', + min=0, + default=86400, + mutable=False, + help=_('Interval in seconds at which node history entries ' + 'can be cleaned up in the database. Setting to 0 ' + 'disables the periodic task. 
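Since node_history_record() is the single funnel for every call site converted in this patch, its branching is worth restating compactly. The following runnable sketch uses plain dicts as stand-ins for the Node and NodeHistory objects; the stand-ins and the `enabled` flag, which mirrors [conductor]node_history, are assumptions for illustration only:

    def history_record(node, event=None, error=False, conductor=None,
                       user=None, event_type=None, enabled=True):
        if not event:
            return None  # silent no-op, matching the real helper
        if error:
            node['last_error'] = event
        if enabled:  # stands in for CONF.conductor.node_history
            return {'node_id': node['id'],
                    'conductor': conductor or 'conductor-1.example.com',
                    'user': user,
                    'severity': 'ERROR' if error else 'INFO',
                    'event': event,
                    'event_type': event_type or 'UNKNOWN'}


    node = {'id': 42, 'last_error': None}
    row = history_record(node, event='clean step failed', error=True,
                         event_type='cleaning')
    assert node['last_error'] == 'clean step failed'
    assert row['severity'] == 'ERROR'
    assert history_record(node) is None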
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index cb52c45d4..61d6b3247 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -283,6 +283,57 @@ opts = [
                       'will not run during cleaning. If unset for an '
                       'inband clean step, will use the priority set in the '
                       'ramdisk.')),
+    cfg.BoolOpt('node_history',
+                default=True,
+                mutable=True,
+                help=_('Boolean value, default True, controlling whether '
+                       'node event history is recorded. Errors and other '
+                       'noteworthy events in relation to a node are '
+                       'journaled to a database table, which incurs some '
+                       'additional load. A periodic task removes old '
+                       'entries from the database. Please note, if this '
+                       'is disabled, the conductor will continue to purge '
+                       'existing entries as long as '
+                       '[conductor]node_history_cleanup_batch_count '
+                       'is not 0.')),
+    cfg.IntOpt('node_history_max_entries',
+               default=300,
+               min=0,
+               mutable=True,
+               help=_('Maximum number of history entries which will be '
+                      'stored in the database per node. Default is 300. '
+                      'Entries retained because of the '
+                      '[conductor]node_history_minimum_days setting are '
+                      'not counted against this limit.')),
+    cfg.IntOpt('node_history_cleanup_interval',
+               min=0,
+               default=86400,
+               mutable=False,
+               help=_('Interval in seconds at which node history entries '
+                      'can be cleaned up in the database. Setting to 0 '
+                      'disables the periodic task. Defaults to once a day, '
+                      'or 86400 seconds.')),
+    cfg.IntOpt('node_history_cleanup_batch_count',
+               min=0,
+               default=1000,
+               mutable=False,
+               help=_('The target number of node history records to purge '
+                      'from the database when performing clean-up. '
+                      'Deletes are performed by node, and all excess '
+                      'records for a given node are deleted together. '
+                      'Defaults to 1000. Operators who find node history '
+                      'building up may wish to '
+                      'lower this threshold and decrease the time between '
+                      'cleanup operations using the '
+                      '``node_history_cleanup_interval`` setting.')),
+    cfg.IntOpt('node_history_minimum_days',
+               min=0,
+               default=0,
+               mutable=True,
+               help=_('The minimum number of days of history entries to '
+                      'explicitly keep on hand in the database for each '
+                      'node. This is exclusive from the [conductor]'
+                      'node_history_max_entries setting, as users of '
+                      'this setting are anticipated to need to retain '
+                      'history by policy.')),
 ]
diff --git a/ironic/db/api.py b/ironic/db/api.py
index 5b71d32bc..77fa054e2 100644
--- a/ironic/db/api.py
+++ b/ironic/db/api.py
@@ -1380,3 +1380,21 @@ class Connection(object, metaclass=abc.ABCMeta):
             (asc, desc)
         :returns: A list of histories.
         """
+
+    @abc.abstractmethod
+    def query_node_history_records_for_purge(self, conductor_id):
+        """Utility method to identify node history records to purge.
+
+        :param conductor_id: ID value for the conductor to perform this
+            query on behalf of.
+        :returns: A dictionary keyed by node database ID, where each
+            value is a list of history record IDs eligible for deletion
+            for that node.
+        """
+
+    @abc.abstractmethod
+    def bulk_delete_node_history_records(self, entries):
+        """Utility method to bulk delete node history entries.
+
+        :param entries: A list of node history entry IDs to be
+            deleted.
+        """
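The contract promised by query_node_history_records_for_purge() — a dict of node ID to a list of history record IDs, oldest first, trimmed to the per-node limit — reduces to a few lines of arithmetic, as the SQLAlchemy implementation below shows. A runnable sketch with invented in-memory rows in place of the query:

    def records_to_purge(rows, max_num):
        # rows: (node_id, history_id) tuples ordered oldest-first, as the
        # real query orders by created_at ascending.
        by_node = {}
        for node_id, hist_id in rows:
            by_node.setdefault(node_id, []).append(hist_id)
        # Keep only the oldest (len - max_num) records per node.
        return {node_id: ids[:max(0, len(ids) - max_num)]
                for node_id, ids in by_node.items()}


    rows = [(1, 100), (1, 101), (1, 102), (2, 200)]
    # Node 1 has one record over the limit of 2; node 2 is under it.
    assert records_to_purge(rows, max_num=2) == {1: [100], 2: []}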
diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py
index 716c422dd..8f654483f 100644
--- a/ironic/db/sqlalchemy/api.py
+++ b/ironic/db/sqlalchemy/api.py
@@ -2329,3 +2329,74 @@ class Connection(api.Connection):
             query = query.filter_by(node_id=node_id)
         return _paginate_query(models.NodeHistory, limit, marker, sort_key,
                                sort_dir, query)
+
+    def query_node_history_records_for_purge(self, conductor_id):
+        min_days = CONF.conductor.node_history_minimum_days
+        max_num = CONF.conductor.node_history_max_entries
+
+        with _session_for_read() as session:
+            # First, figure out our nodes.
+            nodes = session.query(
+                models.Node.id,
+            ).filter(
+                models.Node.conductor_affinity == conductor_id
+            )
+
+            # Build our query to get the node_id and record id.
+            query = session.query(
+                models.NodeHistory.node_id,
+                models.NodeHistory.id,
+            )
+
+            # Filter by the nodes
+            query = query.filter(
+                models.NodeHistory.node_id.in_(nodes)
+            ).order_by(
+                # Order ascending so the oldest entries come first.
+                models.NodeHistory.created_at.asc()
+            )
+
+            # Filter by minimum days
+            if min_days > 0:
+                before = datetime.datetime.now() - datetime.timedelta(
+                    days=min_days)
+                query = query.filter(
+                    models.NodeHistory.created_at < before
+                )
+
+            # Build our result set
+            result_set = {}
+            for entry in query.all():
+                if entry[0] not in result_set:
+                    result_set[entry[0]] = []
+                result_set[entry[0]].append(entry[1])
+
+            final_set = {}
+            # Generate our final set of entries which should be removed
+            # by accounting for the number of permitted entries.
+            for entry in result_set:
+                final_set[entry] = []
+                set_len = len(result_set[entry])
+                # Any count <= the maximum number is okay
+                if set_len > max_num:
+                    # Figure out how many entries need to be removed.
+                    num_to_remove = set_len - max_num
+                    for i in range(0, num_to_remove):
+                        final_set[entry].append(result_set[entry][i])
+                    # Only the oldest entries at the head of the list are
+                    # collected; the entries at the end of the list are
+                    # the most recent ones and are retained, since the
+                    # query ordered ascending by creation date.
+            return final_set
+
+    def bulk_delete_node_history_records(self, entries):
+        with _session_for_write() as session:
+            # Uses the input entry list, selects entries matching those
+            # ids, then deletes them without synchronizing the session so
+            # sqlalchemy doesn't do extra unnecessary work.
+            session.query(
+                models.NodeHistory
+            ).filter(
+                models.NodeHistory.id.in_(entries)
+            ).delete(synchronize_session=False)
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index 886ab07ed..7ae2c74ee 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -542,6 +542,12 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                        fields.NotificationStatus.START),
             mock.call(mock.ANY, 'console_restore',
                       fields.NotificationStatus.ERROR)])
+        history = objects.NodeHistory.list_by_node_id(self.context,
+                                                      test_node.id)
+        entry = history[0]
+        self.assertEqual('startup failure', entry['event_type'])
+        self.assertEqual('ERROR', entry['severity'])
+        self.assertIsNotNone(entry['event'])
 
     @mock.patch.object(base_manager, 'LOG', autospec=True)
     def test__start_consoles_node_locked(self, log_mock, mock_notify,
@@ -599,3 +605,9 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
         node.refresh()
         self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+        history = objects.NodeHistory.list_by_node_id(self.context,
+                                                      node.id)
+        entry = history[0]
+        self.assertEqual('transition', entry['event_type'])
+        self.assertEqual('ERROR', entry['severity'])
+        self.assertEqual('unknown err', entry['event'])
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 50d1d151d..e93b3cb78 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -3361,7 +3361,7 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
         with task_manager.acquire(self.context, node.uuid) as task:
             self.service._do_node_rescue_abort(task)
             clean_up_mock.assert_called_once_with(task.driver.rescue, task)
-            self.assertIsNotNone(task.node.last_error)
+            self.assertIsNone(task.node.last_error)
             self.assertFalse(task.node.maintenance)
             self.assertNotIn('agent_url', task.node.driver_internal_info)
 
@@ -8153,3 +8153,166 @@ class DoNodeInspectAbortTestCase(mgr_utils.CommonMixIn,
         self.assertIn('Inspection was aborted', node.last_error)
         self.assertNotIn('agent_url', node.driver_internal_info)
         self.assertNotIn('agent_secret_token', node.driver_internal_info)
+
+
+class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin,
+                                       db_base.DbTestCase):
+
+    def setUp(self):
+        super(NodeHistoryRecordCleanupTestCase, self).setUp()
+        self.node1 = obj_utils.get_test_node(self.context,
+                                             driver='fake-hardware',
+                                             id=10,
+                                             uuid=uuidutils.generate_uuid(),
+                                             conductor_affinity=1)
+        self.node2 = obj_utils.get_test_node(self.context,
+                                             driver='fake-hardware',
+                                             id=11,
+                                             uuid=uuidutils.generate_uuid(),
+                                             conductor_affinity=1)
+        self.node3 = obj_utils.get_test_node(self.context,
+                                             driver='fake-hardware',
+                                             id=12,
+                                             uuid=uuidutils.generate_uuid(),
+                                             conductor_affinity=1)
+        self.nodes = [self.node1, self.node2, self.node3]
+        # Create the nodes, as the tasks need to operate across tables.
+        self.node1.create()
+        self.node2.create()
+        self.node3.create()
+        CONF.set_override('node_history_max_entries', 2, group='conductor')
+        CONF.set_override('node_history_cleanup_batch_count', 2,
+                          group='conductor')
+        self._start_service()
+
+    def test_history_is_pruned_to_config(self):
+        for node in self.nodes:
+            for event in ['one', 'two', 'three']:
+                conductor_utils.node_history_record(node, event=event)
+        conductor_utils.node_history_record(self.node1, event="final")
+        self.service._manage_node_history(self.context)
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(8, len(events))
+        events = objects.NodeHistory.list_by_node_id(self.context, 10)
+        self.assertEqual('three', events[0].event)
+        self.assertEqual('final', events[1].event)
+        self.assertEqual(2, len(events))
+        events = objects.NodeHistory.list_by_node_id(self.context, 11)
+        self.assertEqual(3, len(events))
+        self.assertEqual('one', events[0].event)
+        self.assertEqual('two', events[1].event)
+        self.assertEqual('three', events[2].event)
+        events = objects.NodeHistory.list_by_node_id(self.context, 12)
+        self.assertEqual(3, len(events))
+        self.assertEqual('one', events[0].event)
+        self.assertEqual('two', events[1].event)
+        self.assertEqual('three', events[2].event)
+
+    def test_history_is_pruned_to_config_two_pass(self):
+        for node in self.nodes:
+            for event in ['one', 'two', 'three']:
+                conductor_utils.node_history_record(node, event=event)
+        conductor_utils.node_history_record(self.node1, event="final")
+        with mock.patch.object(manager.LOG, 'warning',
+                               autospec=True) as mock_log:
+            self.service._manage_node_history(self.context)
+            # Assert we logged a warning about hitting the batch limit.
+            self.assertEqual(1, mock_log.call_count)
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(8, len(events))
+        self.service._manage_node_history(self.context)
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(6, len(events))
+        events = objects.NodeHistory.list_by_node_id(self.context, 10)
+        self.assertEqual(2, len(events))
+        self.assertEqual('three', events[0].event)
+        self.assertEqual('final', events[1].event)
+        events = objects.NodeHistory.list_by_node_id(self.context, 11)
+        self.assertEqual(2, len(events))
+        self.assertEqual('two', events[0].event)
+        self.assertEqual('three', events[1].event)
+        events = objects.NodeHistory.list_by_node_id(self.context, 12)
+        self.assertEqual(2, len(events))
+        self.assertEqual('two', events[0].event)
+        self.assertEqual('three', events[1].event)
+
+    def test_history_is_pruned_from_all_nodes_one_pass(self):
+        CONF.set_override('node_history_cleanup_batch_count', 15,
+                          group='conductor')
+        for node in self.nodes:
+            for event in ['one', 'two', 'three']:
+                conductor_utils.node_history_record(node, event=event)
+        self.service._manage_node_history(self.context)
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(6, len(events))
+
+    def test_history_pruning_no_work(self):
+        conductor_utils.node_history_record(self.node1, event='meow')
+        with mock.patch.object(self.dbapi,
+                               'bulk_delete_node_history_records',
+                               autospec=True) as mock_delete:
+            self.service._manage_node_history(self.context)
+            mock_delete.assert_not_called()
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(1, len(events))
+
+    def test_history_pruning_not_other_conductor(self):
+        node = obj_utils.get_test_node(self.context,
+                                       driver='fake-hardware',
+                                       id=33,
+                                       uuid=uuidutils.generate_uuid(),
+                                       conductor_affinity=2)
+        # create node so it can be queried
+        node.create()
+        for i in range(0, 3):
+            conductor_utils.node_history_record(node, event='meow%s' % i)
+        with mock.patch.object(self.dbapi,
+                               'bulk_delete_node_history_records',
+                               autospec=True) as mock_delete:
+            self.service._manage_node_history(self.context)
+            mock_delete.assert_not_called()
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(3, len(events))
+        self.assertEqual('meow0', events[0].event)
+
+    def test_history_is_pruned_to_config_with_days(self):
+        CONF.set_override('node_history_cleanup_batch_count', 15,
+                          group='conductor')
+        CONF.set_override('node_history_minimum_days', 1,
+                          group='conductor')
+        CONF.set_override('node_history_max_entries', 1, group='conductor')
+
+        old_date = datetime.datetime.now() - datetime.timedelta(days=7)
+        # Creates 18 entries
+        for node in self.nodes:
+            for event in ['oldone', 'oldtwo', 'oldthree']:
+                objects.NodeHistory(created_at=old_date,
+                                    event=event,
+                                    node_id=node.id).create()
+            for event in ['one', 'two', 'three']:
+                conductor_utils.node_history_record(node, event=event)
+
+        # 9 retained due to days, 3 to config
+        self.service._manage_node_history(self.context)
+        events = objects.NodeHistory.list(self.context)
+        self.assertEqual(12, len(events))
+        events = objects.NodeHistory.list_by_node_id(self.context, 10)
+        self.assertEqual(4, len(events))
+        self.assertEqual('oldthree', events[0].event)
+        self.assertEqual('one', events[1].event)
+        self.assertEqual('two', events[2].event)
+        self.assertEqual('three', events[3].event)
+        events = objects.NodeHistory.list_by_node_id(self.context, 11)
+        self.assertEqual(4, len(events))
+        self.assertEqual('oldthree', events[0].event)
+        self.assertEqual('one', events[1].event)
+        self.assertEqual('two', events[2].event)
+        self.assertEqual('three', events[3].event)
+        events = objects.NodeHistory.list_by_node_id(self.context, 12)
+        self.assertEqual(4, len(events))
+        self.assertEqual('oldthree', events[0].event)
+        self.assertEqual('one', events[1].event)
+        self.assertEqual('two', events[2].event)
+        self.assertEqual('three', events[3].event)
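The *_with_days test above leans on the interplay of two options: node_history_minimum_days shields recent entries from becoming purge candidates at all, and node_history_max_entries then caps how many candidates survive. A runnable restatement of that arithmetic with invented timestamps:

    import datetime

    now = datetime.datetime.now()
    old = now - datetime.timedelta(days=7)
    cutoff = now - datetime.timedelta(days=1)  # node_history_minimum_days=1

    entries = [('oldone', old), ('oldtwo', old), ('oldthree', old),
               ('one', now), ('two', now), ('three', now)]
    # Only entries older than the cutoff are purge candidates.
    candidates = [name for name, created in entries if created < cutoff]
    # node_history_max_entries=1 removes all but the newest candidate.
    to_remove = candidates[:max(0, len(candidates) - 1)]
    kept = [name for name, _ in entries if name not in to_remove]
    # Four entries survive per node, matching the test's expectations.
    assert kept == ['oldthree', 'one', 'two', 'three']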
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index f7e16c618..1e563ec71 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -959,7 +959,7 @@ class NodeSoftPowerActionTestCase(db_base.DbTestCase):
             self.assertIsNone(node['last_error'])
 
 
-class DeployingErrorHandlerTestCase(tests_base.TestCase):
+class DeployingErrorHandlerTestCase(db_base.DbTestCase):
     def setUp(self):
         super(DeployingErrorHandlerTestCase, self).setUp()
         self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -972,6 +972,8 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
         self.node.last_error = None
         self.node.deploy_step = None
         self.node.driver_internal_info = {}
+        self.node.id = obj_utils.create_test_node(self.context,
+                                                  driver='fake-hardware').id
         self.logmsg = "log message"
         self.errmsg = "err message"
 
@@ -1023,6 +1025,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
         self.assertEqual({}, self.node.deploy_step)
         self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
         self.task.process_event.assert_called_once_with('fail')
+        self.assertIsNotNone(self.node.last_error)
 
     def test_deploying_error_handler_cleanup_ironic_exception(self):
         self._test_deploying_error_handler_cleanup(
@@ -1058,7 +1061,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
         self.task.process_event.assert_called_once_with('fail')
 
 
-class ErrorHandlersTestCase(tests_base.TestCase):
+class ErrorHandlersTestCase(db_base.DbTestCase):
     def setUp(self):
         super(ErrorHandlersTestCase, self).setUp()
         self.task = mock.Mock(spec=task_manager.TaskManager)
@@ -1070,9 +1073,13 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         # strict typing of the node power state fields and would fail if passed
         # a Mock object in constructors. A task context is also required for
         # notifications.
+        fake_node = obj_utils.create_test_node(self.context,
+                                               driver='fake-hardware')
+
         self.node.configure_mock(power_state=states.POWER_OFF,
                                  target_power_state=states.POWER_ON,
-                                 maintenance=False, maintenance_reason=None)
+                                 maintenance=False, maintenance_reason=None,
+                                 id=fake_node.id)
         self.task.context = self.context
 
     @mock.patch.object(conductor_utils, 'LOG', autospec=True)
@@ -1401,6 +1408,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
                                       '%(node)s, an exception was '
                                       'encountered while aborting.',
                                       {'node': self.node.uuid})
+        self.assertIsNotNone(self.node.last_error)
         self.node.save.assert_called_once_with()
 
     @mock.patch.object(conductor_utils.LOG, 'error', autospec=True)
@@ -2562,3 +2570,60 @@ class GetConfigDriveImageTestCase(db_base.DbTestCase):
             network_data=None,
             user_data=b'{"user": "data"}',
             vendor_data=None)
+
+
+class NodeHistoryRecordTestCase(db_base.DbTestCase):
+
+    def setUp(self):
+        super(NodeHistoryRecordTestCase, self).setUp()
+        self.node = obj_utils.create_test_node(
+            self.context,
+            uuid=uuidutils.generate_uuid())
+
+    def test_record_node_history(self):
+        conductor_utils.node_history_record(self.node, event='meow')
+        entries = objects.NodeHistory.list_by_node_id(self.context,
+                                                      self.node.id)
+        entry = entries[0]
+        self.assertEqual('meow', entry['event'])
+        self.assertEqual(CONF.host, entry['conductor'])
+        self.assertEqual('INFO', entry['severity'])
+        self.assertIsNone(entry['user'])
+
+    def test_record_node_history_with_user(self):
+        conductor_utils.node_history_record(self.node, event='meow',
+                                            user='peachesthecat')
+        entries = objects.NodeHistory.list_by_node_id(self.context,
+                                                      self.node.id)
+        entry = entries[0]
+        self.assertEqual('meow', entry['event'])
+        self.assertEqual(CONF.host, entry['conductor'])
+        self.assertEqual('peachesthecat', entry['user'])
+
+    def test_record_node_history_with_error_severity(self):
+        conductor_utils.node_history_record(self.node, event='meowmeow',
+                                            error=True,
+                                            event_type='catwantfood')
+        entries = objects.NodeHistory.list_by_node_id(self.context,
+                                                      self.node.id)
+        entry = entries[0]
+        self.assertEqual('meowmeow', entry['event'])
+        self.assertEqual(CONF.host, entry['conductor'])
+        self.assertEqual('ERROR', entry['severity'])
+        self.assertEqual('catwantfood', entry['event_type'])
+
+    @mock.patch.object(objects, 'NodeHistory', autospec=True)
+    def test_record_node_history_noop(self, mock_history):
+        CONF.set_override('node_history', False, group='conductor')
+        self.assertIsNone(conductor_utils.node_history_record(self.node))
+        mock_history.assert_not_called()
+
+    @mock.patch.object(objects, 'NodeHistory', autospec=True)
+    def test_record_node_history_disabled(self, mock_history):
+        CONF.set_override('node_history', False, group='conductor')
+        mock_create = mock.Mock()
+        conductor_utils.node_history_record(self.node, event='meow',
+                                            error=True)
+        self.assertEqual('meow', self.node.last_error)
+        mock_history.create = mock_create
+        mock_history.assert_not_called()
+        mock_create.assert_not_called()
diff --git a/ironic/tests/unit/conductor/test_verify.py b/ironic/tests/unit/conductor/test_verify.py
index 0e29f87f5..a898ba436 100644
--- a/ironic/tests/unit/conductor/test_verify.py
+++ b/ironic/tests/unit/conductor/test_verify.py
@@ -27,6 +27,7 @@ from ironic.common import states
 from ironic.conductor import task_manager
 from ironic.conductor import utils as conductor_utils
 from ironic.conductor import verify
+from ironic import objects
 from ironic.tests.unit.conductor import mgr_utils
 from ironic.tests.unit.db import base as db_base
 from ironic.tests.unit.objects import utils as obj_utils
@@ -138,6 +139,11 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertEqual(states.ENROLL, node.provision_state)
         self.assertIsNone(node.target_provision_state)
         self.assertTrue(node.last_error)
+        history = objects.NodeHistory.list_by_node_id(self.context,
+                                                      node.id)
+        entry = history[0]
+        self.assertEqual('verify', entry['event_type'])
+        self.assertEqual('ERROR', entry['severity'])
 
     @mock.patch.object(conductor_utils, 'LOG', autospec=True)
     @mock.patch('ironic.drivers.modules.fake.FakePower.validate',