author     James E. Blair <jim@acmegating.com>    2021-09-03 13:23:44 -0700
committer  James E. Blair <jim@acmegating.com>    2021-09-10 10:54:59 -0700
commit     aee6ef6f7f93c3c1dccd0576165d71ac1eecd13e (patch)
tree       f0a8684b7dee96117fd939ce73da0540c9991024 /zuul/nodepool.py
parent     b41f467340f875101ecb366232e5ddda6714e993 (diff)
download   zuul-aee6ef6f7f93c3c1dccd0576165d71ac1eecd13e.tar.gz
Report nodepool resource stats gauges in scheduler
We currently report nodepool resource usage whenever we use or return
nodes. This now happens on the executors, and they don't have a
global view of all nodes used. The schedulers do, and they already
have a periodic stats reporting method.
Shift the reporting of node resource gauges to the scheduler. To make
this efficient, use a tree cache for nodes. Because node records
alone don't have enough information to tie them back to a tenant or
project, use the new user_data field on the Node object to store that
info when we mark a node in use. Also, store the zuul system id on
the node, so that we can ensure we're only reporting nodes that belong
to us.
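
For illustration, here is a minimal standalone sketch (not part of the change
itself) of the user_data payload that useNodeSet() attaches to each node in
the diff below; the system id and build arguments are placeholder values:

    # Sketch of the user_data stored on a Node when it is marked in use.
    # system_id and args stand in for this Zuul's system UUID and the
    # build's Zuul job arguments.
    system_id = "placeholder-system-uuid"
    args = {"zuul": {"tenant": "example-tenant",
                     "project": {"canonical_name": "opendev.org/example/project"}}}

    user_data = dict(
        zuul_system=system_id,
        tenant_name=args["zuul"]["tenant"],
        project_name=args["zuul"]["project"]["canonical_name"],
    )

Any node whose user_data does not carry our system id is then ignored when
reporting.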
Update the node list in the REST API to use the cache as well, and
also filter its results by zuul system id and tenant.
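
As a rough sketch of that filtering (mirroring the new getNodes() generator in
the diff below; the node objects and the iter_owned_nodes() helper here are
hypothetical stand-ins, not Zuul API):

    # Sketch: keep only nodes carrying our system id in user_data, and
    # optionally narrow to a single tenant as the REST node list now does.
    def iter_owned_nodes(nodes, system_id, tenant=None):
        for node in nodes:
            data = getattr(node, "user_data", None)
            if not (isinstance(data, dict) and
                    data.get("zuul_system") == system_id):
                continue  # belongs to another Zuul system (or has no data)
            if tenant is not None and data.get("tenant_name") != tenant:
                continue  # tenant filter for the REST API node list
            yield node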
Depends-On: https://review.opendev.org/807362
Change-Id: I9d0987b250b8fb54b3b937c86db327d255e54abd
Diffstat (limited to 'zuul/nodepool.py')
-rw-r--r--    zuul/nodepool.py    187
1 file changed, 106 insertions(+), 81 deletions(-)
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index 2d3b126e5..86ceaa51f 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -25,18 +25,11 @@ from zuul.zk.exceptions import LockException
 from zuul.zk.nodepool import NodeRequestEvent, ZooKeeperNodepool
 
 
-def add_resources(target, source):
-    for key, value in source.items():
-        target[key] += value
-
-
-def subtract_resources(target, source):
-    for key, value in source.items():
-        target[key] -= value
-
-
 class Nodepool(object):
     log = logging.getLogger('zuul.nodepool')
+    # The kind of resources we report stats on. We need a complete
+    # list in order to report 0 level gauges.
+    resource_types = ('ram', 'cores', 'instances')
 
     def __init__(self, zk_client, system_id, statsd, scheduler=False):
         self._stopped = False
@@ -50,7 +43,8 @@ class Nodepool(object):
         self.zk_nodepool = ZooKeeperNodepool(
             zk_client,
             enable_node_request_cache=True,
-            node_request_event_callback=self._handleNodeRequestEvent)
+            node_request_event_callback=self._handleNodeRequestEvent,
+            enable_node_cache=True)
         self.election = NodepoolEventElection(zk_client)
         self.event_thread = threading.Thread(target=self.runEventElection)
         self.event_thread.daemon = True
@@ -65,9 +59,10 @@ class Nodepool(object):
                 zk_client
             )
 
-        # TODO: remove internal caches for SOS
-        self.current_resources_by_tenant = {}
-        self.current_resources_by_project = {}
+    def addResources(self, target, source):
+        for key, value in source.items():
+            if key in self.resource_types:
+                target[key] += value
 
     def runEventElection(self):
         while not self._stopped:
@@ -113,9 +108,7 @@ class Nodepool(object):
             self.stop_watcher_event.clear()
             self.election_won = False
 
-    def _handleNodeRequestEvent(self, request, event, request_id=None):
-        # TODO (felix): This callback should be wrapped by leader election, so
-        # that only one scheduler puts NodesProvisionedEvents in the queue.
+    def _handleNodeRequestEvent(self, request, event):
         log = get_annotated_logger(self.log, event=request.event_id)
 
         if request.requestor != self.system_id:
@@ -170,22 +163,6 @@ class Nodepool(object):
             pipe.timing(key + '.size.%s' % len(request.labels), dt)
             pipe.send()
 
-    def emitStatsResources(self):
-        if not self.statsd:
-            return
-
-        for tenant, resources in self.current_resources_by_tenant.items():
-            for resource, value in resources.items():
-                key = 'zuul.nodepool.resources.tenant.' \
-                      '{tenant}.{resource}'
-                self.statsd.gauge(key, value, tenant=tenant, resource=resource)
-        for project, resources in self.current_resources_by_project.items():
-            for resource, value in resources.items():
-                key = 'zuul.nodepool.resources.project.' \
-                      '{project}.{resource}'
-                self.statsd.gauge(
-                    key, value, project=project, resource=resource)
-
     def emitStatsResourceCounters(self, tenant, project, resources, duration):
         if not self.statsd:
             return
@@ -297,7 +274,7 @@ class Nodepool(object):
             if node.lock is None:
                 raise Exception("Node %s is not locked" % (node,))
             if node.resources:
-                add_resources(resources, node.resources)
+                self.addResources(resources, node.resources)
             node.state = model.STATE_HOLD
             node.hold_job = " ".join([request.tenant,
                                       request.project,
@@ -337,48 +314,30 @@ class Nodepool(object):
             # _doBuildCompletedEvent, we always want to try to unlock it.
             self.zk_nodepool.unlockHoldRequest(request)
 
-        # When holding a nodeset we need to update the gauges to avoid
-        # leaking resources
-        if tenant and project and resources:
-            subtract_resources(
-                self.current_resources_by_tenant[tenant], resources)
-            subtract_resources(
-                self.current_resources_by_project[project], resources)
-            self.emitStatsResources()
-
-            if duration:
-                self.emitStatsResourceCounters(
-                    tenant, project, resources, duration)
+        if tenant and project and resources and duration:
+            self.emitStatsResourceCounters(
+                tenant, project, resources, duration)
 
     # TODO (felix): Switch back to use a build object here rather than the
     # ansible_job once it's available via ZK.
     def useNodeSet(self, nodeset, ansible_job=None):
         self.log.info("Setting nodeset %s in use", nodeset)
-        resources = defaultdict(int)
+        user_data = None
+        if ansible_job:
+            args = ansible_job.arguments
+            tenant_name = args["zuul"]["tenant"]
+            project_name = args["zuul"]["project"]["canonical_name"]
+            user_data = dict(
+                zuul_system=self.system_id,
+                tenant_name=tenant_name,
+                project_name=project_name,
+            )
 
         for node in nodeset.getNodes():
            if node.lock is None:
                 raise Exception("Node %s is not locked", node)
             node.state = model.STATE_IN_USE
+            node.user_data = user_data
             self.zk_nodepool.storeNode(node)
-            if node.resources:
-                add_resources(resources, node.resources)
-
-        if ansible_job and resources:
-            args = ansible_job.arguments
-            # we have a buildset and thus also tenant and project so we
-            # can emit project specific resource usage stats
-            tenant_name = args["zuul"]["tenant"]
-            project_name = args["zuul"]["project"]["canonical_name"]
-
-            self.current_resources_by_tenant.setdefault(
-                tenant_name, defaultdict(int))
-            self.current_resources_by_project.setdefault(
-                project_name, defaultdict(int))
-
-            add_resources(self.current_resources_by_tenant[tenant_name],
-                          resources)
-            add_resources(self.current_resources_by_project[project_name],
-                          resources)
-            self.emitStatsResources()
 
     # TODO (felix): Switch back to use a build object here rather than the
     # ansible_job once it's available via ZK.
@@ -394,7 +353,7 @@ class Nodepool(object):
             try:
                 if node.state == model.STATE_IN_USE:
                     if node.resources:
-                        add_resources(resources, node.resources)
+                        self.addResources(resources, node.resources)
                     node.state = model.STATE_USED
                     self.zk_nodepool.storeNode(node)
             except Exception:
@@ -415,20 +374,9 @@ class Nodepool(object):
                       "for %s seconds for build %s for project %s",
                       nodeset, len(nodeset.nodes), duration, ansible_job, project)
 
-        # When returning a nodeset we need to update the gauges if we have a
-        # build. Further we calculate resource*duration and increment their
-        # tenant or project specific counters. With that we have both the
-        # current value and also counters to be able to perform accounting.
-        if resources:
-            subtract_resources(
-                self.current_resources_by_tenant[tenant], resources)
-            subtract_resources(
-                self.current_resources_by_project[project], resources)
-            self.emitStatsResources()
-
-            if duration:
-                self.emitStatsResourceCounters(
-                    tenant, project, resources, duration)
+        if resources and duration:
+            self.emitStatsResourceCounters(
+                tenant, project, resources, duration)
 
     def unlockNodeSet(self, nodeset):
         self._unlockNodes(nodeset.getNodes())
@@ -536,3 +484,80 @@ class Nodepool(object):
             req = self.zk_nodepool.getNodeRequest(req_id, cached=True)
             if req.requestor == self.system_id:
                 yield req
+
+    def getNodes(self):
+        """Get all nodes in use or held by Zuul
+
+        Note this relies entirely on the internal cache.
+
+        :returns: An iterator of Node objects in use (or held) by this Zuul
+            system.
+        """
+        for node_id in self.zk_nodepool.getNodes(cached=True):
+            node = self.zk_nodepool.getNode(node_id)
+            if (node.user_data and
+                isinstance(node.user_data, dict) and
+                node.user_data.get('zuul_system') == self.system_id):
+                yield node
+
+    def emitStatsTotals(self, abide):
+        if not self.statsd:
+            return
+
+        total_requests = 0
+        tenant_requests = defaultdict(int)
+        resources_by_tenant = defaultdict(int)
+        resources_by_project = defaultdict(int)
+        empty_resource_dict = dict([(k, 0) for k in self.resource_types])
+
+        # Initialize zero values for gauges
+        for tenant in abide.tenants.values():
+            tenant_requests[tenant.name] = 0
+            resources_by_tenant[tenant.name] = empty_resource_dict.copy()
+            for project in tenant.all_projects:
+                resources_by_project[project.canonical_name] =\
+                    empty_resource_dict.copy()
+
+        # Count node requests
+        for req in self.getNodeRequests():
+            total_requests += 1
+            if not req.tenant_name:
+                continue
+            tenant_requests[req.tenant_name] += 1
+
+        self.statsd.gauge('zuul.nodepool.current_requests',
+                          total_requests)
+        for tenant, request_count in tenant_requests.items():
+            self.statsd.gauge(
+                "zuul.nodepool.tenant.{tenant}.current_requests",
+                request_count,
+                tenant=tenant)
+
+        # Count nodes
+        for node in self.getNodes():
+            if not node.resources:
+                continue
+            project_name = node.user_data.get('project_name')
+            tenant_name = node.user_data.get('tenant_name')
+            if not (project_name and tenant_name):
+                continue
+            if node.state not in {model.STATE_IN_USE,
+                                  model.STATE_USED,
+                                  model.STATE_HOLD}:
+                continue
+            self.addResources(resources_by_tenant[tenant_name],
+                              node.resources)
+            self.addResources(resources_by_project[project_name],
+                              node.resources)
+
+        for tenant, resources in resources_by_tenant.items():
+            for resource, value in resources.items():
+                key = 'zuul.nodepool.resources.tenant.' \
+                      '{tenant}.{resource}'
+                self.statsd.gauge(key, value, tenant=tenant, resource=resource)
+        for project, resources in resources_by_project.items():
+            for resource, value in resources.items():
+                key = 'zuul.nodepool.resources.project.' \
+                      '{project}.{resource}'
+                self.statsd.gauge(
+                    key, value, project=project, resource=resource)
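
For a concrete sense of the aggregation in emitStatsTotals() above, here is a
small self-contained sketch of the zero-initialized gauge pattern; the tenant
name and resource numbers are invented:

    from collections import defaultdict

    resource_types = ('ram', 'cores', 'instances')
    empty_resource_dict = dict([(k, 0) for k in resource_types])

    # Zero-initialize per-tenant totals so an idle tenant reports 0
    # gauges instead of a stale last value.
    resources_by_tenant = defaultdict(int)
    resources_by_tenant['example-tenant'] = empty_resource_dict.copy()

    def add_resources(target, source, types=resource_types):
        # Aggregate only the known resource kinds, as addResources() does.
        for key, value in source.items():
            if key in types:
                target[key] += value

    add_resources(resources_by_tenant['example-tenant'],
                  {'ram': 8192, 'cores': 4, 'instances': 1, 'gpus': 1})
    print(resources_by_tenant['example-tenant'])
    # {'ram': 8192, 'cores': 4, 'instances': 1}  ('gpus' is not reported)

Each entry then becomes a statsd gauge under keys of the form
zuul.nodepool.resources.tenant.<tenant>.<resource> and
zuul.nodepool.resources.project.<project>.<resource>.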