diff options
Diffstat (limited to 'src/metrics/scoreboard.py')
-rw-r--r-- | src/metrics/scoreboard.py | 568 |
1 files changed, 568 insertions, 0 deletions
diff --git a/src/metrics/scoreboard.py b/src/metrics/scoreboard.py new file mode 100644 index 0000000..df37ac9 --- /dev/null +++ b/src/metrics/scoreboard.py @@ -0,0 +1,568 @@ +import copy +import math +import psutil + +from collections import namedtuple + +from mod_wsgi import server_metrics as raw_server_metrics + +SERVER_READY = '_' +SERVER_STARTING = 'S' +SERVER_BUSY_READ = 'R' +SERVER_BUSY_WRITE = 'W' +SERVER_BUST_KEEPALIVE = 'K' +SERVER_BUSY_LOG = 'L' +SERVER_BUSY_DNS = 'D' +SERVER_CLOSING = 'C' +SERVER_GRACEFUL = 'G' +SERVER_IDLE_KILL = 'I' +SERVER_DEAD = '.' + +WORKER_STATUS = { + SERVER_READY: 'Ready', + SERVER_STARTING: 'Starting', + SERVER_BUSY_READ: 'Read', + SERVER_BUSY_WRITE: 'Write', + SERVER_BUST_KEEPALIVE: 'Keepalive', + SERVER_BUSY_LOG: 'Logging', + SERVER_BUSY_DNS: 'DNS lookup', + SERVER_CLOSING: 'Closing', + SERVER_GRACEFUL: 'Graceful', + SERVER_IDLE_KILL: 'Dying', + SERVER_DEAD: 'Dead' +} + +def server_metrics(): + """Returns server metrics, which are a combination of data from the + raw mod_wsgi server metrics, along with further data derived from + that raw data. + + """ + + workers_busy = 0 + workers_idle = 0 + + access_count = 0 + bytes_served = 0 + + active_processes = 0 + + # Grab the raw server metrics. + + result = raw_server_metrics() + + # Loop over all the processes and workers they contain aggregating + # various details. + + for process in result['processes']: + process['active_workers'] = 0 + + for worker in process['workers']: + # Here we determine whether a worker is busy or idle. + + status = worker['status'] + + if not process['quiescing'] and process['pid']: + if (status == SERVER_READY and process['generation'] == + result['running_generation']): + + process['active_workers'] += 1 + workers_idle += 1 + + elif status not in (SERVER_DEAD, SERVER_STARTING, + SERVER_IDLE_KILL): + + process['active_workers'] += 1 + workers_busy += 1 + + # Here we aggregate number of requests served and + # amount of bytes transferred. + + count = worker['access_count'] + + if count or status not in (SERVER_READY, SERVER_DEAD): + access_count += count + bytes_served += worker['bytes_served'] + + if process['active_workers']: + active_processes += 1 + + result['workers_busy'] = workers_busy + result['workers_idle'] = workers_idle + + result['access_count'] = access_count + result['bytes_served'] = bytes_served + + result['active_processes'] = active_processes + + return result + +RequestSample = namedtuple('RequestSample', 'start_time duration') + +class Scoreboard(object): + + """Container for holding selected server metrics accumulated from + multiple samples making up a sampling period. + + """ + + system_frequency = 1 + + def __init__(self): + # Setup the starting values. We need to grab an initial + # set of server metrics as a reference point for certain + # values. + + data = server_metrics() + + # Start of the period will be the time we just generated + # the initial server metrics used as a reference. + + self.period_start = data['current_time'] + + # The current end time for the period always starts out + # as the same as the start time. + + self.period_end = self.period_start + + # Sample periods count tracks how many consecutive sample + # periods have been run which have been chained together. + + self.sample_periods = 1 + + # Sample count tracks how many samples have been collected + # against this sample period. + + self.sample_count = 0 + + # Sampler exiting flag indicates whether this is the final + # sampling period to be reported on due to the sampler + # exiting due to process shutdown or some other event. + + self.sampler_exiting = False + + # The server and thread limits are the maximum number of + # processes and workers per process that can be created. + # In practice the number of workers per process is always + # fixed at the thread limit as Apache doesn't dynamically + # adjust the number of running workers per process and + # instead always creates the maximum number and leaves it + # at that for the life of the process. + + self.server_limit = data['server_limit'] + self.thread_limit = data['thread_limit'] + + # Active processes is how many Apache child processes + # currently contain active workers. This is used between + # samples, to determine whether relative to the last + # sample, the number of processes increased or decreased. + + self.active_processes = 0 + + # Running counters of the total number of running, starting + # or stopped processes across all samples. The count of + # running processes is used to determine the average number + # of processes running for the whole sample period. The + # counts of starting and stopping are used in reflecting + # the amount of process churn. + + self.processes_running_count = 0 + self.processes_started_count = 0 + self.processes_stopped_count = 0 + + # Running counters of the total number of idle and busy + # workers across all samples. These counts are used to + # detemine the average number of workers in each state + # for the whole sample period. + + self.workers_idle_count = 0 + self.workers_busy_count = 0 + + # Running counters of the actual workers statuses across + # all samples. These counts are used to detemine the + # average number of workers in each state for the whole + # sample period. The statues are a more fine grained + # depiction of the worker state compared to the summary + # state of idle or busy. + + self.workers_status_count = dict.fromkeys(WORKER_STATUS.keys(), 0) + + # Access count is the number of completed requests that + # have been handled by Apache. We have the total and a + # delta for the current sampling period. + + self.access_count_total = data['access_count'] + self.access_count_delta = 0 + + # Bytes served is the number of bytes which have been + # transferred by Apache. We have the total and a delta + # for the current sampling period. + + self.bytes_served_total = data['bytes_served'] + self.bytes_served_delta = 0 + + # Request samples is a list of details for a subset of + # requests derived from the server metrics. It is not + # possible to collect the details of every request. We + # can only even get samples where we see a worker, at the + # time of the sample, which hasn't yet started a new + # request and so can extract the details from the last + # request that the worker handled. If a worker is + # handling multiple requests between sample periods, we + # also only get the opportunity to see the details for + # the last one handled. The number of request samples + # should be bounded by the number of workers times the + # number of samples in the sample period. + + self.request_samples = [] + + # Process system info records details of any processes + # such as memory, CPU usage and context switches. + + self.processes_system_info = {} + + @property + def duration(self): + """The duration of the sampling period. + + """ + + return self.period_end - self.period_start + + @property + def processes_running(self): + if self.sample_count == 0: + return 0 + + return math.ceil(float(self.processes_running_count) / + self.sample_count) + + @property + def workers_idle(self): + if self.sample_count == 0: + return 0 + + return math.ceil(float(self.workers_idle_count) / self.sample_count) + + @property + def workers_busy(self): + if self.sample_count == 0: + return 0 + + return math.ceil(float(self.workers_busy_count) / self.sample_count) + + @property + def workers_utilization(self): + if self.sample_count == 0: + return 0 + + return (float(self.workers_busy_count) / self.sample_count) / ( + self.server_limit * self.thread_limit) + + @property + def workers_status(self): + result = {} + + if self.sample_count == 0: + return result + + total = 0 + + for value in self.workers_status_count.values(): + value = float(value) / self.sample_count + total += value + + if total: + for key, value in self.workers_status_count.items(): + if key != SERVER_DEAD and value != 0: + label = WORKER_STATUS.get(key, 'Unknown') + value = float(value) / self.sample_count + result[label] = (value / total) * total + + return result + + @property + def request_percentiles(self): + result = {} + + # Calculate from the set of sampled requests the average + # and percentile metrics. + + requests = self.request_samples + + if requests: + requests.sort(key=lambda e: e.duration) + + total = sum([x.duration for x in requests]) + + # Chart as 'Average'. + + result['Average'] = total/len(requests) + + idx50 = int(0.50 * len(requests)) + result['Median'] = requests[idx50].duration + + idx95 = int(0.95 * len(requests)) + result['95%'] = requests[idx95].duration + + idx99 = int(0.99 * len(requests)) + result['99%'] = requests[idx99].duration + + return result + + @property + def request_samples_quality(self): + if self.access_count_delta == 0: + return 0.0 + + return float(len(self.request_samples)) / self.access_count_delta + + def update(self, rollover=False,exiting=False): + """Updates the scoreboard values for the current sampling + period by incorporating current server metrics. + + """ + + # Grab the current server metrics. + + data = server_metrics() + + # Update times for current sampling period and number of + # samples taken. + + sample_start = self.period_end + sample_end = data['current_time'] + sample_duration = max(0, sample_end - sample_start) + + self.period_end = sample_end + + # Calculate changes in access count and bytes served since + # the last sample. + + access_count_total = data['access_count'] + access_count_delta = access_count_total - self.access_count_total + + self.access_count_delta += access_count_delta + self.access_count_total = access_count_total + + bytes_served_total = data['bytes_served'] + bytes_served_delta = bytes_served_total - self.bytes_served_total + + self.bytes_served_delta += bytes_served_delta + self.bytes_served_total = bytes_served_total + + # Collect request samples. The requests must have completed + # since the last sample time and the worker must not have + # already started on a new request. + + for process in data['processes']: + for worker in process['workers']: + start_time = worker['start_time'] + stop_time = worker['stop_time'] + + if (stop_time > start_time and sample_start < stop_time + and stop_time <= sample_end): + + self.request_samples.append(RequestSample( + start_time=start_time, + duration=stop_time-start_time)) + + # Calculate changes in the number of active, starting and + # stopping processes, and the number of idle and busy workers. + + current_active_processes = data['active_processes'] + previous_active_processes = self.active_processes + + self.active_processes = current_active_processes + self.processes_running_count += current_active_processes + + if current_active_processes > previous_active_processes: + self.processes_started_count += (current_active_processes - + previous_active_processes) + + elif current_active_processes < previous_active_processes: + self.processes_stopped_count += (previous_active_processes - + current_active_processes) + + self.workers_idle_count += data['workers_idle'] + self.workers_busy_count += data['workers_busy'] + + for process in data['processes']: + for worker in process['workers']: + self.workers_status_count[worker['status']] += 1 + + # Record details about state of processes. + + if self.sample_count % self.system_frequency == 0 or rollover: + + # First we mark all process entries as being dead. We + # will then mark as alive those which truly are. + + for details in self.processes_system_info.values(): + details['dead'] = True + + for process in data['processes']: + pid = process['pid'] + + if pid == 0: + continue + + details = self.processes_system_info.get(pid) + + if details is None: + details = dict(pid=pid) + + details['duration'] = 0.0 + + details['cpu_times'] = None + details['cpu_user_time'] = 0.0 + details['cpu_system_time'] = 0.0 + + details['ctx_switches'] = None + details['ctx_switch_voluntary'] = 0 + details['ctx_switch_involuntary'] = 0 + + details['dead'] = False + + try: + p = psutil.Process(pid) + + except psutil.NoSuchProcess: + details['dead'] = True + + continue + + try: + rss, vms = p.memory_info() + + details['memory_rss'] = rss + details['memory_vms'] = vms + + except psutil.AccessDenied: + details['dead'] = True + + continue + + except Exception: + raise + + try: + cpu_times = p.cpu_times() + + if details['cpu_times'] is None: + details['cpu_times'] = cpu_times + + # Note that we don't want to baseline CPU usage + # at zero the first time we see the process, as we + # want to capture any work performed in doing any + # startup initialisation of the process. This + # would occur before the first time we see it. + # Thus populate CPU usage with the initial values. + # Is slight risk that we may in part apportion + # this to the wrong sampling period if didn't fall + # within the sample, but nothing we can do about + # that. + + details['cpu_user_time'] = cpu_times[0] + details['cpu_system_time'] = cpu_times[1] + + else: + user_time = cpu_times[0] - details['cpu_times'][0] + system_time = cpu_times[1] - details['cpu_times'][1] + + details['cpu_times'] = cpu_times + details['cpu_user_time'] += user_time + details['cpu_system_time'] += system_time + + except psutil.AccessDenied: + details['dead'] = True + + continue + + except Exception: + raise + + try: + ctx_switches = p.num_ctx_switches() + + if details['ctx_switches'] is None: + details['ctx_switches'] = ctx_switches + + else: + voluntary = (ctx_switches.voluntary - + details['ctx_switches'].voluntary) + involuntary = (ctx_switches.involuntary - + details['ctx_switches'].involuntary) + + details['ctx_switches'] = ctx_switches + details['ctx_switch_voluntary'] += voluntary + details['ctx_switch_involuntary'] += involuntary + + except psutil.AccessDenied: + details['dead'] = True + + continue + + except Exception: + raise + + details['duration'] += sample_duration + + self.processes_system_info[pid] = details + + # Update the flag indicating whether the sampler is exiting + # and this is the final sampling period data to be supplied. + + self.sampler_exiting = exiting + + self.sample_count += 1 + + def rollover(self): + """Creates a copy of the current scoreboard and resets any + attributes back to initial values where appropriate for the + start of a new sampling period. + + """ + + # Create a copy. A shallow copy is enough. + + scoreboard = copy.deepcopy(self) + + # Reset selected attributes back to initial values. + + scoreboard.period_start = scoreboard.period_end + + scoreboard.sample_count = 0; + + scoreboard.access_count_delta = 0 + scoreboard.bytes_served_delta = 0 + + scoreboard.processes_running_count = 0 + scoreboard.processes_started_count = 0 + scoreboard.processes_stopped_count = 0 + + scoreboard.workers_idle_count = 0 + scoreboard.workers_busy_count = 0 + + scoreboard.workers_status_count = dict.fromkeys( + WORKER_STATUS.keys(), 0) + + scoreboard.request_samples = [] + + # For record of processes, we want to remove just the dead ones. + + for pid, details in list(scoreboard.processes_system_info.items()): + if details['dead']: + del scoreboard.processes_system_info[pid] + else: + details['duration'] = 0.0 + details['cpu_user_time'] = 0.0 + details['cpu_system_time'] = 0.0 + details['ctx_switch_voluntary'] = 0 + details['ctx_switch_involuntary'] = 0 + + # Increment the count of successive sampling periods. + + scoreboard.sample_periods += 1 + + return scoreboard |