Diffstat (limited to 'src/metrics/scoreboard.py')
-rw-r--r--  src/metrics/scoreboard.py  568
1 file changed, 568 insertions, 0 deletions
diff --git a/src/metrics/scoreboard.py b/src/metrics/scoreboard.py
new file mode 100644
index 0000000..df37ac9
--- /dev/null
+++ b/src/metrics/scoreboard.py
@@ -0,0 +1,568 @@
+import copy
+import math
+import psutil
+
+from collections import namedtuple
+
+from mod_wsgi import server_metrics as raw_server_metrics
+
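+# Worker status codes as used in the Apache scoreboard, where each
+# worker slot is represented by a single character.
+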
+SERVER_READY = '_'
+SERVER_STARTING = 'S'
+SERVER_BUSY_READ = 'R'
+SERVER_BUSY_WRITE = 'W'
+SERVER_BUSY_KEEPALIVE = 'K'
+SERVER_BUSY_LOG = 'L'
+SERVER_BUSY_DNS = 'D'
+SERVER_CLOSING = 'C'
+SERVER_GRACEFUL = 'G'
+SERVER_IDLE_KILL = 'I'
+SERVER_DEAD = '.'
+
+WORKER_STATUS = {
+ SERVER_READY: 'Ready',
+ SERVER_STARTING: 'Starting',
+ SERVER_BUSY_READ: 'Read',
+ SERVER_BUSY_WRITE: 'Write',
+    SERVER_BUSY_KEEPALIVE: 'Keepalive',
+ SERVER_BUSY_LOG: 'Logging',
+ SERVER_BUSY_DNS: 'DNS lookup',
+ SERVER_CLOSING: 'Closing',
+ SERVER_GRACEFUL: 'Graceful',
+ SERVER_IDLE_KILL: 'Dying',
+ SERVER_DEAD: 'Dead'
+}
+
+def server_metrics():
+ """Returns server metrics, which are a combination of data from the
+ raw mod_wsgi server metrics, along with further data derived from
+ that raw data.
+
+ """
+
+ workers_busy = 0
+ workers_idle = 0
+
+ access_count = 0
+ bytes_served = 0
+
+ active_processes = 0
+
+ # Grab the raw server metrics.
+
+ result = raw_server_metrics()
+
+    # Loop over all the processes and the workers they contain,
+    # aggregating various details.
+
+ for process in result['processes']:
+ process['active_workers'] = 0
+
+ for worker in process['workers']:
+ # Here we determine whether a worker is busy or idle.
+
+ status = worker['status']
+
+ if not process['quiescing'] and process['pid']:
+ if (status == SERVER_READY and process['generation'] ==
+ result['running_generation']):
+
+ process['active_workers'] += 1
+ workers_idle += 1
+
+ elif status not in (SERVER_DEAD, SERVER_STARTING,
+ SERVER_IDLE_KILL):
+
+ process['active_workers'] += 1
+ workers_busy += 1
+
+            # Here we aggregate the number of requests served and
+            # the number of bytes transferred.
+
+ count = worker['access_count']
+
+ if count or status not in (SERVER_READY, SERVER_DEAD):
+ access_count += count
+ bytes_served += worker['bytes_served']
+
+ if process['active_workers']:
+ active_processes += 1
+
+ result['workers_busy'] = workers_busy
+ result['workers_idle'] = workers_idle
+
+ result['access_count'] = access_count
+ result['bytes_served'] = bytes_served
+
+ result['active_processes'] = active_processes
+
+ return result
+
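+# A single sampled request: the time at which it started and how long
+# it took, derived from a worker's start and stop times.
+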
+RequestSample = namedtuple('RequestSample', 'start_time duration')
+
+class Scoreboard(object):
+
+ """Container for holding selected server metrics accumulated from
+ multiple samples making up a sampling period.
+
+ """
+
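+    # How often, measured in samples, the per process system info
+    # (memory, CPU times and context switches) is refreshed via
+    # psutil. A value of 1 means it is collected on every sample.
+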
+ system_frequency = 1
+
+ def __init__(self):
+ # Setup the starting values. We need to grab an initial
+ # set of server metrics as a reference point for certain
+ # values.
+
+ data = server_metrics()
+
+ # Start of the period will be the time we just generated
+ # the initial server metrics used as a reference.
+
+ self.period_start = data['current_time']
+
+ # The current end time for the period always starts out
+ # as the same as the start time.
+
+ self.period_end = self.period_start
+
+        # The sample periods count tracks how many consecutive
+        # sampling periods have been chained together.
+
+ self.sample_periods = 1
+
+ # Sample count tracks how many samples have been collected
+ # against this sample period.
+
+ self.sample_count = 0
+
+        # The sampler exiting flag indicates whether this is the final
+        # sampling period to be reported on because the sampler is
+        # exiting, whether due to process shutdown or some other event.
+
+ self.sampler_exiting = False
+
+        # The server and thread limits are the maximum number of
+        # processes and workers per process that can be created. In
+        # practice the number of workers per process is always fixed
+        # at the thread limit, as Apache doesn't dynamically adjust
+        # the number of running workers per process, instead always
+        # creating the maximum number and leaving it at that for the
+        # life of the process.
+
+ self.server_limit = data['server_limit']
+ self.thread_limit = data['thread_limit']
+
+        # Active processes is how many Apache child processes
+        # currently contain active workers. This is used between
+        # samples to determine whether, relative to the last sample,
+        # the number of processes increased or decreased.
+
+ self.active_processes = 0
+
+        # Running counters of the total number of running, started
+        # or stopped processes across all samples. The count of
+        # running processes is used to determine the average number
+        # of processes running for the whole sample period. The
+        # counts of started and stopped processes are used to reflect
+        # the amount of process churn.
+
+ self.processes_running_count = 0
+ self.processes_started_count = 0
+ self.processes_stopped_count = 0
+
+        # Running counters of the total number of idle and busy
+        # workers across all samples. These counts are used to
+        # determine the average number of workers in each state
+        # for the whole sample period.
+
+ self.workers_idle_count = 0
+ self.workers_busy_count = 0
+
+        # Running counters of the actual worker statuses across
+        # all samples. These counts are used to determine the
+        # average number of workers in each state for the whole
+        # sample period. The statuses are a more fine grained
+        # depiction of the worker state compared to the summary
+        # state of idle or busy.
+
+ self.workers_status_count = dict.fromkeys(WORKER_STATUS.keys(), 0)
+
+ # Access count is the number of completed requests that
+ # have been handled by Apache. We have the total and a
+ # delta for the current sampling period.
+
+ self.access_count_total = data['access_count']
+ self.access_count_delta = 0
+
+ # Bytes served is the number of bytes which have been
+ # transferred by Apache. We have the total and a delta
+ # for the current sampling period.
+
+ self.bytes_served_total = data['bytes_served']
+ self.bytes_served_delta = 0
+
+        # Request samples is a list of details for a subset of
+        # requests derived from the server metrics. It is not
+        # possible to collect the details of every request. We can
+        # only get a sample when, at the time of the sample, a
+        # worker hasn't yet started on a new request, allowing the
+        # details of the last request that worker handled to be
+        # extracted. If a worker handles multiple requests between
+        # samples, we also only get the opportunity to see the
+        # details of the last one handled. The number of request
+        # samples is therefore bounded by the number of workers
+        # times the number of samples in the sample period.
+
+ self.request_samples = []
+
+        # Process system info records system level details of each
+        # process, such as memory, CPU usage and context switches.
+
+ self.processes_system_info = {}
+
+ @property
+ def duration(self):
+ """The duration of the sampling period.
+
+ """
+
+ return self.period_end - self.period_start
+
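+    # Average number of active processes seen per sample, rounded up
+    # so that any activity at all registers as at least one process.
+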
+ @property
+ def processes_running(self):
+ if self.sample_count == 0:
+ return 0
+
+ return math.ceil(float(self.processes_running_count) /
+ self.sample_count)
+
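+    # Average numbers of idle and busy workers seen per sample, again
+    # rounded up.
+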
+ @property
+ def workers_idle(self):
+ if self.sample_count == 0:
+ return 0
+
+ return math.ceil(float(self.workers_idle_count) / self.sample_count)
+
+ @property
+ def workers_busy(self):
+ if self.sample_count == 0:
+ return 0
+
+ return math.ceil(float(self.workers_busy_count) / self.sample_count)
+
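+    # Average number of busy workers expressed as a fraction of the
+    # theoretical worker capacity, being the server limit times the
+    # thread limit.
+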
+ @property
+ def workers_utilization(self):
+ if self.sample_count == 0:
+ return 0
+
+ return (float(self.workers_busy_count) / self.sample_count) / (
+ self.server_limit * self.thread_limit)
+
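+    # Average number of workers in each fine grained status across
+    # the sample period, keyed by human readable label. Dead worker
+    # slots are skipped.
+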
+ @property
+ def workers_status(self):
+ result = {}
+
+ if self.sample_count == 0:
+ return result
+
+        for key, value in self.workers_status_count.items():
+            if key != SERVER_DEAD and value != 0:
+                label = WORKER_STATUS.get(key, 'Unknown')
+                result[label] = float(value) / self.sample_count
+
+ return result
+
+ @property
+ def request_percentiles(self):
+ result = {}
+
+ # Calculate from the set of sampled requests the average
+ # and percentile metrics.
+
+ requests = self.request_samples
+
+ if requests:
+ requests.sort(key=lambda e: e.duration)
+
+            total = sum(x.duration for x in requests)
+
+            # Chart as 'Average'.
+
+            result['Average'] = total / len(requests)
+
+ idx50 = int(0.50 * len(requests))
+ result['Median'] = requests[idx50].duration
+
+ idx95 = int(0.95 * len(requests))
+ result['95%'] = requests[idx95].duration
+
+ idx99 = int(0.99 * len(requests))
+ result['99%'] = requests[idx99].duration
+
+ return result
+
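+    # Proportion of the requests completed during the period for
+    # which a sample was actually captured. A value of 1.0 would mean
+    # every completed request was seen.
+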
+ @property
+ def request_samples_quality(self):
+ if self.access_count_delta == 0:
+ return 0.0
+
+ return float(len(self.request_samples)) / self.access_count_delta
+
+    def update(self, rollover=False, exiting=False):
+ """Updates the scoreboard values for the current sampling
+ period by incorporating current server metrics.
+
+ """
+
+ # Grab the current server metrics.
+
+ data = server_metrics()
+
+ # Update times for current sampling period and number of
+ # samples taken.
+
+ sample_start = self.period_end
+ sample_end = data['current_time']
+ sample_duration = max(0, sample_end - sample_start)
+
+ self.period_end = sample_end
+
+ # Calculate changes in access count and bytes served since
+ # the last sample.
+
+ access_count_total = data['access_count']
+ access_count_delta = access_count_total - self.access_count_total
+
+ self.access_count_delta += access_count_delta
+ self.access_count_total = access_count_total
+
+ bytes_served_total = data['bytes_served']
+ bytes_served_delta = bytes_served_total - self.bytes_served_total
+
+ self.bytes_served_delta += bytes_served_delta
+ self.bytes_served_total = bytes_served_total
+
+ # Collect request samples. The requests must have completed
+ # since the last sample time and the worker must not have
+ # already started on a new request.
+
+ for process in data['processes']:
+ for worker in process['workers']:
+ start_time = worker['start_time']
+ stop_time = worker['stop_time']
+
+ if (stop_time > start_time and sample_start < stop_time
+ and stop_time <= sample_end):
+
+ self.request_samples.append(RequestSample(
+ start_time=start_time,
+ duration=stop_time-start_time))
+
+ # Calculate changes in the number of active, starting and
+ # stopping processes, and the number of idle and busy workers.
+
+ current_active_processes = data['active_processes']
+ previous_active_processes = self.active_processes
+
+ self.active_processes = current_active_processes
+ self.processes_running_count += current_active_processes
+
+ if current_active_processes > previous_active_processes:
+ self.processes_started_count += (current_active_processes -
+ previous_active_processes)
+
+ elif current_active_processes < previous_active_processes:
+ self.processes_stopped_count += (previous_active_processes -
+ current_active_processes)
+
+ self.workers_idle_count += data['workers_idle']
+ self.workers_busy_count += data['workers_busy']
+
+ for process in data['processes']:
+ for worker in process['workers']:
+ self.workers_status_count[worker['status']] += 1
+
+        # Record the system level state of the processes (memory, CPU
+        # usage and context switches), but only every so many samples
+        # as dictated by the system frequency, or when the sampling
+        # period is being rolled over.
+
+        if self.sample_count % self.system_frequency == 0 or rollover:
+
+ # First we mark all process entries as being dead. We
+ # will then mark as alive those which truly are.
+
+ for details in self.processes_system_info.values():
+ details['dead'] = True
+
+ for process in data['processes']:
+ pid = process['pid']
+
+ if pid == 0:
+ continue
+
+ details = self.processes_system_info.get(pid)
+
+ if details is None:
+ details = dict(pid=pid)
+
+ details['duration'] = 0.0
+
+ details['cpu_times'] = None
+ details['cpu_user_time'] = 0.0
+ details['cpu_system_time'] = 0.0
+
+ details['ctx_switches'] = None
+ details['ctx_switch_voluntary'] = 0
+ details['ctx_switch_involuntary'] = 0
+
+ details['dead'] = False
+
+ try:
+ p = psutil.Process(pid)
+
+ except psutil.NoSuchProcess:
+ details['dead'] = True
+
+ continue
+
+                try:
+                    # Use attribute access here, as the result of
+                    # memory_info() can have more than two fields
+                    # depending on the platform and psutil version,
+                    # in which case tuple unpacking would fail.
+
+                    memory = p.memory_info()
+
+                    details['memory_rss'] = memory.rss
+                    details['memory_vms'] = memory.vms
+
+                except psutil.AccessDenied:
+                    details['dead'] = True
+
+                    continue
+
+                try:
+                    cpu_times = p.cpu_times()
+
+                    if details['cpu_times'] is None:
+                        details['cpu_times'] = cpu_times
+
+                        # Note that we don't want to baseline CPU
+                        # usage at zero the first time we see the
+                        # process, as we want to capture any work
+                        # performed during startup initialisation of
+                        # the process, which would occur before the
+                        # first time we see it. Thus populate CPU
+                        # usage with the initial values. There is a
+                        # slight risk that we may in part apportion
+                        # this to the wrong sampling period if it
+                        # didn't fall within the sample, but nothing
+                        # can be done about that.
+
+                        details['cpu_user_time'] = cpu_times[0]
+                        details['cpu_system_time'] = cpu_times[1]
+
+                    else:
+                        user_time = cpu_times[0] - details['cpu_times'][0]
+                        system_time = cpu_times[1] - details['cpu_times'][1]
+
+                        details['cpu_times'] = cpu_times
+                        details['cpu_user_time'] += user_time
+                        details['cpu_system_time'] += system_time
+
+                except psutil.AccessDenied:
+                    details['dead'] = True
+
+                    continue
+
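+                # Track context switches the same way as CPU times,
+                # baselining on first sight of the process and then
+                # accumulating deltas for the period.
+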
+                try:
+                    ctx_switches = p.num_ctx_switches()
+
+                    if details['ctx_switches'] is None:
+                        details['ctx_switches'] = ctx_switches
+
+                    else:
+                        voluntary = (ctx_switches.voluntary -
+                            details['ctx_switches'].voluntary)
+                        involuntary = (ctx_switches.involuntary -
+                            details['ctx_switches'].involuntary)
+
+                        details['ctx_switches'] = ctx_switches
+                        details['ctx_switch_voluntary'] += voluntary
+                        details['ctx_switch_involuntary'] += involuntary
+
+                except psutil.AccessDenied:
+                    details['dead'] = True
+
+                    continue
+
+                details['duration'] += sample_duration
+
+                self.processes_system_info[pid] = details
+
+ # Update the flag indicating whether the sampler is exiting
+ # and this is the final sampling period data to be supplied.
+
+ self.sampler_exiting = exiting
+
+ self.sample_count += 1
+
+ def rollover(self):
+ """Creates a copy of the current scoreboard and resets any
+ attributes back to initial values where appropriate for the
+ start of a new sampling period.
+
+ """
+
+        # Create a copy. A deep copy is required, as the nested
+        # dictionaries holding per process system info are mutated
+        # when the copy is reset below.
+
+        scoreboard = copy.deepcopy(self)
+
+ # Reset selected attributes back to initial values.
+
+ scoreboard.period_start = scoreboard.period_end
+
+        scoreboard.sample_count = 0
+
+ scoreboard.access_count_delta = 0
+ scoreboard.bytes_served_delta = 0
+
+ scoreboard.processes_running_count = 0
+ scoreboard.processes_started_count = 0
+ scoreboard.processes_stopped_count = 0
+
+ scoreboard.workers_idle_count = 0
+ scoreboard.workers_busy_count = 0
+
+ scoreboard.workers_status_count = dict.fromkeys(
+ WORKER_STATUS.keys(), 0)
+
+ scoreboard.request_samples = []
+
+        # For the record of processes, we want to remove just the
+        # dead ones.
+
+ for pid, details in list(scoreboard.processes_system_info.items()):
+ if details['dead']:
+ del scoreboard.processes_system_info[pid]
+ else:
+ details['duration'] = 0.0
+ details['cpu_user_time'] = 0.0
+ details['cpu_system_time'] = 0.0
+ details['ctx_switch_voluntary'] = 0
+ details['ctx_switch_involuntary'] = 0
+
+ # Increment the count of successive sampling periods.
+
+ scoreboard.sample_periods += 1
+
+ return scoreboard
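+
+# A minimal sketch of how a sampler might drive this class, assuming a
+# periodic timer and a report() callback, neither of which is part of
+# this module:
+#
+#     scoreboard = Scoreboard()
+#
+#     while not exiting:
+#         sleep(interval)
+#         scoreboard.update()
+#
+#         if end_of_period:
+#             replacement = scoreboard.rollover()
+#             report(scoreboard)
+#             scoreboard = replacement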