#!/usr/bin/env python3

#
# (c) Jared Weakly 2017
#
# This file is a utility that facilitates the comparison of performance
# metrics across arbitrary commits. It produces a table comparing metrics
# between measurements taken for the given commits in the given environment
# (which defaults to 'local' if not given by --test-env).
#

import argparse
import re
import subprocess

from testglobals import *
from math import ceil, trunc
from testutil import parse_git_notes

#
# Comparison tools for the test driver to use on performance tests.
#
# The chain of functions looks like this:
# 1. collect_stats() is written in an all.T file by the (human) test writer.
# 2. In the main test execution loop, at some point, the collect_stats()
#    functions are evaluated, parse_git_notes is executed, and the expected
#    values are written into the stats_range_fields dictionary.
# 3. Then the test is run; after it has executed, the relevant values are
#    written to a temporary file in a temporary directory for the test.
# 4. After that, checkStats() is called; it grabs the expected values and
#    then calls evaluate_metric.
# 5. evaluate_metric writes the results of the test to the accumulate_metrics
#    file (which will be written to git notes at the end of the test run) and
#    passes the expected and actual values off to the test_cmp function.
# 6. test_cmp handles the numerical evaluation of whether or not a test passes,
#    as well as the printing of relevant information in the case of failure
#    or a high verbosity level.
#
# It looks a bit scary and complicated but it's not too bad.
# Small note: Step 2 handwaves a bit. There are several execution functions
# which fire depending on how a test is set up.
# However, for performance tests, only compile_and_run is used, which internally
# executes simple_build and simple_run_and_run (both in testutil.py), so it is
# mostly sufficient to consider those three if a closer look is desired.

# These my_ functions are duplicates of functions in testlib.py that I can't
# import here and are mostly a consequence of some semi-ugly refactoring.
def my_passed():
    return {'passFail': 'pass'}

def my_failBecause(reason, tag=None):
    return {'passFail': 'fail', 'reason': reason, 'tag': tag}

# At some point this should be changed to handle tests like so:
# - Upon noticing a  5% regression, leave a comment on Phabricator.
# - Upon noticing a 10% regression, flag the commit for review on Phabricator.
# - Upon noticing a 20% regression, fail the test.
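# A worked example of the acceptance window computed below (the numbers are
# illustrative, not taken from any real test): with expected = 1000 and
# dev = 20, lowerBound = trunc(1000 * 0.80) = 800 and
# upperBound = trunc(0.5 + ceil(1000 * 1.20)) = 1200, so any measured value
# in the closed interval [800, 1200] passes and anything outside it fails
# with the 'too low' / 'too high' messages.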
def test_cmp(full_name, field, val, expected, dev=20):
    result = my_passed()
    lowerBound = trunc(           int(expected) * ((100 - float(dev))/100))
    upperBound = trunc(0.5 + ceil(int(expected) * ((100 + float(dev))/100)))

    deviation = round(((float(val) * 100)/ int(expected)) - 100, 1)

    if val < lowerBound:
        result = my_failBecause('value is too low:\n(If this is because you '
                                'have improved GHC, feel\nfree to ignore '
                                'this error)', 'stat')
    if val > upperBound:
        result = my_failBecause('value is too high:\nstat is not good enough', 'stat')

    if val < lowerBound or val > upperBound or config.verbose >= 4:
        length = max(len(str(x)) for x in [expected, lowerBound, upperBound, val])

        def display(descr, val, extra):
            print(descr, str(val).rjust(length), extra)

        display('    Expected    ' + full_name + ' ' + field + ':', expected, '+/-' + str(dev) + '%')
        display('    Lower bound ' + full_name + ' ' + field + ':', lowerBound, '')
        display('    Upper bound ' + full_name + ' ' + field + ':', upperBound, '')
        display('    Actual      ' + full_name + ' ' + field + ':', val, '')
        if val != expected:
            display('    Deviation   ' + full_name + ' ' + field + ':', deviation, '%')

    return result

# Corresponds to the 'all' setting for the metric parameter of collect_stats.
testing_metrics = ['bytes allocated', 'peak_megabytes_allocated', 'max_bytes_used']

# Defaults to "test everything, and only break on extreme cases".
#
# The inputs to this function are slightly interesting:
# metric can be either:
# - 'all', in which case all 3 possible metrics are collected and compared.
# - The specific metric one wants to use in the test.
# - A list of the metrics one wants to use in the test.
#
# Deviation defaults to 20% because the goal is correctness over performance.
# The testsuite should avoid breaking when there is not an actual error.
# Instead, the testsuite should notify of regressions in a non-breaking manner.
#
# collect_compiler_stats is used when the metrics collected are about the compiler.
# collect_stats is used in the majority case, when the metrics to be collected
# are about the performance of the runtime code generated by the compiler.
def collect_compiler_stats(metric='all', deviation=20):
    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m, d, True)

def collect_stats(metric='all', deviation=20):
    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m, d)
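# Illustrative example of how the hooks above are used (the test name, metric
# and numbers are made up). A performance test in an all.T file registers one
# of these hooks roughly like this:
#
#   test('T1234', [collect_stats('bytes allocated', 5)], compile_and_run, [''])
#
# Assuming a previous run recorded a 'bytes allocated' value of 1000 in the
# 'perf' git note of HEAD^, _collect_stats (below) would then set
#
#   opts.stats_range_fields == {'bytes allocated': ('1000', 5)}
#
# i.e. the expected value paired with the allowed percentage deviation.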
# 'is_compiler_stats_test' is somewhat of an unfortunate name.
# If the boolean is set to true, it indicates that this test is one that
# measures the performance numbers of the compiler.
# As this is a fairly rare case in the testsuite, it defaults to false to
# indicate that it is a 'normal' performance test.
# This is an internal function that is used only in the implementation.
def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
    if not re.match('^[0-9]*[a-zA-Z][a-zA-Z0-9._-]*$', name):
        # my_framework_fail(name, 'bad_name', 'This test has an invalid name')
        my_failBecause('This test has an invalid name.')

    tests = parse_git_notes('perf', 'HEAD^')

    # Might have multiple metrics being measured for a single test.
    test = [t for t in tests if t['test'] == name]

    if tests == [] or test == []: # There are no prior metrics for this test.
        if isinstance(metric, str):
            if metric == 'all':
                for field in testing_metrics:
                    opts.stats_range_fields[field] = (0, 0)
            else:
                opts.stats_range_fields[metric] = (0, 0)
        if isinstance(metric, list):
            for field in metric:
                opts.stats_range_fields[field] = (0, 0)

        return

    if is_compiler_stats_test:
        opts.is_compiler_stats_test = True

    # Compiler performance numbers change when debugging is on, making the
    # results useless and confusing. Therefore, skip if debugging is on.
    if config.compiler_debugged and is_compiler_stats_test:
        opts.skip = 1

    # 'all' is a shorthand to test for bytes allocated, peak megabytes
    # allocated, and max bytes used.
    if isinstance(metric, str):
        if metric == 'all':
            for field in testing_metrics:
                # As there might be multiple "duplicates" of a test, the list
                # comprehension considers the latest (i.e. the last item) to be
                # the one we care about.
                # (Ideally the list comprehension would result in a singleton list.)
                opts.stats_range_fields[field] = ([t['value'] for t in test if t['metric'] == field][-1], deviation)
            return
        else:
            opts.stats_range_fields[metric] = ([t['value'] for t in test if t['metric'] == metric][-1], deviation)
            return

    if isinstance(metric, list):
        for field in metric:
            opts.stats_range_fields[field] = ([t['value'] for t in test if t['metric'] == field][-1], deviation)

def evaluate_metric(opts, test, field, deviation, contents, way):
    full_name = test + ' (' + way + ')'
    (expected, _) = opts.stats_range_fields[field]

    m = re.search(r'\("' + field + r'", "([0-9]+)"\)', contents)
    if m is None:
        print('Failed to find field: ', field)
        return my_failBecause('no such stats field')

    val = int(m.group(1))

    # Add val into the git note if option is set.
    test_env = config.test_env
    config.accumulate_metrics.append('\t'.join([test_env, test, way, field, str(val)]))

    if expected == 0:
        return my_passed()

    return test_cmp(full_name, field, val, expected, deviation)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--test-env",
                        help="The given test environment to be compared.")
    parser.add_argument("--test-name",
                        help="If given, filters the table to include only "
                             "tests matching the given regular expression.")
    parser.add_argument("--min-delta", type=float,
                        help="Display only tests where the relative spread "
                             "is greater than the given value. "
                             "This will not be run if you only pass in one commit.")
    parser.add_argument("--add-note", nargs=3,
                        help="Development only. Adds N fake metrics to the given commit. "
                             "If the third argument is not a blank string, this will "
                             "generate different looking fake metrics.")
    parser.add_argument("commits", nargs=argparse.REMAINDER,
                        help="The rest of the arguments will be the commits that will be used.")
    args = parser.parse_args()

    env = 'local'
    name = re.compile('.*')
    # metrics is a list of dictionaries of the form
    # [ {'test_env': 'local', 'test': 'T100', 'way': 'some_way',
    #    'metric': 'some_field', 'value': '1000', 'commit': 'HEAD'} ]
    metrics = []
    singleton_commit = len(args.commits) == 1

    #
    # Main logic of the program when called from the command-line.
    #
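    # Illustrative invocation (the script name and commits below are examples
    # only):
    #
    #   python3 perf_notes.py --test-env local HEAD HEAD~1
    #
    # This reads the 'perf' git notes of both commits, keeps only measurements
    # taken in the 'local' test environment, and prints one table of raw
    # metric values plus one table of percentage deltas relative to the first
    # commit given.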
    if args.commits:
        for c in args.commits:
            metrics += parse_git_notes('perf', c)

    if args.test_env:
        metrics = [test for test in metrics if test['test_env'] == args.test_env]

    if args.test_name:
        name = re.compile(args.test_name)
        metrics = [test for test in metrics if name.search(test.get('test', ''))]

    if args.min_delta:
        delta = args.min_delta

        def cmp(v1, v2):
            if v1 > v2:
                return (100 * (v1 - v2)/v2) > delta
            else:
                return (100 * (v2 - v1)/v1) > delta

        # Pair each measurement of the most recent commit with the
        # measurements of the same test taken at the other commits.
        latest_commit = [test for test in metrics if test['commit'] == args.commits[0]]

        m = []
        for t in latest_commit:
            m += [(t, test) for test in metrics
                  if (t['test'] == test['test']) and (t['commit'] != test['commit'])]

        deltas = []
        for fst, snd in m:
            if cmp(float(fst['value']), float(snd['value'])):
                deltas.append(fst)

        # Throw away the work if we only have one commit passed in.
        # Ugly way to do it but ¯\_(ツ)_/¯
        if not singleton_commit:
            metrics = deltas

    if args.add_note:
        def note_gen(n, commit, delta=''):
            note = []
            # Generates simple fake data. Likely not comprehensive enough to
            # catch all edge cases.
            if not delta:
                for i in range(1, int(int(n)/2)+1):
                    note.append('\t'.join(['local', 'T' + str(i*100), 'some_way', 'some_field', str(i*1000)]))
                for i in range(int(int(n)/2)+1, int(n)+1):
                    note.append('\t'.join(['non-local', 'W' + str(i*100), 'other_way', 'other_field', str(i*100)]))
            if delta:
                for i in range(1, int(int(n)/2)+1):
                    note.append('\t'.join(['local', 'T' + str(i*100), 'some_way', 'some_field', str(i*10)]))
                for i in range(int(int(n)/2)+1, int(n)+1):
                    note.append('\t'.join(['non-local', 'W' + str(i*100), 'other_way', 'other_field', str(i*1)]))
            git_note = subprocess.check_output(["git", "notes", "--ref=perf", "append", commit, "-m", "\n".join(note)])

        note_gen(args.add_note[0], args.add_note[1], args.add_note[2])

    #
    # String utilities for pretty-printing
    #
    string = ''
    for i in args.commits:
        string += '{:18}'
    commits = string.format(*[c[:10] for c in args.commits])
    latest_commit = [test for test in metrics if test['commit'] == args.commits[0]]

    def cmtline(insert):
        return string.format(*[insert for c in args.commits]).strip()

    def header(unit):
        first_line = "{:27}{:30}".format(' ', ' ') + cmtline(unit)
        second_line = ("{:27}{:30}".format('Test', 'Metric') + commits).strip()

        # Test   Metric   c1   c2   c3 ...
        print("-" * (len(second_line)+1))
        print(first_line)
        print(second_line)
        print("-" * (len(second_line)+1))

    def commit_string(test, flag):
        def delta(v1, v2):
            return round((100 * (v1 - v2)/v2), 2)

        i = 0
        string = []
        fmtstr = ""
        for commit in args.commits:
            fmtstr += "{:18}"
            string += [t['value'] for t in metrics
                       if t['commit'] == args.commits[i] and t['test'] == test]
            i += 1
        string = string[:i]

        if flag == 'metrics':
            return fmtstr.format(*string).strip()
        if flag == 'percentages':
            s = [str(delta(float(string[0]), float(val))) + '%' for val in string]
            return fmtstr.format(*s).strip()

    #
    # The pretty-printed output
    #
    header('commit')
    # Printing out metrics.
    for test in latest_commit:
        print("{:27}{:30}".format(test['test'], test['metric']) + commit_string(test['test'], 'metrics'))

    # Has no meaningful output if there is no commit to compare to.
    if not singleton_commit:
        header('percent')
        # Printing out percentages.
        for test in latest_commit:
            print("{:27}{:30}".format(test['test'], test['metric']) + commit_string(test['test'], 'percentages'))
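    # For reference, each table printed above consists of: a dashed rule, a
    # row repeating the unit label ('commit' or 'percent') once per column, a
    # 'Test  Metric  <sha>...' header holding a 10-character prefix of each
    # commit, another dashed rule, and then one row per (test, metric) pair of
    # the latest commit, holding either the raw values or the percentage
    # deltas computed by commit_string(..., 'percentages').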