#!/usr/bin/env python
"""
    Checker for repeated tokens
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Helper script to find suspicious lexers which produce the same token
    repeatedly, i.e. for example:

    .. code::

      'd'           Text
      'a'           Text
      't'           Text
      'a'           Text
      'b'           Text
      'a'           Text
      's'           Text
      'e'           Text

    This script has two test modes: Check for tokens repeating more often than
    a given threshold, and exclude anything but single-character tokens.
    Repeated single-character tokens are quite problematic as they result in
    bloated output and are usually an indication that someone is missing
    a + or * in the regex.

    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import argparse
import os
import sys
from collections import namedtuple

# One parsed line of an ``.output`` file: the quoted text and the token name
# that follows it.  Built once here instead of on every ``unpack_file`` call.
TextTokenPair = namedtuple('TextTokenPair', ['text', 'token'])


def unpack_file(path):
    """Unpack a file into text, token pairs.

    Every non-blank line is expected to look like ``'<text>'   <Token>``,
    where the text is wrapped in either single or double quotes.  Blank
    lines are skipped.

    :param path: path of the file to read.
    :return: generator yielding ``TextTokenPair(text, token)`` tuples, with
        the escape sequences ``\\n`` and ``\\t`` in the text replaced by the
        real newline / tab characters.
    """
    # Read everything up front inside ``with`` so the handle is closed even
    # when the consumer stops iterating early.
    # NOTE(review): assumes the files are UTF-8 encoded; decoding explicitly
    # keeps the result independent of the platform locale -- confirm.
    with open(path, encoding='utf-8') as source:
        lines = source.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Line can start with ' or ", so the first character tells us which
        # quote is in use; its last occurrence closes the text.
        quotation_end = line.rfind(line[0])
        text = line[1:quotation_end]
        # The token name is the last whitespace-separated field.
        token = line.split()[-1]
        text = text.replace('\\n', '\n')
        text = text.replace('\\t', '\t')
        yield TextTokenPair(text, token)


def check_file(path, threshold, single_only):
    """Check one file for a token that repeats too many times in a row.

    :param path: file to check; printed to stdout if it is suspicious.
    :param threshold: a run of identical tokens longer than this many
        entries marks the file as suspicious.
    :param single_only: if true, only single-character texts are counted;
        any longer text ends the current run.
    :return: ``True`` if the file is fine, ``False`` if it is suspicious.
    """
    current_token = ''
    current_token_repeat_count = 1

    is_suspicious = False

    for value, token in unpack_file(path):
        if single_only and len(value) > 1:
            # A longer text interrupts the run.  Forget the token we were
            # tracking so that the next single-character token starts a
            # fresh run at 1.  (Previously only the loop variable was
            # cleared, which made runs after an interruption count one
            # too many.)
            current_token = ''
            current_token_repeat_count = 1
            continue

        if token != current_token:
            current_token = token
            current_token_repeat_count = 1
        else:
            current_token_repeat_count += 1

        if current_token_repeat_count > threshold:
            is_suspicious = True
            break

    if is_suspicious:
        print(path)

    return not is_suspicious


def main(args):
    """Check every ``.output`` file below ``args.TEST_ROOT``.

    :param args: namespace providing ``TEST_ROOT``, ``threshold`` and
        ``single``, as produced by the argument parser below.
    :return: process exit code, ``1`` if any file is suspicious, else ``0``.
    """
    errors = 0
    for dirpath, _, filenames in os.walk(args.TEST_ROOT):
        for filename in filenames:
            if not filename.endswith('.output'):
                continue

            path = os.path.join(dirpath, filename)
            if not check_file(path, args.threshold, args.single):
                errors += 1

    return 1 if errors > 0 else 0


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('TEST_ROOT',
                        help='Root directory containing the tests')
    parser.add_argument('-t', '--threshold', type=int, default=5,
                        help='Warn if a token repeats itself more often then '
                             'this number.')
    parser.add_argument('-s', '--single', action='store_true', default=False,
                        help='Only look at tokens matching a single character')
    args = parser.parse_args()
    sys.exit(main(args))