#!/usr/bin/env python
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import argparse
import subprocess
import pickle
import os

from os import path
from datetime import date, timedelta
from collections import namedtuple, defaultdict

Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs'])

# Maps each subdirectory to a dict of author -> (number of commits, additions,
# deletions) for that author's commits touching the directory.
DIRECTORY_AUTHORS = defaultdict(dict)

# Cache of directory owners, for memoisation purposes.
OWNERS_CACHE = {}

# Filename for the pickle cache.
CACHE_FILENAME = 'suggest_owners.cache'


def _RunGitCommand(options, cmd_args):
  repo_path = path.join(options.repo_path, '.git')
  cmd = ['git', '--git-dir', repo_path] + cmd_args
  print('>', ' '.join(cmd))
  return subprocess.check_output(cmd)


def _ValidAuthor(author):
  return author.find('@chromium.org') > -1 and author.find('roller') == -1


# Returns additions/deletions by a commit to a directory (and its descendants).
def getEditsForDirectory(commit, directory):
  additions = deletions = 0
  for commit_directory, (directory_additions, directory_deletions) \
      in commit.dirs.items():
    # Check if commit_directory is the same as or a descendant of directory.
    if isSubDirectory(directory, commit_directory):
      additions += directory_additions
      deletions += directory_deletions
  return additions, deletions


# Propagates a commit touching a directory so that it also counts as touching
# all ancestor directories.
def _PropagateCommit(options, commit):
  touched_dirs = set()
  # First collect all the touched dirs and their ancestors.
  for directory in commit.dirs.iterkeys():
    while directory != '':
      touched_dirs.add(directory)
      # Walk up to the parent directory.
      directory = path.dirname(directory)
  # Loop over them and compute the edits per directory.
  for directory in touched_dirs:
    author_commits, author_additions, author_deletions = \
        DIRECTORY_AUTHORS[directory].get(commit.author, (0, 0, 0))
    directory_additions, directory_deletions = \
        getEditsForDirectory(commit, directory)
    DIRECTORY_AUTHORS[directory][commit.author] = \
        (author_commits + 1,
         author_additions + directory_additions,
         author_deletions + directory_deletions)


# Checks if child_directory is the same as or below parent_directory. For some
# reason the os.path module does not have this functionality.
def isSubDirectory(parent_directory, child_directory):
  parent_directory = parent_directory + '/'
  child_directory = child_directory + '/'
  return child_directory.startswith(parent_directory)


def _GetGitLogCmd(options):
  # TODO(mheikal): git-log with --numstat vs --name-only takes 10x the time to
  # complete. It takes >15 mins for git log --numstat to return the 1 year git
  # history of the full repo. Should probably add a script flag to switch off
  # keeping track of the number of modifications per commit.
  date_limit = date.today() - timedelta(days=options.days_ago)
  format_string = '%h,%ae,%cI'
  cmd_args = [
      'log',
      '--since', date_limit.isoformat(),
      '--numstat',
      '--pretty=format:%s' % format_string,
  ]
  # The path filter has to be the last arg.
  if options.subdirectory:
    cmd_args += ['--', options.subdirectory]
  return cmd_args
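# A sketch of the `git log` output that the parsing code below consumes
# (illustrative values only, not real commits): each commit contributes a
# header line from the --pretty format, "<short hash>,<author email>,
# <ISO-8601 commit date>", followed immediately by one --numstat line per
# touched file, "<additions>\t<deletions>\t<path>" ('-' for binary files),
# with a blank line separating commits. Merge commits emit a header with no
# numstat lines, which the fallback in processAllCommits handles.
#
#   1a2b3c4,someone@chromium.org,2018-06-01T10:00:00-07:00
#   10    2    base/android/foo.cc
#   0     5    base/android/bar.h
#
#   5d6e7f8,other@chromium.org,2018-05-31T09:00:00-07:00
#   3     3    chrome/browser/baz.cc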
def _ParseCommitLine(line):
  commit_hash, author, commit_date = line.split(',')
  return Commit(
      hash=commit_hash, author=author, commit_date=commit_date, dirs={})


def _ParseFileStatsLine(current_commit, line):
  try:
    additions, deletions, filepath = line.split('\t')
  except ValueError:
    return False
  if additions == '-':
    additions = 0
  else:
    additions = int(additions)
  if deletions == '-':
    deletions = 0
  else:
    deletions = int(deletions)
  dir_path = path.dirname(filepath)
  commit_additions, commit_deletions = \
      current_commit.dirs.get(dir_path, (0, 0))
  current_commit.dirs[dir_path] = (
      additions + commit_additions, deletions + commit_deletions)
  return True


def processAllCommits(options):
  if not options.subdirectory and options.days_ago > 100:
    print('git log for your query might take > 5 minutes; limit it to a '
          'subdirectory or reduce the number of days of history to low '
          'double digits to make this faster. There is no progress '
          'indicator; the script is simply waiting for a single git log '
          'command to finish.')
  output = _RunGitCommand(options, _GetGitLogCmd(options))
  current_commit = None
  for line in output.splitlines():
    if current_commit is None:
      current_commit = _ParseCommitLine(line)
    else:
      if line == '':
        # All commit details have been read.
        if _ValidAuthor(current_commit.author):
          _PropagateCommit(options, current_commit)
        current_commit = None
      else:
        # Merge commits confuse git-log: they emit no numstat lines. If we
        # fail to parse the line as file stats, the last commit was a merge
        # and this line is actually another commit description line.
        if not _ParseFileStatsLine(current_commit, line):
          current_commit = _ParseCommitLine(line)
  # Process the final commit, if any.
  if current_commit is not None and _ValidAuthor(current_commit.author):
    _PropagateCommit(options, current_commit)


def _CountCommits(directory):
  return sum(
      count for (count, _a, _d) in DIRECTORY_AUTHORS[directory].itervalues())


def _GetOwnerLevel(options, author, directory):
  sorted_owners = sorted(_GetOwners(options, directory), key=lambda (o, l): l)
  for owner, level in sorted_owners:
    if author == owner:
      return level
  return -1


# Returns the owners for a repo subdirectory. This does not understand per-file
# directives.
# TODO(mheikal): use depot_tools owners.py for parsing owners files.
def _GetOwners(options, directory_path):
  if directory_path in OWNERS_CACHE:
    return OWNERS_CACHE[directory_path]
  owners_path = path.join(options.repo_path, directory_path, 'OWNERS')
  owners = set()
  parent_dir = directory_path
  owner_level = 0
  while parent_dir != '':
    if path.isfile(owners_path):
      parsed_owners, noparent = _ParseOwnersFile(options, owners_path)
      owners.update([(owner, owner_level) for owner in parsed_owners])
      owner_level += 1
      if noparent:
        break
    parent_dir = path.dirname(parent_dir)
    owners_path = path.join(options.repo_path, parent_dir, 'OWNERS')
  OWNERS_CACHE[directory_path] = set(owners)
  return owners
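# For reference, the OWNERS parser below handles files that look roughly like
# this (hypothetical contents): plain @chromium.org emails are collected,
# comments and per-file rules are skipped, file:// includes are followed
# relative to the repo root, and 'set noparent' stops the walk up the
# directory tree in _GetOwners.
#
#   # Comment lines are ignored.
#   set noparent
#   someone@chromium.org
#   other@chromium.org
#   per-file *.gni=ignored@chromium.org
#   file://build/OWNERS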
# Parses an OWNERS file; returns the set of owners and whether the file sets
# noparent.
def _ParseOwnersFile(options, filepath):
  owners = set()
  noparent = False
  with open(filepath) as f:
    for line in f.readlines():
      line = line.strip()
      # The script deals with directories, so per-file rules are ignored.
      if line == '' or line[0] == '#' or line.startswith('per-file'):
        continue
      if line.startswith('file://'):
        relpath = line[len('file://'):]
        abspath = path.join(options.repo_path, relpath)
        parsed_owners, _ = _ParseOwnersFile(options, abspath)
        owners.update(parsed_owners)
      if line == 'set noparent':
        noparent = True
      index = line.find('@chromium.org')
      if index > -1:
        owners.add(line[:index + len('@chromium.org')])
  return owners, noparent


# Trivial directories are ones that contain just a single child subdir and
# nothing else.
def _IsTrivialDirectory(options, repo_subdir):
  try:
    return len(os.listdir(path.join(options.repo_path, repo_subdir))) == 1
  except OSError:
    # The directory no longer exists.
    return False


def computeSuggestions(options):
  directory_suggestions = []
  for directory, authors in sorted(
      DIRECTORY_AUTHORS.iteritems(), key=lambda (d, a): d):
    if _IsTrivialDirectory(options, directory):
      continue
    if _CountCommits(directory) < options.dir_commit_limit:
      continue
    # Skip suggestions for directories outside the passed-in directory.
    if (options.subdirectory
        and not isSubDirectory(options.subdirectory, directory)):
      continue
    # Sort authors by descending number of commits.
    sorted_authors = sorted(
        authors.items(), key=lambda (author, details): -details[0])
    # Keep only authors above the limit.
    suggestions = [(a, c) for a, c in sorted_authors
                   if a not in options.ignore_authors
                   and c[0] >= options.author_cl_limit]
    directory_suggestions.append((directory, suggestions))
  return directory_suggestions


def _PrintSettings(options):
  print('Showing directories with at least ({}) commits in the last ({}) '
        'days.'.format(options.dir_commit_limit, options.days_ago))
  print('Showing top ({}) committers who have committed at least ({}) '
        'commits to the directory in the last ({}) days.'.format(
            options.max_suggestions, options.author_cl_limit,
            options.days_ago))
  print('(owner+N) represents the distance through OWNERS files for that '
        'owner.\n')


def printSuggestions(options, directory_suggestions):
  print('\nCommit stats:')
  _PrintSettings(options)
  for directory, suggestions in directory_suggestions:
    print('{}: {} commits in the last {} days'.format(
        directory, _CountCommits(directory), options.days_ago))
    non_owner_suggestions = 0
    for author, (commit_count, additions, deletions) in suggestions:
      owner_level = _GetOwnerLevel(options, author, directory)
      if owner_level > -1:
        owner_string = ' (owner+{})'.format(owner_level)
      else:
        non_owner_suggestions += 1
        owner_string = ''
      print('{}{}, commits: {}, additions: {}, deletions: {}'.format(
          author, owner_string, commit_count, additions, deletions))
      if non_owner_suggestions >= options.max_suggestions:
        break
    print()


def _GetHeadCommitHash(options):
  return _RunGitCommand(options, ['rev-parse', 'HEAD']).strip()


def _GetCacheMetadata(options):
  return _GetHeadCommitHash(options), options.days_ago, options.subdirectory


def _IsCacheValid(options, metadata):
  head_hash, days_ago, cached_subdirectory = metadata
  if head_hash != _GetHeadCommitHash(options):
    return False
  if days_ago != options.days_ago:
    return False
  if (cached_subdirectory is not None
      and not isSubDirectory(cached_subdirectory, options.subdirectory)):
    return False
  return True


def cacheProcessedCommits(options):
  metadata = _GetCacheMetadata(options)
  with open(CACHE_FILENAME, 'w') as f:
    pickle.dump((metadata, DIRECTORY_AUTHORS), f)


def maybeRestoreProcessedCommits(options):
  global DIRECTORY_AUTHORS
  if not path.exists(CACHE_FILENAME):
    return False
  with open(CACHE_FILENAME) as f:
    stored_metadata, cached_directory_authors = pickle.load(f)
  if _IsCacheValid(options, stored_metadata):
    print('Loading from cache')
    DIRECTORY_AUTHORS = cached_directory_authors
    return True
  else:
    print('Cache is stale or invalid, must rerun `git log`')
    return False
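# Note on caching: the pickle in suggest_owners.cache stores the processed
# DIRECTORY_AUTHORS data together with (HEAD hash, days_ago, subdirectory)
# metadata, so a previous run is reused only while HEAD and the query
# parameters still match (see _IsCacheValid). Pass --skip-cache to force a
# fresh `git log`.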
def do(options):
  if options.skip_cache or not maybeRestoreProcessedCommits(options):
    processAllCommits(options)
    cacheProcessedCommits(options)
  directory_suggestions = computeSuggestions(options)
  printSuggestions(options, directory_suggestions)


def main():
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('repo_path')
  parser.add_argument('--days-ago', type=int,
                      help='Number of days of history to search through.',
                      default=365, metavar='DAYS_AGO')
  parser.add_argument('--subdirectory',
                      help='Limit suggestions to this subdirectory.',
                      default='')
  parser.add_argument('--ignore-authors',
                      help='Ignore this comma-separated list of authors.')
  parser.add_argument('--max-suggestions', type=int,
                      help='Maximum number of suggested authors per '
                      'directory.', default=5)
  parser.add_argument('--author-cl-limit', type=int,
                      help='Do not suggest authors who have committed fewer '
                      'than this many commits to the directory in the last '
                      'DAYS_AGO days.', default=10)
  parser.add_argument('--dir-commit-limit', type=int,
                      help='Skip directories with fewer than this many '
                      'commits in the last DAYS_AGO days.', default=100)
  parser.add_argument('--skip-cache', action='store_true',
                      help='Do not read from the cache.', default=False)
  options = parser.parse_args()
  if options.ignore_authors:
    options.ignore_authors = set(
        map(str.strip, options.ignore_authors.split(',')))
  else:
    options.ignore_authors = set()
  do(options)


if __name__ == '__main__':
  main()
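# Example invocations, assuming the script is saved as suggest_owners.py and
# run with Python 2 from a directory where it may write suggest_owners.cache
# (paths are hypothetical; adjust to your checkout):
#
#   ./suggest_owners.py ~/chromium/src --subdirectory chrome/browser
#   ./suggest_owners.py ~/chromium/src --days-ago 90 --author-cl-limit 5 \
#       --ignore-authors foo@chromium.org,bar@chromium.org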