#!/usr/bin/env python3
# blob: c2e72106d60f3433b9307e0f3b0237cc4578ba82 [file] [log] [blame]
from collections import Counter
from concurrent.futures import Executor, ThreadPoolExecutor
from dataclasses import dataclass
from typing import Iterator
import argparse
import os
import re
import subprocess
# Main idea: use git-diff to compute the file regions touched by a given commit, then use
# git-blame to find who authored those regions prior to the commit. Print out the authors
# sorted by the number of lines they touched. Do everything concurrently because git-blame
# can be a little slow.
def main():
    """Parse the command line and print reviewer suggestions for each commit."""
    arg_parser = argparse.ArgumentParser(
        description='Use git blame to find good reviewers for a change')
    arg_parser.add_argument(
        'commits', nargs='+', help='the changes needing code review')
    arg_parser.add_argument(
        '-U', '--context', type=int, default=2, help='num context lines per hunk')
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true', help='print author info per file')
    options = arg_parser.parse_args()

    # Move to git root directory.
    repo_root = run('git', 'rev-parse', '--show-toplevel')
    os.chdir(repo_root)

    # Get stable SHAs for whatever revision names were given.
    resolved = run('git', 'rev-parse', *options.commits)
    commits = resolved.splitlines()
    print(f'Finding reviewers for {len(commits)} commit(s)')

    # A single pool is shared by all per-file blame jobs.
    with ThreadPoolExecutor() as executor:
        find_reviewers(commits, options.context, options.verbose, executor)
def find_reviewers(commits: list[str], context: int, verbose: bool, executor: Executor):
    """Print a reviewer report for each commit, one commit at a time.

    For every commit a header with its hash and subject is printed, followed
    by the total blamed-line count per author. With ``verbose`` set, a
    per-file breakdown is printed as well. If analysing a commit raises, the
    error is printed and processing moves on to the next commit.
    """
    # Note: we currently do not parallelize across commits because nested usages of
    # ThreadPoolExecutor can too easily lead to pool exhaustion and deadlock.
    underline = '=' * 40
    for commit in commits:
        desc = run('git', 'log', '-1', '--format=%H: %s', commit)
        print(f'{underline}\n{desc}\n{underline}')

        try:
            author_counts = find_reviewers_for_commit(commit, context, executor)
        except Exception as e:
            # Report the problem for this commit and keep going with the rest.
            print(e)
            print()
        else:
            if verbose:
                print('Total')
            print_author_counts(author_counts.total)
            print()
            if not verbose:
                continue
            for file, counts in author_counts.by_file.items():
                print(file)
                print_author_counts(counts)
                print()
@dataclass
class AuthorLineCounts:
    """Blamed-line tallies keyed by author email.

    Each Counter maps an author email to the number of blamed lines
    attributed to that author.
    """

    # Tallies aggregated over every file of the commit.
    total: Counter
    # Tallies for each individual file, keyed by file path.
    by_file: dict[str, Counter]
def find_reviewers_for_commit(commit: str, context: int, executor: Executor) -> AuthorLineCounts:
    """Blame every file modified or deleted by ``commit`` and tally the authors.

    One ``find_reviewers_in_file`` job per file is submitted to ``executor``.

    Raises:
        Exception: if the commit modifies no files, or more than 50 files.
    """
    listing = run('git', 'diff-tree', '-r', '--name-only', '--diff-filter=MD',
                  '--no-commit-id', commit)
    files = listing.splitlines()
    if not files:
        raise Exception(f'No files modified by commit {commit:.10}')
    if len(files) > 50:
        raise Exception(f'Too many files modified by commit {commit:.10}')

    # Submit everything first so the files are blamed concurrently, then
    # collect the results in submission order so they line up with `files`.
    pending = [
        executor.submit(find_reviewers_in_file, commit, context, path)
        for path in files
    ]
    per_file_counts = [job.result() for job in pending]

    authors_by_file = dict(zip(files, per_file_counts))
    authors_total = Counter()
    for file_counts in per_file_counts:
        authors_total += file_counts
    return AuthorLineCounts(authors_total, authors_by_file)
def find_reviewers_in_file(commit: str, context: int, file: str) -> Counter:
    """Count, per author email, the pre-commit lines of ``file`` touched by ``commit``.

    The diff between the commit's parent and the commit gives the touched
    regions; those regions are then blamed in the parent revision. When the
    diff yields no regions the file is assumed to be binary, and the author
    of the last change to it before ``commit`` is returned with a count of 1.
    """
    parent = f'{commit}~'
    diff = run('git', 'diff', f'-U{context}', parent, commit, '--', file)
    region_args = [f'-L{r.start},{r.end}' for r in extract_regions(diff)]

    if region_args:
        blame = run('git', 'blame', '--line-porcelain', *region_args, parent, '--', file)
        # --line-porcelain emits one "author-mail <...>" line per blamed line.
        return Counter(re.findall(r'^author-mail <(.*)>$', blame, re.MULTILINE))

    print(f'Assuming binary file: {file}')
    last_modifier = run('git', 'log', '-1', '--format=%ae', parent, '--', file)
    return Counter([last_modifier])
def print_author_counts(counts: Counter):
    """Print one line per author, highest line count first."""
    ranked = counts.most_common()
    for author, count in ranked:
        print(f' {count:4d} {author}')
@dataclass
class Region:
    """An inclusive range of line numbers, as passed to ``git blame -L``."""

    start: int
    end: int


def extract_regions(diff: str) -> Iterator[Region]:
    """Yield the old-file line range of every hunk in a unified diff.

    Only the ``-start[,count]`` half of each hunk header is used, because the
    regions are meant to be blamed in the revision before the change.

    Args:
        diff: unified diff text, e.g. the output of ``git diff``.

    Yields:
        One Region per hunk header, with ``end = start + count - 1``.
    """
    # Example headers: "@@ -50,7 +50,12 @@" and "@@ -3 +3,2 @@".
    # The ",count" part is omitted when the count is 1, so it has to be
    # optional on both sides; otherwise one-line hunks are silently dropped.
    hunk_header = r'^@@ -(\d+)(?:,(\d+))? \+\d+(?:,\d+)? @@'
    for start_text, count_text in re.findall(hunk_header, diff, re.MULTILINE):
        start = int(start_text)
        # An unmatched optional group comes back as '', meaning a count of 1.
        count = int(count_text) if count_text else 1
        yield Region(start, start + count - 1)
def run(*args: str, **kwargs) -> str:
    """Run a command and return its stdout as stripped text.

    Args:
        *args: the program and its arguments, passed without a shell.
        **kwargs: forwarded unchanged to ``subprocess.check_output``.

    Returns:
        The command's stdout, decoded and with surrounding whitespace removed.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    output = subprocess.check_output(args, **kwargs)
    # Command output can echo raw file content (e.g. git blame on a file that
    # is not valid UTF-8). Replace undecodable bytes instead of raising, so a
    # single odd file does not abort the analysis.
    return output.decode(errors='replace').strip()
# Run the command-line tool only when this file is executed as a script.
if __name__ == '__main__':
    main()