"""Shared helpers for the PyTorch release notes scripts: category and topic
definitions, commit metadata extraction, and a cached GitHub GraphQL lookup."""
import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path

import requests

@dataclass
class CategoryGroup:
    name: str
    categories: list

frontend_categories = [
    'meta',
    'nn',
    'linalg',
    'cpp',
    'python',
    'complex',
    'vmap',
    'autograd',
    'build',
    'memory_format',
    'foreach',
    'dataloader',
    'sparse',
    'nested tensor',
    'optimizer',
]

pytorch_2_categories = [
    'dynamo',
    'inductor',
]

# These will all get mapped to quantization
quantization = CategoryGroup(
    name="quantization",
    categories=[
        'quantization',
        'AO frontend',
        'AO Pruning',
    ],
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
    name="distributed",
    categories=[
        'distributed',
        'distributed (c10d)',
        'distributed (composable)',
        'distributed (ddp)',
        'distributed (fsdp)',
        'distributed (rpc)',
        'distributed (sharded)',
    ],
)
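
# Minimal sketch of how a CategoryGroup collapses its member labels into a
# single name. `normalize_category` is a hypothetical helper shown only for
# illustration; the rest of this module does not use it.
def normalize_category(label):
    """Map a raw release-notes label to its grouped category name."""
    for group in (quantization, distributed):
        if label in group.categories:
            return group.name
    return label

# e.g. normalize_category('distributed (ddp)') -> 'distributed'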

categories = [
    'Uncategorized',
    'lazy',
    'hub',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'amd',
    'rocm',
    'cuda',
    'cpu',
    'cudnn',
    'xla',
    'benchmark',
    'profiler',
    'performance_as_product',
    'package',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'composability',
    # 2.0 release
    'mps',
    'intel',
    'functorch',
    'gnn',
    'distributions',
    'serialization',
] + [
    f'{category}_frontend' for category in frontend_categories
] + pytorch_2_categories + [quantization.name, distributed.name]

topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
    'not user facing',
    'security',
]

Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
    'author',
    'accepters',
])

def dict_to_features(dct):
    return Features(
        title=dct['title'],
        body=dct['body'],
        pr_number=dct['pr_number'],
        files_changed=dct['files_changed'],
        labels=dct['labels'],
        author=dct['author'],
        accepters=tuple(dct['accepters']))


def features_to_dict(features):
    return dict(features._asdict())
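
# Round-trip sketch (hypothetical values), e.g. for persisting Features as
# JSON and reading them back:
#
#   f = Features(title='Fix foo', body='...', pr_number='12345',
#                files_changed=['a.py'], labels=['release notes: nn'],
#                author='octocat', accepters=('reviewer1',))
#   assert dict_to_features(features_to_dict(f)) == f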

def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
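
# Usage sketch (assumes `git` is on PATH):
#
#   rc, out, err = run('git rev-parse HEAD')
#   head = out if rc == 0 else None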

def commit_body(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%b {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%s {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}'
    ret, out, err = run(cmd)
    return out.split('\n') if ret == 0 else None

def parse_pr_number(body, commit_hash, title):
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got multiple PR numbers, using the first one')
    return matches[0]
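
# Example (hypothetical commit; PR number 12345 is made up):
#
#   body = 'Fix foo\n\nPull Request resolved: https://github.com/pytorch/pytorch/pull/12345'
#   parse_pr_number(body, 'abc123', 'Fix foo')  # -> '12345'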

def get_ghstack_token():
    pattern = 'github_oauth = (.*)'
    with open(Path('~/.ghstackrc').expanduser(), 'r') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]

def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()


token = get_token()
headers = {"Authorization": f"token {token}"}

def run_query(query):
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception(f'Query failed with status code {request.status_code}: {request.json()}')
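
# Quick sanity check (hits the live GitHub API, so it needs a valid token):
#
#   print(run_query('{ viewer { login } }'))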

def github_data(pr_number):
    query = """
    {
      repository(owner: "pytorch", name: "pytorch") {
        pullRequest(number: %s) {
          author {
            login
          }
          reviews(last: 5, states: APPROVED) {
            nodes {
              author {
                login
              }
            }
          }
          labels(first: 10) {
            edges {
              node {
                name
              }
            }
          }
        }
      }
    }
    """ % pr_number
    query = run_query(query)
    if query.get('errors'):
        raise Exception(query['errors'])
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    labels = [edge['node']['name'] for edge in edges]
    author = query['data']['repository']['pullRequest']['author']['login']
    nodes = query['data']['repository']['pullRequest']['reviews']['nodes']
    # use a set to dedupe multiple approvals from the same reviewer
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))
    return labels, author, accepters
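
# Usage sketch (hypothetical PR number):
#
#   labels, author, accepters = github_data('12345')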

def get_features(commit_hash):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash))
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    author = ""
    accepters = tuple()
    if pr_number is not None:
        labels, author, accepters = github_data(pr_number)
    result = Features(title, body, pr_number, files_changed, labels, author, accepters)
    return result
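
# Usage sketch (run from a pytorch checkout; the hash is a placeholder):
#
#   features = get_features('abc123def')
#   print(features.labels)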

_commit_data_cache = None


def get_commit_data_cache(path='results/data.json'):
    global _commit_data_cache
    if _commit_data_cache is None:
        _commit_data_cache = _CommitDataCache(path)
    return _commit_data_cache

class _CommitDataCache:
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()
        else:
            os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, 'w') as f:
            json.dump(data, f)
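
# End-to-end sketch (hypothetical commit hash; results are cached in
# results/data.json):
#
#   cache = get_commit_data_cache()
#   features = cache.get('abc123def')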