| import json |
| import locale |
| import os |
| import re |
| import subprocess |
| from collections import namedtuple |
| from dataclasses import dataclass |
| from pathlib import Path |
| |
| import requests |
| |
| |
| @dataclass |
| class CategoryGroup: |
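|     """A named group of release-note labels that are collapsed into a single category.""" |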
| name: str |
| categories: list |
| |
| |
| frontend_categories = [ |
| "meta", |
| "nn", |
| "linalg", |
| "cpp", |
| "python", |
| "complex", |
| "vmap", |
| "autograd", |
| "build", |
| "memory_format", |
| "foreach", |
| "dataloader", |
| "sparse", |
| "nested tensor", |
| "optimizer", |
| ] |
| |
| pytorch_2_categories = [ |
| "dynamo", |
| "inductor", |
| ] |
| |
| # These will all get mapped to quantization |
| quantization = CategoryGroup( |
| name="quantization", |
| categories=[ |
| "quantization", |
| "AO frontend", |
| "AO Pruning", |
| ], |
| ) |
| |
| # Distributed has several release-note labels that we collapse into a single category |
| distributed = CategoryGroup( |
| name="distributed", |
| categories=[ |
| "distributed", |
| "distributed (c10d)", |
| "distributed (composable)", |
| "distributed (ddp)", |
| "distributed (fsdp)", |
| "distributed (rpc)", |
| "distributed (sharded)", |
| ], |
| ) |
| |
| # Final category list: base categories plus per-frontend, PyTorch 2.0, |
| # quantization, and distributed categories. |
| categories = ( |
| [ |
| "Uncategorized", |
| "lazy", |
| "hub", |
| "mobile", |
| "jit", |
| "visualization", |
| "onnx", |
| "caffe2", |
| "amd", |
| "rocm", |
| "cuda", |
| "cpu", |
| "cudnn", |
| "xla", |
| "benchmark", |
| "profiler", |
| "performance_as_product", |
| "package", |
| "dispatcher", |
| "releng", |
| "fx", |
| "code_coverage", |
| "vulkan", |
| "skip", |
| "composability", |
| # 2.0 release |
| "mps", |
| "intel", |
| "functorch", |
| "gnn", |
| "distributions", |
| "serialization", |
| ] |
| + [f"{category}_frontend" for category in frontend_categories] |
| + pytorch_2_categories |
| + [quantization.name] |
| + [distributed.name] |
| ) |
| |
| |
| # Topics used to classify each commit within a category. |
| topics = [ |
| "bc breaking", |
| "deprecation", |
| "new features", |
| "improvements", |
| "bug fixes", |
| "performance", |
| "docs", |
| "devs", |
| "Untopiced", |
| "not user facing", |
| "security", |
| ] |
| |
| |
| # Per-commit metadata gathered from git and the GitHub API. |
| Features = namedtuple( |
| "Features", |
| ["title", "body", "pr_number", "files_changed", "labels", "author", "accepters"], |
| ) |
| |
| |
| def dict_to_features(dct): |
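|     """Rebuild a Features namedtuple from a plain dict (e.g. one loaded from the JSON cache).""" |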
| return Features( |
| title=dct["title"], |
| body=dct["body"], |
| pr_number=dct["pr_number"], |
| files_changed=dct["files_changed"], |
| labels=dct["labels"], |
| author=dct["author"], |
| accepters=tuple(dct["accepters"]), |
| ) |
| |
| |
| def features_to_dict(features): |
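|     """Convert a Features namedtuple into a plain dict.""" |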
| return dict(features._asdict()) |
| |
| |
| def run(command): |
| """Returns (return-code, stdout, stderr)""" |
| p = subprocess.Popen( |
| command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True |
| ) |
| output, err = p.communicate() |
| rc = p.returncode |
| enc = locale.getpreferredencoding() |
| output = output.decode(enc) |
| err = err.decode(enc) |
| return rc, output.strip(), err.strip() |
| |
| |
| def commit_body(commit_hash): |
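|     """Return the commit message body of commit_hash, or None if the git call fails.""" |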
| cmd = f"git log -n 1 --pretty=format:%b {commit_hash}" |
| ret, out, err = run(cmd) |
| return out if ret == 0 else None |
| |
| |
| def commit_title(commit_hash): |
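|     """Return the one-line commit subject of commit_hash, or None if the git call fails.""" |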
| cmd = f"git log -n 1 --pretty=format:%s {commit_hash}" |
| ret, out, err = run(cmd) |
| return out if ret == 0 else None |
| |
| |
| def commit_files_changed(commit_hash): |
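|     """Return the list of file paths touched by commit_hash, or None if the git call fails.""" |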
| cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}" |
| ret, out, err = run(cmd) |
| return out.split("\n") if ret == 0 else None |
| |
| |
| def parse_pr_number(body, commit_hash, title): |
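|     """Extract the PR number from the "Pull Request resolved:" link in the commit body, if any.""" |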
| regex = r"Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)" |
|     # body may be None if the git call failed |
|     matches = re.findall(regex, body or "") |
|     if len(matches) == 0: |
|         if "revert" not in title.lower() and "updating submodules" not in title.lower(): |
|             print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR") |
|         return None |
|     if len(matches) > 1: |
|         print(f"[{commit_hash}: {title}] Got multiple PR numbers, using the first one") |
|         return matches[0] |
| return matches[0] |
| |
| |
| def get_ghstack_token(): |
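|     """Read the GitHub OAuth token from the user's ~/.ghstackrc config.""" |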
| pattern = "github_oauth = (.*)" |
|     with open(Path("~/.ghstackrc").expanduser()) as f: |
| config = f.read() |
| matches = re.findall(pattern, config) |
| if len(matches) == 0: |
|         raise RuntimeError("Could not find a GitHub OAuth token in ~/.ghstackrc") |
| return matches[0] |
| |
| |
| def get_token(): |
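|     """Prefer GITHUB_TOKEN from the environment, falling back to the ghstack config.""" |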
| env_token = os.environ.get("GITHUB_TOKEN") |
| if env_token is not None: |
| print("using GITHUB_TOKEN from environment variable") |
| return env_token |
| else: |
| return get_ghstack_token() |
| |
| |
| # Resolve the GitHub token once at import time; it is used for every API request. |
| token = get_token() |
| |
| headers = {"Authorization": f"token {token}"} |
| |
| |
| def run_query(query): |
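|     """POST a GraphQL query to the GitHub API and return the parsed JSON response.""" |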
|     response = requests.post( |
|         "https://api.github.com/graphql", json={"query": query}, headers=headers |
|     ) |
|     if response.status_code == 200: |
|         return response.json() |
|     else: |
|         raise Exception(  # noqa: TRY002 |
|             f"Query failed with status code {response.status_code}: {response.json()}" |
|         ) |
| |
| |
| # Accumulated GraphQL errors; give up once _MAX_ERROR_LEN of them have been seen. |
| _ERRORS = [] |
| _MAX_ERROR_LEN = 20 |
| |
| |
| def github_data(pr_number): |
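|     """Fetch a PR's labels, author, and approving reviewers from the GitHub GraphQL API.""" |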
| query = ( |
| """ |
| { |
| repository(owner: "pytorch", name: "pytorch") { |
| pullRequest(number: %s ) { |
| author { |
| login |
| } |
| reviews(last: 5, states: APPROVED) { |
| nodes { |
| author { |
| login |
| } |
| } |
| } |
| labels(first: 10) { |
| edges { |
| node { |
| name |
| } |
| } |
| } |
| } |
| } |
| } |
| """ # noqa: UP031 |
| % pr_number |
| ) |
| query = run_query(query) |
| if query.get("errors"): |
| global _ERRORS |
| _ERRORS.append(query.get("errors")) |
| if len(_ERRORS) < _MAX_ERROR_LEN: |
| return [], "None", () |
| else: |
| raise Exception( # noqa: TRY002 |
| f"Got {_MAX_ERROR_LEN} errors: {_ERRORS}, please check if" |
| " there is something wrong" |
| ) |
| edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"] |
| labels = [edge["node"]["name"] for edge in edges] |
| author = query["data"]["repository"]["pullRequest"]["author"]["login"] |
| nodes = query["data"]["repository"]["pullRequest"]["reviews"]["nodes"] |
| |
|     # use a set to dedupe multiple approvals from the same reviewer |
| accepters = {node["author"]["login"] for node in nodes} |
| accepters = tuple(sorted(accepters)) |
| |
| return labels, author, accepters |
| |
| |
| def get_features(commit_hash): |
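|     """Gather all metadata for a commit: title, body, PR number, files changed, and GitHub data.""" |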
| title, body, files_changed = ( |
| commit_title(commit_hash), |
| commit_body(commit_hash), |
| commit_files_changed(commit_hash), |
| ) |
| pr_number = parse_pr_number(body, commit_hash, title) |
| labels = [] |
| author = "" |
| accepters = () |
| if pr_number is not None: |
| labels, author, accepters = github_data(pr_number) |
| result = Features(title, body, pr_number, files_changed, labels, author, accepters) |
| return result |
| |
| |
| _commit_data_cache = None |
| |
| |
| def get_commit_data_cache(path="results/data.json"): |
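|     """Return the process-wide _CommitDataCache, creating it on first use.""" |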
| global _commit_data_cache |
| if _commit_data_cache is None: |
| _commit_data_cache = _CommitDataCache(path) |
| return _commit_data_cache |
| |
| |
| class _CommitDataCache: |
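|     """Disk-backed cache of per-commit Features, so each commit's GitHub data is fetched only once.""" |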
| def __init__(self, path): |
| self.path = path |
| self.data = {} |
| if os.path.exists(path): |
| self.data = self.read_from_disk() |
| else: |
| os.makedirs(Path(path).parent, exist_ok=True) |
| |
| def get(self, commit): |
|         if commit not in self.data: |
| # Fetch and cache the data |
| self.data[commit] = get_features(commit) |
| self.write_to_disk() |
| return self.data[commit] |
| |
| def read_from_disk(self): |
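|         """Load the JSON cache and convert each entry back into a Features namedtuple.""" |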
| with open(self.path) as f: |
| data = json.load(f) |
| data = {commit: dict_to_features(dct) for commit, dct in data.items()} |
| return data |
| |
| def write_to_disk(self): |
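|         """Serialize the cached Features to JSON on disk.""" |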
| data = {commit: features._asdict() for commit, features in self.data.items()} |
| with open(self.path, "w") as f: |
| json.dump(data, f) |