import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path

import requests


@dataclass
class CategoryGroup:
name: str
categories: list


frontend_categories = [
"meta",
"nn",
"linalg",
"cpp",
"python",
"complex",
"vmap",
"autograd",
"build",
"memory_format",
"foreach",
"dataloader",
"sparse",
"nested tensor",
"optimizer",
]

pytorch_2_categories = [
"dynamo",
"inductor",
]

# These will all get mapped to quantization
quantization = CategoryGroup(
name="quantization",
categories=[
"quantization",
"AO frontend",
"AO Pruning",
],
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
name="distributed",
categories=[
"distributed",
"distributed (c10d)",
"distributed (composable)",
"distributed (ddp)",
"distributed (fsdp)",
"distributed (rpc)",
"distributed (sharded)",
],
)
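
# Sketch of how a raw label could be folded into its group name (hypothetical
# helper, not part of this module):
#     def normalize_category(category):
#         for group in (quantization, distributed):
#             if category in group.categories:
#                 return group.name
#         return category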

categories = (
[
"Uncategorized",
"lazy",
"hub",
"mobile",
"jit",
"visualization",
"onnx",
"caffe2",
"amd",
"rocm",
"cuda",
"cpu",
"cudnn",
"xla",
"benchmark",
"profiler",
"performance_as_product",
"package",
"dispatcher",
"releng",
"fx",
"code_coverage",
"vulkan",
"skip",
"composability",
# 2.0 release
"mps",
"intel",
"functorch",
"gnn",
"distributions",
"serialization",
]
+ [f"{category}_frontend" for category in frontend_categories]
+ pytorch_2_categories
+ [quantization.name]
+ [distributed.name]
)

topics = [
"bc breaking",
"deprecation",
"new features",
"improvements",
"bug fixes",
"performance",
"docs",
"devs",
"Untopiced",
"not user facing",
"security",
]

Features = namedtuple(
"Features",
["title", "body", "pr_number", "files_changed", "labels", "author", "accepters"],
)
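
# A quick sketch of the record shape (field values are illustrative, not real
# data):
#     Features(title="Fix foo", body="...", pr_number="12345",
#              files_changed=["torch/foo.py"], labels=["release notes: nn"],
#              author="someuser", accepters=("reviewer1",))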


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
        author=dct["author"],
        accepters=tuple(dct["accepters"]),
    )


def features_to_dict(features):
    return dict(features._asdict())
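
# The two helpers are inverses, so a record survives a JSON round trip:
#     features == dict_to_features(features_to_dict(features))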


def run(command):
"""Returns (return-code, stdout, stderr)"""
p = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
output, err = p.communicate()
rc = p.returncode
enc = locale.getpreferredencoding()
output = output.decode(enc)
err = err.decode(enc)
return rc, output.strip(), err.strip()
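
# Example (assumes git is available on PATH):
#     rc, out, err = run("git rev-parse HEAD")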


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r"Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)"
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got multiple PR numbers, using the first one")
    return matches[0]
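
# The trailer this parses appears in merged PyTorch commit messages, e.g.
# (hypothetical PR number):
#     Pull Request resolved: https://github.com/pytorch/pytorch/pull/12345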


def get_ghstack_token():
    pattern = "github_oauth = (.*)"
    with open(Path("~/.ghstackrc").expanduser()) as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()


# Resolved once at import time and reused for every GraphQL request below.
token = get_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    response = requests.post(
        "https://api.github.com/graphql", json={"query": query}, headers=headers
    )
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(  # noqa: TRY002
            f"Query failed to run by returning code of {response.status_code}. {response.json()}"
        )


_ERRORS = []
_MAX_ERROR_LEN = 20


def github_data(pr_number):
query = (
"""
{
repository(owner: "pytorch", name: "pytorch") {
pullRequest(number: %s ) {
author {
login
}
reviews(last: 5, states: APPROVED) {
nodes {
author {
login
}
}
}
labels(first: 10) {
edges {
node {
name
}
}
}
}
}
}
""" # noqa: UP031
% pr_number
)
    result = run_query(query)
    if result.get("errors"):
        global _ERRORS
        _ERRORS.append(result.get("errors"))
        if len(_ERRORS) < _MAX_ERROR_LEN:
            return [], "None", ()
        else:
            raise Exception(  # noqa: TRY002
                f"Got {_MAX_ERROR_LEN} errors: {_ERRORS}, please check if"
                " there is something wrong"
            )
    edges = result["data"]["repository"]["pullRequest"]["labels"]["edges"]
    labels = [edge["node"]["name"] for edge in edges]
    author = result["data"]["repository"]["pullRequest"]["author"]["login"]
    nodes = result["data"]["repository"]["pullRequest"]["reviews"]["nodes"]
    # Use a set to dedup multiple approvals from the same accepter
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))
    return labels, author, accepters
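
# Shape of a successful return value (illustrative labels and logins):
#     (["release notes: nn", "topic: bug fixes"], "someuser", ("reviewer1",))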


def get_features(commit_hash):
title, body, files_changed = (
commit_title(commit_hash),
commit_body(commit_hash),
commit_files_changed(commit_hash),
)
pr_number = parse_pr_number(body, commit_hash, title)
labels = []
author = ""
accepters = ()
if pr_number is not None:
labels, author, accepters = github_data(pr_number)
result = Features(title, body, pr_number, files_changed, labels, author, accepters)
return result


_commit_data_cache = None


def get_commit_data_cache(path="results/data.json"):
global _commit_data_cache
if _commit_data_cache is None:
_commit_data_cache = _CommitDataCache(path)
return _commit_data_cache


class _CommitDataCache:
def __init__(self, path):
self.path = path
self.data = {}
if os.path.exists(path):
self.data = self.read_from_disk()
else:
os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path) as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)
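

# Minimal usage sketch (placeholder sha, not a real commit):
#     cache = get_commit_data_cache()   # backed by results/data.json by default
#     features = cache.get("abc123")    # queries git and GitHub on a cache miss
#     print(features.title, features.labels)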