"""Shared helpers for the PyTorch release notes scripts: category and topic
definitions, commit metadata extraction, and a cached GitHub GraphQL lookup."""
import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path

import requests

@dataclass
class CategoryGroup:
    name: str
    categories: list

frontend_categories = [
    'meta',
    'nn',
    'linalg',
    'cpp',
    'python',
    'complex',
    'vmap',
    'autograd',
    'build',
    'memory_format',
    'foreach',
    'dataloader',
    'sparse',
    'nested tensor',
    'optimizer',
]

pytorch_2_categories = [
    'dynamo',
    'inductor',
]

# These will all get mapped to quantization
quantization = CategoryGroup(
    name="quantization",
    categories=[
        'quantization',
        'AO frontend',
        'AO Pruning',
    ],
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
    name="distributed",
    categories=[
        'distributed',
        'distributed (c10d)',
        'distributed (composable)',
        'distributed (ddp)',
        'distributed (fsdp)',
        'distributed (rpc)',
        'distributed (sharded)',
    ],
)
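
# Minimal sketch of how a CategoryGroup collapses its member labels into a
# single name. `normalize_category` is a hypothetical helper shown only for
# illustration; the rest of this module does not use it.
def normalize_category(label):
    """Map a raw release-notes label to its grouped category name."""
    for group in (quantization, distributed):
        if label in group.categories:
            return group.name
    return label

# e.g. normalize_category('distributed (ddp)') -> 'distributed'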

categories = [
    'Uncategorized',
    'lazy',
    'hub',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'amd',
    'rocm',
    'cuda',
    'cpu',
    'cudnn',
    'xla',
    'benchmark',
    'profiler',
    'performance_as_product',
    'package',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'composability',
    # 2.0 release
    'mps',
    'intel',
    'functorch',
    'gnn',
    'distributions',
    'serialization',
] + [
    f'{category}_frontend' for category in frontend_categories
] + pytorch_2_categories + [quantization.name, distributed.name]

topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
    'not user facing',
    'security',
]

Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
    'author',
    'accepters',
])

def dict_to_features(dct):
    return Features(
        title=dct['title'],
        body=dct['body'],
        pr_number=dct['pr_number'],
        files_changed=dct['files_changed'],
        labels=dct['labels'],
        author=dct['author'],
        accepters=tuple(dct['accepters']))


def features_to_dict(features):
    return dict(features._asdict())
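
# Round-trip sketch (hypothetical values), e.g. for persisting Features as
# JSON and reading them back:
#
#   f = Features(title='Fix foo', body='...', pr_number='12345',
#                files_changed=['a.py'], labels=['release notes: nn'],
#                author='octocat', accepters=('reviewer1',))
#   assert dict_to_features(features_to_dict(f)) == f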

def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
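
# Usage sketch (assumes `git` is on PATH):
#
#   rc, out, err = run('git rev-parse HEAD')
#   head = out if rc == 0 else None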

def commit_body(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%b {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%s {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}'
    ret, out, err = run(cmd)
    return out.split('\n') if ret == 0 else None

def parse_pr_number(body, commit_hash, title):
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got multiple PR numbers, using the first one')
    return matches[0]
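
# Example (hypothetical commit; PR number 12345 is made up):
#
#   body = 'Fix foo\n\nPull Request resolved: https://github.com/pytorch/pytorch/pull/12345'
#   parse_pr_number(body, 'abc123', 'Fix foo')  # -> '12345'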

def get_ghstack_token():
    pattern = 'github_oauth = (.*)'
    with open(Path('~/.ghstackrc').expanduser(), 'r') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]

def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()


token = get_token()
headers = {"Authorization": f"token {token}"}

def run_query(query):
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception(f'Query failed with status code {request.status_code}: {request.json()}')
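
# Quick sanity check (hits the live GitHub API, so it needs a valid token):
#
#   print(run_query('{ viewer { login } }'))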

def github_data(pr_number):
    query = """
    {
      repository(owner: "pytorch", name: "pytorch") {
        pullRequest(number: %s) {
          author {
            login
          }
          reviews(last: 5, states: APPROVED) {
            nodes {
              author {
                login
              }
            }
          }
          labels(first: 10) {
            edges {
              node {
                name
              }
            }
          }
        }
      }
    }
    """ % pr_number
    query = run_query(query)
    if query.get('errors'):
        raise Exception(query['errors'])
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    labels = [edge['node']['name'] for edge in edges]
    author = query['data']['repository']['pullRequest']['author']['login']
    nodes = query['data']['repository']['pullRequest']['reviews']['nodes']
    # use a set to dedupe multiple approvals from the same reviewer
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))
    return labels, author, accepters
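
# Usage sketch (hypothetical PR number):
#
#   labels, author, accepters = github_data('12345')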

def get_features(commit_hash):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash))
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    author = ""
    accepters = tuple()
    if pr_number is not None:
        labels, author, accepters = github_data(pr_number)
    result = Features(title, body, pr_number, files_changed, labels, author, accepters)
    return result
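
# Usage sketch (run from a pytorch checkout; the hash is a placeholder):
#
#   features = get_features('abc123def')
#   print(features.labels)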

_commit_data_cache = None


def get_commit_data_cache(path='results/data.json'):
    global _commit_data_cache
    if _commit_data_cache is None:
        _commit_data_cache = _CommitDataCache(path)
    return _commit_data_cache

class _CommitDataCache:
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()
        else:
            os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, 'w') as f:
            json.dump(data, f)
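
# End-to-end sketch (hypothetical commit hash; results are cached in
# results/data.json):
#
#   cache = get_commit_data_cache()
#   features = cache.get('abc123def')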