blob: 51fb6fece9b70d657bfe59b807c0fef37013fa2f [file] [log] [blame] [edit]
#!/usr/bin/env python3
# Copyright 2019 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Maps LLVM git SHAs to synthetic revision numbers and back.
Revision numbers are all of the form '(branch_name, r1234)'. As a shorthand,
r1234 is parsed as '(main, r1234)'.
"""
import argparse
from pathlib import Path
import re
import subprocess
import sys
from typing import IO, Iterable, List, NamedTuple, Optional, Tuple, Union
# Branch that a bare `r1234` revision (one with no explicit branch) refers to.
MAIN_BRANCH = "main"

# Note that after base_llvm_sha, we reach The Wild West(TM) of commits.
# So reasonable input that could break us includes:
#
#   Revert foo
#
#   This reverts foo, which had the commit message:
#
#   bar
#   llvm-svn: 375505
#
# While saddening, this is something we should probably try to handle
# reasonably.
base_llvm_revision = 375505
base_llvm_sha = "186155b89c2d2a2f62337081e3ca15f676c9434b"

# Known pairs of [revision, SHA] in ascending order.
# The first element is the first non-`llvm-svn` commit that exists. Later ones
# are functional nops, but speed this script up immensely, since `git` can take
# quite a while to walk >100K commits.
known_llvm_rev_sha_pairs = (
    (base_llvm_revision, base_llvm_sha),
    (425000, "af870e11aed7a5c475ae41a72e3015c4c88597d1"),
    (450000, "906ebd5830e6053b50c52bf098e3586b567e8499"),
    (475000, "530d14a99611a71f8f3eb811920fd7b5c4d4e1f8"),
    (500000, "173855f9b0bdfe45d71272596b510650bfc1ca33"),
)
class LLVMConfig(NamedTuple):
    """Represents an LLVM git checkout."""

    # Name of the LLVM remote. Generally it's "origin".
    remote: str
    # Directory of the LLVM checkout.
    dir: Union[Path, str]
class Rev(NamedTuple("Rev", (("branch", str), ("number", int)))):
    """An LLVM 'revision': a shorthand that identifies an LLVM commit."""

    @staticmethod
    def parse(rev: str) -> "Rev":
        """Builds a Rev out of its string form.

        Accepted forms are '(${branch_name}, r${number})' and the shorthand
        'r${number}', which stands for '(main, r${number})'.

        Raises a ValueError if the string can't be parsed.
        """
        if not rev.startswith("r"):
            # Fully-spelled form: (${branch_name}, r${number}).
            parsed = re.match(r"\((.+), r(\d+)\)", rev)
            if parsed is None:
                raise ValueError("%r isn't a valid revision" % rev)
            return Rev(branch=parsed.group(1), number=int(parsed.group(2)))

        # Shorthand form: everything after the leading 'r' is the number.
        return Rev(branch=MAIN_BRANCH, number=int(rev[1:]))

    def __str__(self) -> str:
        """Formats this Rev in a form that `Rev.parse` accepts."""
        if self.branch == MAIN_BRANCH:
            return "r%d" % self.number
        return "(%s, r%d)" % (self.branch, self.number)
def is_git_sha(xs: str) -> bool:
    """Returns whether the given string looks like a valid git commit SHA.

    A string qualifies if it is 7 to 40 characters long and consists solely
    of ASCII hexadecimal digits (in either case).
    """
    # An explicit ASCII character class is used on purpose: `str.isdigit()`
    # is also true for non-ASCII digits (e.g., superscripts or Arabic-Indic
    # numerals), which can never be part of a git SHA.
    return re.fullmatch(r"[0-9a-fA-F]{7,40}", xs) is not None
def check_output(command: List[str], cwd: Union[Path, str]) -> str:
    """Runs `command` in `cwd` and returns its stdout, decoded as UTF-8.

    The child's stdin is connected to /dev/null. Raises a
    subprocess.CalledProcessError if the command exits with a nonzero status.
    """
    return subprocess.check_output(
        command,
        cwd=cwd,
        stdin=subprocess.DEVNULL,
        encoding="utf-8",
    )
def translate_prebase_sha_to_rev_number(
    llvm_config: LLVMConfig, sha: str
) -> int:
    """Translates a sha to a revision number (e.g., "llvm-svn: 1234").

    The number is read from the final line of the commit's message.

    This function assumes that the given SHA is an ancestor of |base_llvm_sha|.

    Raises a ValueError if that final line isn't an `llvm-svn: N` line.
    """
    git_log = ["git", "log", "-n1", "--format=%B", sha, "--"]
    message_lines = (
        check_output(git_log, cwd=llvm_config.dir).strip().splitlines()
    )
    svn_match = re.match(r"^llvm-svn: (\d+)$", message_lines[-1])
    if svn_match:
        return int(svn_match.group(1))
    raise ValueError(
        f"No llvm-svn line found for {sha}, which... shouldn't happen?"
    )
def translate_sha_to_rev(llvm_config: LLVMConfig, sha_or_ref: str) -> Rev:
    """Translates a sha or git ref to a Rev.

    Commits that descend from one of |known_llvm_rev_sha_pairs| are numbered
    by counting first-parent commits from that known pair. Anything else is
    numbered relative to the `llvm-svn` number of its merge-base with
    |base_llvm_sha|, and is attributed to the single remote branch that
    contains it.

    Raises a ValueError if no branch, or more than one branch, of the
    configured remote contains the commit.
    """
    repo = llvm_config.dir

    def count_first_parent_commits(rev_range: str) -> int:
        # Number of first-parent commits in the given `a..b` range.
        counted = check_output(
            [
                "git",
                "rev-list",
                "--count",
                "--first-parent",
                rev_range,
                "--",
            ],
            cwd=repo,
        )
        return int(counted.strip())

    sha = sha_or_ref
    if not is_git_sha(sha_or_ref):
        sha = check_output(
            ["git", "rev-parse", "--revs-only", sha_or_ref, "--"],
            cwd=repo,
        ).strip()

    # Try the newest known pair first, so the commit walk is as short as
    # possible.
    for known_rev, known_sha in reversed(known_llvm_rev_sha_pairs):
        merge_base = check_output(
            ["git", "merge-base", known_sha, sha, "--"],
            cwd=repo,
        ).strip()
        if merge_base != known_sha:
            continue
        commits_since_known = count_first_parent_commits(f"{known_sha}..{sha}")
        return Rev(branch=MAIN_BRANCH, number=known_rev + commits_since_known)

    # At this point |merge_base| is the merge-base with the oldest known pair,
    # and either:
    # - |merge_base| is |sha| (we have a guaranteed llvm-svn number on |sha|)
    # - |merge_base| is neither (we have a guaranteed llvm-svn number on
    #   |merge_base|, but not |sha|)
    merge_base_number = translate_prebase_sha_to_rev_number(
        llvm_config, merge_base
    )
    if merge_base == sha:
        return Rev(branch=MAIN_BRANCH, number=merge_base_number)

    revision_number = merge_base_number + count_first_parent_commits(
        f"{merge_base}..{sha}"
    )

    remote_branches = check_output(
        ["git", "branch", "-r", "--contains", sha],
        cwd=repo,
    )
    prefix = llvm_config.remote + "/"
    candidates = [
        name[len(prefix) :]
        for name in map(str.strip, remote_branches.splitlines())
        if name.startswith(prefix)
    ]
    if not candidates:
        raise ValueError(
            f"No viable branches found from {llvm_config.remote} with {sha}"
        )

    # It seems that some `origin/release/.*` branches have
    # `origin/upstream/release/.*` equivalents, which is... awkward to deal
    # with. Prefer the latter, since that seems to have newer commits than the
    # former. Technically n^2, but len(elements) should be like, tens in the
    # worst case.
    candidates = [x for x in candidates if f"upstream/{x}" not in candidates]
    if len(candidates) != 1:
        raise ValueError(
            f"Ambiguity: multiple branches from {llvm_config.remote} have "
            f"{sha}: {sorted(candidates)}"
        )
    return Rev(branch=candidates[0], number=revision_number)
def parse_git_commit_messages(
    stream: Union[Iterable[str], IO[str]], separator: str
) -> Iterable[Tuple[str, str]]:
    """Yields (sha, message) pairs parsed from a stream of git log messages.

    The stream is expected to be in the format:

        40 character sha
        commit
        message
        body
        separator
        40 character sha
        commit
        message
        body
        separator

    Message lines are joined back together unmodified; the separator line
    itself is not part of the yielded message.
    """
    lines = iter(stream)
    # Both loops pull from the same iterator: the outer loop reads the SHA line
    # that heads each record, and the inner loop consumes that record's body up
    # to and including its separator.
    for sha_line in lines:
        sha = sha_line.strip()
        assert is_git_sha(sha), f"Invalid git SHA: {sha}"

        body = []
        for line in lines:
            if line.strip() == separator:
                break
            body.append(line)
        yield sha, "".join(body)
def translate_prebase_rev_to_sha(llvm_config: LLVMConfig, rev: Rev) -> str:
    """Translates a Rev to a SHA.

    This function assumes that the given rev refers to a commit that's an
    ancestor of |base_llvm_sha|.

    Raises a ValueError if no commit's message ends with the expected
    `llvm-svn` line, and a subprocess.CalledProcessError if `git log` fails.
    """
    # Because reverts may include reverted commit messages, we can't just |-n1|
    # and pick that. Instead, every match is inspected, and only a commit whose
    # *last* line is the llvm-svn line is accepted.
    marker = ">!" * 80
    wanted_line = f"llvm-svn: {rev.number}"
    log_command = [
        "git",
        "log",
        "--grep",
        f"^{wanted_line}$",
        f"--format=%H%n%B{marker}",
        base_llvm_sha,
    ]

    with subprocess.Popen(
        log_command,
        cwd=llvm_config.dir,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        encoding="utf-8",
    ) as git_log:
        assert git_log.stdout is not None
        for sha, message in parse_git_commit_messages(git_log.stdout, marker):
            if message.splitlines()[-1].strip() != wanted_line:
                continue
            # Found it; there's no need to let git keep walking history.
            git_log.terminate()
            return sha

        exit_code = git_log.wait()
        if exit_code != 0:
            raise subprocess.CalledProcessError(exit_code, log_command)

    raise ValueError(f"No commit with revision {rev} found")
def translate_rev_to_sha_from_baseline(
    llvm_config: LLVMConfig,
    parent_sha: str,
    parent_rev: int,
    child_sha: str,
    child_rev: Optional[int],
    want_rev: int,
    branch_name: str,
) -> str:
    """Translates a revision number between a parent & child to a SHA.

    Args:
        llvm_config: LLVM config to use.
        parent_sha: SHA of the parent that the revision number is a child of.
        parent_rev: Revision number of `parent_sha`.
        child_sha: A child of `parent_sha` to find a rev on.
        child_rev: Optional note of what the child's revision number is. If
            None, it is computed by counting first-parent commits between
            `parent_sha` and `child_sha`.
        want_rev: The desired revision number between child and parent.
        branch_name: Name of the branch to refer to if a ValueError is raised.

    Returns:
        The SHA of the commit with revision number `want_rev`.

    Raises:
        ValueError if the given child isn't far enough away from the parent to
        find `want_rev`.
    """
    # As a convenience, have a fast path for want_rev < parent_rev. In
    # particular, branches can hit this case.
    if want_rev < parent_rev:
        baseline_git_sha = parent_sha
        commits_behind_baseline = parent_rev - want_rev
    else:
        if child_rev is None:
            commits_between_parent_and_child = check_output(
                [
                    "git",
                    "rev-list",
                    "--count",
                    "--first-parent",
                    f"{parent_sha}..{child_sha}",
                    "--",
                ],
                cwd=llvm_config.dir,
            )
            child_rev = parent_rev + int(
                commits_between_parent_and_child.strip()
            )
        if child_rev < want_rev:
            # Both literals must be f-strings, so that the requested revision
            # actually shows up in the message.
            raise ValueError(
                f"Revision {want_rev} is past "
                f"{llvm_config.remote}/{branch_name}. Try updating your tree?"
            )
        baseline_git_sha = child_sha
        commits_behind_baseline = child_rev - want_rev

    if not commits_behind_baseline:
        return baseline_git_sha

    # Walk back from the baseline by the number of commits that separate it
    # from the wanted revision.
    result = check_output(
        [
            "git",
            "rev-parse",
            "--revs-only",
            f"{baseline_git_sha}~{commits_behind_baseline}",
        ],
        cwd=llvm_config.dir,
    )
    return result.strip()
def translate_rev_to_sha(llvm_config: LLVMConfig, rev: Rev) -> str:
    """Translates a Rev to a SHA.

    Revs on non-main branches are resolved relative to the branch's merge
    point with main. Revs on main that are older than |base_llvm_revision| are
    found through their `llvm-svn` commit message line. All other revs on main
    are resolved relative to the closest entry of |known_llvm_rev_sha_pairs|.

    Raises a ValueError if the given Rev doesn't exist in the given config.
    """
    branch, number = rev
    branch_tip = check_output(
        ["git", "rev-parse", "--revs-only", f"{llvm_config.remote}/{branch}"],
        cwd=llvm_config.dir,
    ).strip()

    if branch != MAIN_BRANCH:
        main_merge_point = check_output(
            [
                "git",
                "merge-base",
                f"{llvm_config.remote}/{MAIN_BRANCH}",
                branch_tip,
            ],
            cwd=llvm_config.dir,
        )
        main_merge_point = main_merge_point.strip()
        main_rev = translate_sha_to_rev(llvm_config, main_merge_point)
        return translate_rev_to_sha_from_baseline(
            llvm_config,
            parent_sha=main_merge_point,
            parent_rev=main_rev.number,
            child_sha=branch_tip,
            child_rev=None,
            want_rev=number,
            branch_name=branch,
        )

    if number < base_llvm_revision:
        return translate_prebase_rev_to_sha(llvm_config, rev)

    # Technically this could be a binary search, but the list has fewer than 10
    # elems, and won't grow fast. Linear is easier.
    last_cached_rev = None
    last_cached_sha = branch_tip
    for cached_rev, cached_sha in reversed(known_llvm_rev_sha_pairs):
        if cached_rev == number:
            return cached_sha

        if cached_rev < number:
            # NOTE(review): `git rev-parse --revs-only` appears to print
            # nothing, while still exiting successfully, for a ref it can't
            # resolve. An empty tip must not be used as a SHA: an `a..` range
            # would then be counted against HEAD rather than the remote
            # branch.
            if not last_cached_sha:
                raise ValueError(
                    f"Couldn't resolve {llvm_config.remote}/{branch}; is the "
                    "remote name correct?"
                )
            return translate_rev_to_sha_from_baseline(
                llvm_config,
                parent_sha=cached_sha,
                parent_rev=cached_rev,
                child_sha=last_cached_sha,
                child_rev=last_cached_rev,
                want_rev=number,
                branch_name=branch,
            )

        last_cached_rev = cached_rev
        last_cached_sha = cached_sha

    # This is only hit if `number >= base_llvm_revision` _and_ there's no
    # coverage for `number` in `known_llvm_rev_sha_pairs`, which contains
    # `base_llvm_revision`.
    #
    # Raised explicitly rather than via `assert False`, since assert statements
    # are stripped under `python -O`, which would let this function silently
    # return None.
    raise AssertionError("Couldn't find a base SHA for a rev on main?")
def find_root_llvm_dir(root_dir: str = ".") -> str:
    """Finds the root of an LLVM directory starting at |root_dir|.

    The root is whatever `git rev-parse --show-toplevel` reports when run
    from |root_dir|.

    Raises a subprocess.CalledProcessError if no git directory is found.
    """
    return check_output(
        ["git", "rev-parse", "--show-toplevel"],
        cwd=root_dir,
    ).strip()
def main(argv: List[str]) -> None:
    """Translates between SHAs and revs, as requested by |argv|.

    Exactly one of `--sha` or `--rev` must be given; the translation of the
    given value is printed to stdout.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--llvm_dir",
        help="LLVM directory to consult for git history, etc. Autodetected "
        "if cwd is inside of an LLVM tree",
    )
    parser.add_argument(
        "--upstream",
        default="origin",
        help="LLVM upstream's remote name. Defaults to %(default)s.",
    )
    sha_or_rev = parser.add_mutually_exclusive_group(required=True)
    sha_or_rev.add_argument(
        "--sha", help="A git SHA (or ref) to convert to a rev"
    )
    sha_or_rev.add_argument("--rev", help="A rev to convert into a sha")
    opts = parser.parse_args(argv)

    # An unset (or empty) --llvm_dir means "autodetect from cwd". The result is
    # reused below, so git is consulted only once and a failed autodetection
    # always produces a usage error rather than a traceback.
    llvm_dir = opts.llvm_dir
    if not llvm_dir:
        try:
            llvm_dir = find_root_llvm_dir()
        except subprocess.CalledProcessError:
            # parser.error() prints the message and exits.
            parser.error(
                "Couldn't autodetect an LLVM tree; please use --llvm_dir"
            )

    config = LLVMConfig(
        remote=opts.upstream,
        dir=llvm_dir,
    )
    if opts.sha:
        rev = translate_sha_to_rev(config, opts.sha)
        print(rev)
    else:
        sha = translate_rev_to_sha(config, Rev.parse(opts.rev))
        print(sha)


if __name__ == "__main__":
    main(sys.argv[1:])