# blob: b4e7dd833093f8d68f35a63fbf0b9e169bbcb709 [file] [log] [blame]
# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Parse SimpleAPI HTML in Starlark.
"""
def parse_simpleapi_html(*, url, content):
    """Parse a Simple API HTML index page into its sdists and wheels.

    Args:
        url(str): The URL that the HTML content can be downloaded from. It is
            used to resolve links that are not already downloadable URLs.
        content(str): The Simple API HTML content.

    Returns:
        A struct with two dicts, each keyed by the sha256 of the artifact:
        * sdists: The artifacts whose filename does not end with ".whl".
        * whls: The artifacts whose filename ends with ".whl".

        Every dict value is a struct with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if the index advertises it.
          If this is present, then the 'metadata_url' is also present.
          Defaults to "" and is always "" for sdists.
        * metadata_url: The URL for the METADATA if the index advertises it.
          Defaults to "" and is always "" for sdists.
        * yanked: True if the anchor contains the "data-yanked" attribute.
    """

    # Everything before the first anchor is the page header; every following
    # chunk starts right after the opening quote of one href value.
    chunks = content.split("<a href=\"")

    version_text = chunks[0].partition("name=\"pypi:repository-version\" content=\"")[2]
    version_text = version_text.partition("\"")[0]
    if not version_text:
        # Clients must assume 1.0 when the meta tag is absent.
        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
        version_text = "1.0"
    api_version = tuple([int(part) for part in version_text.split(".")])

    if api_version >= (2, 0):
        # A new major version is not expected here, but guard against it anyway.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    sdists = {}
    whls = {}

    # Each chunk follows the pattern
    # https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
    for chunk in chunks[1:]:
        link, _, rest = chunk.partition("#sha256=")
        digest, _, rest = rest.partition("\"")
        attributes, _, filename = rest.rpartition("</a>")[0].rpartition(">")
        is_wheel = filename.endswith(".whl")

        metadata_digest = ""
        metadata_link = ""
        if is_wheel:
            # Implement https://peps.python.org/pep-0714/ - the new attribute
            # name takes precedence over the legacy one.
            for attribute in ["data-core-metadata", "data-dist-info-metadata"]:
                marker = attribute + "=\"sha256="
                if marker in attributes:
                    metadata_digest = attributes.partition(marker)[2].partition("\"")[0]
                    metadata_link = link + ".metadata"
                    break

        entry = struct(
            filename = filename,
            url = _absolute_url(url, link),
            sha256 = digest,
            metadata_sha256 = metadata_digest,
            # An empty link is returned unchanged, so sdists get "".
            metadata_url = _absolute_url(url, metadata_link),
            # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
            yanked = "data-yanked" in chunk,
        )
        if is_wheel:
            whls[digest] = entry
        else:
            sdists[digest] = entry

    return struct(
        sdists = sdists,
        whls = whls,
    )
def _get_root_directory(url):
    """Return the "scheme://host" prefix of a URL.

    Args:
        url(str): A URL containing a "://" separator.

    Returns:
        The scheme and host of the URL joined by "://", without any path.
        Fails if the URL has no "://" separator.
    """
    scheme, separator, remainder = url.partition("://")
    if not separator:
        fail("Invalid URL format")

    # The host runs up to the first "/" after the scheme, or to the end of
    # the URL when there is no path at all.
    host = remainder.partition("/")[0]
    return "{}://{}".format(scheme, host)
def _is_downloadable(url):
    """Tell whether the Bazel downloader would accept the URL.

    This mirrors Bazel's HttpUtils::isUrlSupportedByDownloader.

    Args:
        url(str): The URL to check.

    Returns:
        True if the URL starts with "http://", "https://" or "file://",
        False otherwise.
    """
    for prefix in ["http://", "https://", "file://"]:
        if url.startswith(prefix):
            return True
    return False
def _absolute_url(index_url, candidate):
    """Resolve a link found on an index page against the URL of that page.

    The index URL is always treated as a directory, whether or not it ends
    with a "/".

    Args:
        index_url(str): The URL of the index page the link was found on.
        candidate(str): The link to resolve. It may be empty, an already
            downloadable URL, a host-absolute path ("/...") or a path
            relative to the index, optionally starting with "../" segments.

    Returns:
        The candidate unchanged if it is empty or already downloadable,
        otherwise a URL built from the index URL and the candidate.
    """
    if candidate == "" or _is_downloadable(candidate):
        return candidate

    if candidate.startswith("/"):
        # Host-absolute path: keep only the scheme and host of the index.
        return "{}{}".format(_get_root_directory(index_url), candidate)

    # Count only the leading ".." path segments. Splitting the whole link on
    # ".." would also split inside file names that happen to contain two dots
    # and would drop everything between non-leading up-references.
    segments = candidate.split("/")
    up_levels = 0
    for segment in segments:
        if segment != "..":
            break
        up_levels += 1

    if up_levels > 0:
        # Relative path with up-references: drop one trailing path component
        # of the index URL per leading "..".
        base = index_url
        for _ in range(up_levels):
            base = base.rstrip("/").rpartition("/")[0]
        remainder = "/".join(segments[up_levels:])
        return "{}/{}".format(base, remainder.strip("/"))

    # Relative path without up-references. Strip the trailing slash of the
    # index URL so that the join does not produce a double slash.
    return "{}/{}".format(index_url.rstrip("/"), candidate)