python/private/pypi/parse_simpleapi_html.bzl - platform/external/bazelbuild-rules_python - Git at Google

 # Copyright 2024 The Bazel Authors. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 Parse SimpleAPI HTML in Starlark.
 """

 def parse_simpleapi_html(*, url, content):
     """Get the package URLs for given shas by parsing the Simple API HTML.

     Args:
         url(str): The URL that the HTML content can be downloaded from.
         content(str): The Simple API HTML content.

     Returns:
         A list of structs with:
         * filename: The filename of the artifact.
         * url: The URL to download the artifact.
         * sha256: The sha256 of the artifact.
         * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
           present, then the 'metadata_url' is also present. Defaults to "".
         * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
     """
     sdists = {}
     whls = {}
     lines = content.split("<a href=\"")

     _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
     api_version, _, _ = api_version.partition("\"")

     # We must assume the 1.0 if it is not present
     # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
     api_version = api_version or "1.0"
     api_version = tuple([int(i) for i in api_version.split(".")])

     if api_version >= (2, 0):
         # We don't expect to have version 2.0 here, but have this check in place just in case.
         # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
         fail("Unsupported API version: {}".format(api_version))

     # Each line follows the following pattern
     # <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
     for line in lines[1:]:
         dist_url, _, tail = line.partition("#sha256=")
         sha256, _, tail = tail.partition("\"")

         # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
         yanked = "data-yanked" in line

         head, _, _ = tail.rpartition("</a>")
         maybe_metadata, _, filename = head.rpartition(">")

         metadata_sha256 = ""
         metadata_url = ""
         for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
             metadata_marker = metadata_marker + "=\"sha256="
             if metadata_marker in maybe_metadata:
                 # Implement https://peps.python.org/pep-0714/
                 _, _, tail = maybe_metadata.partition(metadata_marker)
                 metadata_sha256, _, _ = tail.partition("\"")
                 metadata_url = dist_url + ".metadata"
                 break

         if filename.endswith(".whl"):
             whls[sha256] = struct(
                 filename = filename,
                 url = _absolute_url(url, dist_url),
                 sha256 = sha256,
                 metadata_sha256 = metadata_sha256,
                 metadata_url = _absolute_url(url, metadata_url) if metadata_url else "",
                 yanked = yanked,
             )
         else:
             sdists[sha256] = struct(
                 filename = filename,
                 url = _absolute_url(url, dist_url),
                 sha256 = sha256,
                 metadata_sha256 = "",
                 metadata_url = "",
                 yanked = yanked,
             )

     return struct(
         sdists = sdists,
         whls = whls,
     )

 def _get_root_directory(url):
     scheme_end = url.find("://")
     if scheme_end == -1:
         fail("Invalid URL format")

     scheme = url[:scheme_end]
     host_end = url.find("/", scheme_end + 3)
     if host_end == -1:
         host_end = len(url)
     host = url[scheme_end + 3:host_end]

     return "{}://{}".format(scheme, host)

 def _is_downloadable(url):
     """Checks if the URL would be accepted by the Bazel downloader.

     This is based on Bazel's HttpUtils::isUrlSupportedByDownloader
     """
     return url.startswith("http://") or url.startswith("https://") or url.startswith("file://")

 def _absolute_url(index_url, candidate):
     if candidate == "":
         return candidate

     if _is_downloadable(candidate):
         return candidate

     if candidate.startswith("/"):
         # absolute path
         root_directory = _get_root_directory(index_url)
         return "{}{}".format(root_directory, candidate)

     if candidate.startswith(".."):
         # relative path with up references
         candidate_parts = candidate.split("..")
         last = candidate_parts[-1]
         for _ in range(len(candidate_parts) - 1):
             index_url, _, _ = index_url.rstrip("/").rpartition("/")

         return "{}/{}".format(index_url, last.strip("/"))

     # relative path without up-references
     return "{}/{}".format(index_url, candidate)
	# Copyright 2024 The Bazel Authors. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	Parse SimpleAPI HTML in Starlark.
	"""

	def parse_simpleapi_html(*, url, content):
	"""Get the package URLs for given shas by parsing the Simple API HTML.

	Args:
	url(str): The URL that the HTML content can be downloaded from.
	content(str): The Simple API HTML content.

	Returns:
	A list of structs with:
	* filename: The filename of the artifact.
	* url: The URL to download the artifact.
	* sha256: The sha256 of the artifact.
	* metadata_sha256: The whl METADATA sha256 if we can download it. If this is
	present, then the 'metadata_url' is also present. Defaults to "".
	* metadata_url: The URL for the METADATA if we can download it. Defaults to "".
	"""
	sdists = {}
	whls = {}
	lines = content.split("<a href=\"")

	_, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
	api_version, _, _ = api_version.partition("\"")

	# We must assume the 1.0 if it is not present
	# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
	api_version = api_version or "1.0"
	api_version = tuple([int(i) for i in api_version.split(".")])

	if api_version >= (2, 0):
	# We don't expect to have version 2.0 here, but have this check in place just in case.
	# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
	fail("Unsupported API version: {}".format(api_version))

	# Each line follows the following pattern
	# <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
	for line in lines[1:]:
	dist_url, _, tail = line.partition("#sha256=")
	sha256, _, tail = tail.partition("\"")

	# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
	yanked = "data-yanked" in line

	head, _, _ = tail.rpartition("</a>")
	maybe_metadata, _, filename = head.rpartition(">")

	metadata_sha256 = ""
	metadata_url = ""
	for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
	metadata_marker = metadata_marker + "=\"sha256="
	if metadata_marker in maybe_metadata:
	# Implement https://peps.python.org/pep-0714/
	_, _, tail = maybe_metadata.partition(metadata_marker)
	metadata_sha256, _, _ = tail.partition("\"")
	metadata_url = dist_url + ".metadata"
	break

	if filename.endswith(".whl"):
	whls[sha256] = struct(
	filename = filename,
	url = _absolute_url(url, dist_url),
	sha256 = sha256,
	metadata_sha256 = metadata_sha256,
	metadata_url = _absolute_url(url, metadata_url) if metadata_url else "",
	yanked = yanked,
	)
	else:
	sdists[sha256] = struct(
	filename = filename,
	url = _absolute_url(url, dist_url),
	sha256 = sha256,
	metadata_sha256 = "",
	metadata_url = "",
	yanked = yanked,
	)

	return struct(
	sdists = sdists,
	whls = whls,
	)

	def _get_root_directory(url):
	scheme_end = url.find("://")
	if scheme_end == -1:
	fail("Invalid URL format")

	scheme = url[:scheme_end]
	host_end = url.find("/", scheme_end + 3)
	if host_end == -1:
	host_end = len(url)
	host = url[scheme_end + 3:host_end]

	return "{}://{}".format(scheme, host)

	def _is_downloadable(url):
	"""Checks if the URL would be accepted by the Bazel downloader.

	This is based on Bazel's HttpUtils::isUrlSupportedByDownloader
	"""
	return url.startswith("http://") or url.startswith("https://") or url.startswith("file://")

	def _absolute_url(index_url, candidate):
	if candidate == "":
	return candidate

	if _is_downloadable(candidate):
	return candidate

	if candidate.startswith("/"):
	# absolute path
	root_directory = _get_root_directory(index_url)
	return "{}{}".format(root_directory, candidate)

	if candidate.startswith(".."):
	# relative path with up references
	candidate_parts = candidate.split("..")
	last = candidate_parts[-1]
	for _ in range(len(candidate_parts) - 1):
	index_url, _, _ = index_url.rstrip("/").rpartition("/")

	return "{}/{}".format(index_url, last.strip("/"))

	# relative path without up-references
	return "{}/{}".format(index_url, candidate)