author     Seth Michael Larson <seth@python.org>  2024-01-26 09:48:13 (GMT)
committer  GitHub <noreply@github.com>            2024-01-26 09:48:13 (GMT)
commit     582d95e8bb0b78bf1b6b9a12371108b9993d3b84 (patch)
tree       132e4214600aafc63e88be4c244a48c5461443c8 /Tools/build
parent     456e274578dc9863f42ab24d62adc0d8c511b50f (diff)
gh-114250: Fetch metadata for pip and its vendored dependencies from PyPI (#114450)
Diffstat (limited to 'Tools/build')
Tools/build/generate_sbom.py | 263
1 file changed, 213 insertions(+), 50 deletions(-)
diff --git a/Tools/build/generate_sbom.py b/Tools/build/generate_sbom.py
index 317d48f..aceb13f 100644
--- a/Tools/build/generate_sbom.py
+++ b/Tools/build/generate_sbom.py
@@ -8,6 +8,7 @@ import pathlib
import subprocess
import sys
import typing
+import zipfile
from urllib.request import urlopen
CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
@@ -16,10 +17,16 @@ CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
# the license expression is a valid SPDX license expression:
# See: https://spdx.org/licenses
ALLOWED_LICENSE_EXPRESSIONS = {
- "MIT",
- "CC0-1.0",
"Apache-2.0",
+ "Apache-2.0 OR BSD-2-Clause",
"BSD-2-Clause",
+ "BSD-3-Clause",
+ "CC0-1.0",
+ "ISC",
+ "LGPL-2.1-only",
+ "MIT",
+ "MPL-2.0",
+ "Python-2.0.1",
}
# Properties which are required for our purposes.
@@ -31,14 +38,13 @@ REQUIRED_PROPERTIES_PACKAGE = frozenset([
"checksums",
"licenseConcluded",
"externalRefs",
- "originator",
"primaryPackagePurpose",
])
class PackageFiles(typing.NamedTuple):
"""Structure for describing the files of a package"""
- include: list[str]
+ include: list[str] | None
exclude: list[str] | None = None
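
Since `include` can now be None, a package may be registered without any files of its own in the CPython tree. A minimal sketch of the two usages this enables (the glob pattern is illustrative; pip's vendored dependencies ship inside the pip wheel itself):

    # A package whose files live directly in the repository:
    PackageFiles(include=["Lib/ensurepip/_bundled/pip-*.whl"])
    # A vendored dependency with no files of its own on disk:
    PackageFiles(include=None)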
@@ -118,62 +124,209 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
+def fetch_package_metadata_from_pypi(project: str, version: str, filename: str | None = None) -> tuple[str, str] | None:
+ """
+ Fetches the SHA256 checksum and download location from PyPI.
+    If we're given a filename then we match against it, otherwise we use the first wheel.
+ """
+    # Get the project's download location from PyPI along with its checksum.
+ try:
+ raw_text = urlopen(f"https://pypi.org/pypi/{project}/{version}/json").read()
+ release_metadata = json.loads(raw_text)
+ url: dict[str, typing.Any]
+
+        # Look for a matching artifact filename; the caller compares
+        # its remote checksum against the local one.
+ for url in release_metadata["urls"]:
+            # pip only vendors pure-Python dependencies, so there's
+            # no risk of picking the 'wrong' wheel here.
+ if (
+ (filename is None and url["packagetype"] == "bdist_wheel")
+ or (filename is not None and url["filename"] == filename)
+ ):
+ break
+ else:
+            raise ValueError(f"No matching artifact on PyPI for '{filename or project}'")
+
+ # Successfully found the download URL for the matching artifact.
+ download_url = url["url"]
+ checksum_sha256 = url["digests"]["sha256"]
+ return download_url, checksum_sha256
+
+ except (OSError, ValueError) as e:
+ # Fail if we're running in CI where we should have an internet connection.
+ error_if(
+ "CI" in os.environ,
+ f"Couldn't fetch metadata for project '{project}' from PyPI: {e}"
+ )
+ return None
+
+
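
For reference, the PyPI JSON API queried by this helper returns a "urls" list with one entry per uploaded artifact. A standalone sketch of the same lookup (the project and version are illustrative, and network access is assumed):

    import json
    from urllib.request import urlopen

    # Fetch release metadata for an illustrative pip release.
    with urlopen("https://pypi.org/pypi/pip/23.3.2/json") as response:
        release_metadata = json.load(response)

    # Each entry in "urls" describes one uploaded artifact.
    for url in release_metadata["urls"]:
        if url["packagetype"] == "bdist_wheel":  # source dists are "sdist"
            print(url["filename"])                # e.g. pip-23.3.2-py3-none-any.whl
            print(url["url"])                     # hosted download location
            print(url["digests"]["sha256"])       # checksum recorded by PyPI
            break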
+def find_ensurepip_pip_wheel() -> pathlib.Path | None:
+    """Try to find the pip wheel bundled in ensurepip. If missing, return None."""
+
+ ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"
+
+ pip_wheels = []
+ try:
+ for wheel_filename in os.listdir(ensurepip_bundled_dir):
+ if wheel_filename.startswith("pip-"):
+ pip_wheels.append(wheel_filename)
+ else:
+ print(f"Unexpected wheel in ensurepip: '{wheel_filename}'")
+ sys.exit(1)
+
+ # Ignore this error, likely caused by downstream distributors
+ # deleting the 'ensurepip/_bundled' directory.
+ except FileNotFoundError:
+ pass
+
+ if len(pip_wheels) == 0:
+ return None
+ elif len(pip_wheels) > 1:
+ print("Multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
+ sys.exit(1)
+ # Otherwise return the one pip wheel.
+ return ensurepip_bundled_dir / pip_wheels[0]
+
+
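The wheel path returned above is later version-parsed by filename in discover_pip_sbom_package(). Wheel filenames follow PEP 427 ({name}-{version}(-{build})?-{python}-{abi}-{platform}.whl), so the version is always the second dash-separated field; a quick sketch with an illustrative filename:

    wheel_name = "pip-23.3.2-py3-none-any.whl"  # hypothetical bundled wheel
    version = wheel_name.split("-")[1]          # "23.3.2"
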
+def maybe_remove_pip_and_deps_from_sbom(sbom_data: dict[str, typing.Any]) -> None:
+ """
+    Removes pip and its dependencies from the SBOM data
+    when the pip wheel has been removed from ensurepip,
+    as downstream redistributors of Python sometimes do.
+ """
+
+ # If there's a wheel we don't remove anything.
+ if find_ensurepip_pip_wheel() is not None:
+ return
+
+ # Otherwise we traverse the relationships
+ # to find dependent packages to remove.
+ sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")
+ sbom_spdx_ids_to_remove = {sbom_pip_spdx_id}
+
+ # Find all package SPDXIDs that pip depends on.
+ for sbom_relationship in sbom_data["relationships"]:
+ if (
+ sbom_relationship["relationshipType"] == "DEPENDS_ON"
+ and sbom_relationship["spdxElementId"] == sbom_pip_spdx_id
+ ):
+ sbom_spdx_ids_to_remove.add(sbom_relationship["relatedSpdxElement"])
+
+ # Remove all the packages and relationships.
+ sbom_data["packages"] = [
+ sbom_package for sbom_package in sbom_data["packages"]
+ if sbom_package["SPDXID"] not in sbom_spdx_ids_to_remove
+ ]
+ sbom_data["relationships"] = [
+ sbom_relationship for sbom_relationship in sbom_data["relationships"]
+ if sbom_relationship["relatedSpdxElement"] not in sbom_spdx_ids_to_remove
+ ]
+
+
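
The relationship records traversed above have this shape in Misc/sbom.spdx.json; a hedged sketch (the dependency name is illustrative) of one DEPENDS_ON entry that marks a vendored dependency of pip for removal:

    relationship = {
        "spdxElementId": "SPDXRef-PACKAGE-pip",
        "relatedSpdxElement": "SPDXRef-PACKAGE-urllib3",  # illustrative
        "relationshipType": "DEPENDS_ON",
    }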
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
"""pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
automatable to discover the metadata we need like the version and checksums
- so let's do that on behalf of our friends at the PyPA.
+ so let's do that on behalf of our friends at the PyPA. This function also
+ discovers vendored packages within pip and fetches their metadata.
"""
global PACKAGE_TO_FILES
- ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"
- pip_wheels = []
-
- # Find the hopefully one pip wheel in the bundled directory.
- for wheel_filename in os.listdir(ensurepip_bundled_dir):
- if wheel_filename.startswith("pip-"):
- pip_wheels.append(wheel_filename)
- if len(pip_wheels) != 1:
- print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
- sys.exit(1)
- pip_wheel_filename = pip_wheels[0]
+ pip_wheel_filepath = find_ensurepip_pip_wheel()
+ if pip_wheel_filepath is None:
+ return # There's no pip wheel, nothing to discover.
# Add the wheel filename to the list of files so the SBOM file
# and relationship generator can work its magic on the wheel too.
PACKAGE_TO_FILES["pip"] = PackageFiles(
- include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
+ include=[str(pip_wheel_filepath.relative_to(CPYTHON_ROOT_DIR))]
)
# Wheel filename format puts the version right after the project name.
- pip_version = pip_wheel_filename.split("-")[1]
+ pip_version = pip_wheel_filepath.name.split("-")[1]
pip_checksum_sha256 = hashlib.sha256(
- (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
+ pip_wheel_filepath.read_bytes()
).hexdigest()
- # Get pip's download location from PyPI. Check that the checksum is correct too.
- try:
- raw_text = urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json").read()
- pip_release_metadata = json.loads(raw_text)
- url: dict[str, typing.Any]
+ pip_metadata = fetch_package_metadata_from_pypi(
+ project="pip",
+ version=pip_version,
+ filename=pip_wheel_filepath.name,
+ )
+    # We couldn't fetch metadata from PyPI (only possible outside of CI,
+    # since error_if() fails in CI), so skip checksum verification.
+ if pip_metadata is None:
+ return
+
+ pip_download_url, pip_actual_sha256 = pip_metadata
+ if pip_actual_sha256 != pip_checksum_sha256:
+        raise ValueError("Local pip checksum doesn't match artifact on PyPI")
+
+ # Parse 'pip/_vendor/vendor.txt' from the wheel for sub-dependencies.
+ with zipfile.ZipFile(pip_wheel_filepath) as whl:
+ vendor_txt_data = whl.read("pip/_vendor/vendor.txt").decode()
+
+ # With this version regex we're assuming that pip isn't using pre-releases.
+ # If any version doesn't match we get a failure below, so we're safe doing this.
+ version_pin_re = re.compile(r"^([a-zA-Z0-9_.-]+)==([0-9.]*[0-9])$")
+ sbom_pip_dependency_spdx_ids = set()
+ for line in vendor_txt_data.splitlines():
+ line = line.partition("#")[0].strip() # Strip comments and whitespace.
+ if not line: # Skip empty lines.
+ continue
+
+        # Every non-empty line must match the version pin regex.
+ match = version_pin_re.match(line)
+ error_if(match is None, f"Couldn't parse line from pip vendor.txt: '{line}'")
+ assert match is not None # Make mypy happy.
+
+ # Parse out and normalize the project name.
+ project_name, project_version = match.groups()
+ project_name = project_name.lower()
+
+        # pip's metadata fetch succeeded (we returned early otherwise),
+        # so this request should also succeed.
+ project_metadata = (
+ fetch_package_metadata_from_pypi(project_name, project_version)
+ )
+ assert project_metadata is not None
+ project_download_url, project_checksum_sha256 = project_metadata
+
+ # Update our SBOM data with what we received from PyPI.
+ # Don't overwrite any existing values.
+ sbom_project_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{project_name}")
+ sbom_pip_dependency_spdx_ids.add(sbom_project_spdx_id)
+ for package in sbom_data["packages"]:
+ if package["SPDXID"] != sbom_project_spdx_id:
+ continue
- # Look for a matching artifact filename and then check
- # its remote checksum to the local one.
- for url in pip_release_metadata["urls"]:
- if url["filename"] == pip_wheel_filename:
+            # The only field missing from this blob is `licenseConcluded`,
+            # which must be triaged by human maintainers if the list changes.
+ package.update({
+ "SPDXID": sbom_project_spdx_id,
+ "name": project_name,
+ "versionInfo": project_version,
+ "downloadLocation": project_download_url,
+ "checksums": [
+ {"algorithm": "SHA256", "checksumValue": project_checksum_sha256}
+ ],
+ "externalRefs": [
+ {
+ "referenceCategory": "PACKAGE_MANAGER",
+ "referenceLocator": f"pkg:pypi/{project_name}@{project_version}",
+ "referenceType": "purl",
+ },
+ ],
+ "primaryPackagePurpose": "SOURCE"
+ })
break
- else:
- raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
- if url["digests"]["sha256"] != pip_checksum_sha256:
- raise ValueError(f"Local pip checksum doesn't match artifact on PyPI")
-
- # Successfully found the download URL for the matching artifact.
- pip_download_url = url["url"]
- except (OSError, ValueError) as e:
- print(f"Couldn't fetch pip's metadata from PyPI: {e}")
- sys.exit(1)
+ PACKAGE_TO_FILES[project_name] = PackageFiles(include=None)
# Remove pip from the existing SBOM packages if it's there
# and then overwrite its entry with our own generated one.
+ sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")
sbom_data["packages"] = [
sbom_package
for sbom_package in sbom_data["packages"]
@@ -181,7 +334,7 @@ def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
]
sbom_data["packages"].append(
{
- "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
+ "SPDXID": sbom_pip_spdx_id,
"name": "pip",
"versionInfo": pip_version,
"originator": "Organization: Python Packaging Authority",
@@ -205,12 +358,27 @@ def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
"primaryPackagePurpose": "SOURCE",
}
)
+ for sbom_dep_spdx_id in sorted(sbom_pip_dependency_spdx_ids):
+ sbom_data["relationships"].append({
+ "spdxElementId": sbom_pip_spdx_id,
+ "relatedSpdxElement": sbom_dep_spdx_id,
+ "relationshipType": "DEPENDS_ON"
+ })
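
The vendor.txt parsing in this function expects simple "name==version" pins with optional comments. A self-contained sketch of the same loop against sample content (the pins shown are illustrative):

    import re

    # Same pin regex as above; assumes pip never vendors pre-releases.
    version_pin_re = re.compile(r"^([a-zA-Z0-9_.-]+)==([0-9.]*[0-9])$")

    sample_vendor_txt = """\
    CacheControl==0.13.1  # comments after pins are stripped
    urllib3==1.26.17
    """
    for line in sample_vendor_txt.splitlines():
        line = line.partition("#")[0].strip()
        if not line:
            continue
        name, version = version_pin_re.match(line).groups()
        print(name.lower(), version)  # cachecontrol 0.13.1, urllib3 1.26.17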
def main() -> None:
sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
sbom_data = json.loads(sbom_path.read_bytes())
+    # Remove pip and its dependencies if the pip wheel is missing.
+    # This must happen before the relationship data is reset below.
+ maybe_remove_pip_and_deps_from_sbom(sbom_data)
+
+    # Regenerate the file and relationship data from scratch.
+    # Package information is preserved since it's edited by humans.
+ sbom_data["files"] = []
+ sbom_data["relationships"] = []
+
# Insert pip's SBOM metadata from the wheel.
discover_pip_sbom_package(sbom_data)
@@ -227,9 +395,10 @@ def main() -> None:
"name" not in package,
"Package is missing the 'name' field"
)
+ missing_required_keys = REQUIRED_PROPERTIES_PACKAGE - set(package.keys())
error_if(
- set(package.keys()) != REQUIRED_PROPERTIES_PACKAGE,
- f"Package '{package['name']}' is missing required fields",
+ bool(missing_required_keys),
+ f"Package '{package['name']}' is missing required fields: {missing_required_keys}",
)
error_if(
package["SPDXID"] != spdx_id(f"SPDXRef-PACKAGE-{package['name']}"),
@@ -257,15 +426,11 @@ def main() -> None:
f"License identifier '{license_concluded}' not in SBOM tool allowlist"
)
- # Regenerate file information from current data.
- sbom_files = []
- sbom_relationships = []
-
# We call 'sorted()' here a lot to avoid filesystem scan order issues.
for name, files in sorted(PACKAGE_TO_FILES.items()):
package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
exclude = files.exclude or ()
- for include in sorted(files.include):
+ for include in sorted(files.include or ()):
# Find all the paths and then filter them through .gitignore.
paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
paths = filter_gitignored_paths(paths)
@@ -285,7 +450,7 @@ def main() -> None:
checksum_sha256 = hashlib.sha256(data).hexdigest()
file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
- sbom_files.append({
+ sbom_data["files"].append({
"SPDXID": file_spdx_id,
"fileName": path,
"checksums": [
@@ -295,15 +460,13 @@ def main() -> None:
})
# Tie each file back to its respective package.
- sbom_relationships.append({
+ sbom_data["relationships"].append({
"spdxElementId": package_spdx_id,
"relatedSpdxElement": file_spdx_id,
"relationshipType": "CONTAINS",
})
# Update the SBOM on disk
- sbom_data["files"] = sbom_files
- sbom_data["relationships"] = sbom_relationships
sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))
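
After regeneration, each tracked file contributes one "files" entry plus a CONTAINS relationship tying it back to its package. A hedged sketch of the resulting shape (the SPDXID munging, path, and digest are illustrative):

    sbom_file = {
        "SPDXID": "SPDXRef-FILE-...",  # derived from the path by spdx_id()
        "fileName": "Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl",
        "checksums": [
            {"algorithm": "SHA256", "checksumValue": "<hex digest>"},
        ],
    }
    contains_relationship = {
        "spdxElementId": "SPDXRef-PACKAGE-pip",
        "relatedSpdxElement": sbom_file["SPDXID"],
        "relationshipType": "CONTAINS",
    }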