diff options
-rw-r--r-- | Misc/sbom.spdx.json | 2 | ||||
-rw-r--r-- | Tools/build/generate_sbom.py | 116 |
2 files changed, 107 insertions, 11 deletions
diff --git a/Misc/sbom.spdx.json b/Misc/sbom.spdx.json index 81f8486..5b3cd04 100644 --- a/Misc/sbom.spdx.json +++ b/Misc/sbom.spdx.json @@ -1700,7 +1700,7 @@ "checksums": [ { "algorithm": "SHA256", - "checksumValue": "7ccf472345f20d35bdc9d1841ff5f313260c2c33fe417f48c30ac46cccabf5be" + "checksumValue": "5052d7889c1f9d05224cd41741acb7c5d6fa735ab34e339624a614eaaa7e7d76" } ], "downloadLocation": "https://files.pythonhosted.org/packages/15/aa/3f4c7bcee2057a76562a5b33ecbd199be08cdb4443a02e26bd2c3cf6fc39/pip-23.3.2-py3-none-any.whl", diff --git a/Tools/build/generate_sbom.py b/Tools/build/generate_sbom.py index c02eb88..93d0d8a 100644 --- a/Tools/build/generate_sbom.py +++ b/Tools/build/generate_sbom.py @@ -1,12 +1,16 @@ """Tool for generating Software Bill of Materials (SBOM) for Python's dependencies""" - +import os import re import hashlib import json import glob import pathlib import subprocess +import sys import typing +from urllib.request import urlopen + +CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent # Before adding a new entry to this list, double check that # the license expression is a valid SPDX license expression: @@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple): # values to 'exclude' if we create new files within tracked # directories that aren't sourced from third-party packages. PACKAGE_TO_FILES = { + # NOTE: pip's entry in this structure is automatically generated in + # the 'discover_pip_sbom_package()' function below. "mpdecimal": PackageFiles( include=["Modules/_decimal/libmpdec/**"] ), "expat": PackageFiles( include=["Modules/expat/**"] ), - "pip": PackageFiles( - include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"] - ), "macholib": PackageFiles( include=["Lib/ctypes/macholib/**"], exclude=[ @@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]: return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")]) +def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None: + """pip is a part of a packaging ecosystem (Python, surprise!) so it's actually + automatable to discover the metadata we need like the version and checksums + so let's do that on behalf of our friends at the PyPA. + """ + global PACKAGE_TO_FILES + + ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled" + pip_wheels = [] + + # Find the hopefully one pip wheel in the bundled directory. + for wheel_filename in os.listdir(ensurepip_bundled_dir): + if wheel_filename.startswith("pip-"): + pip_wheels.append(wheel_filename) + if len(pip_wheels) != 1: + print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'") + sys.exit(1) + pip_wheel_filename = pip_wheels[0] + + # Add the wheel filename to the list of files so the SBOM file + # and relationship generator can work its magic on the wheel too. + PACKAGE_TO_FILES["pip"] = PackageFiles( + include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"] + ) + + # Wheel filename format puts the version right after the project name. + pip_version = pip_wheel_filename.split("-")[1] + pip_checksum_sha256 = hashlib.sha256( + (ensurepip_bundled_dir / pip_wheel_filename).read_bytes() + ).hexdigest() + + # Get pip's download location from PyPI. Check that the checksum is correct too. + try: + raw_text = urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json").read() + pip_release_metadata = json.loads(raw_text) + url: dict[str, typing.Any] + + # Look for a matching artifact filename and then check + # its remote checksum to the local one. + for url in pip_release_metadata["urls"]: + if url["filename"] == pip_wheel_filename: + break + else: + raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'") + if url["digests"]["sha256"] != pip_checksum_sha256: + raise ValueError(f"Local pip checksum doesn't match artifact on PyPI") + + # Successfully found the download URL for the matching artifact. + pip_download_url = url["url"] + + except (OSError, ValueError) as e: + print(f"Couldn't fetch pip's metadata from PyPI: {e}") + sys.exit(1) + + # Remove pip from the existing SBOM packages if it's there + # and then overwrite its entry with our own generated one. + sbom_data["packages"] = [ + sbom_package + for sbom_package in sbom_data["packages"] + if sbom_package["name"] != "pip" + ] + sbom_data["packages"].append( + { + "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"), + "name": "pip", + "versionInfo": pip_version, + "originator": "Organization: Python Packaging Authority", + "licenseConcluded": "MIT", + "downloadLocation": pip_download_url, + "checksums": [ + {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256} + ], + "externalRefs": [ + { + "referenceCategory": "SECURITY", + "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*", + "referenceType": "cpe23Type", + }, + { + "referenceCategory": "PACKAGE_MANAGER", + "referenceLocator": f"pkg:pypi/pip@{pip_version}", + "referenceType": "purl", + }, + ], + "primaryPackagePurpose": "SOURCE", + } + ) + + def main() -> None: - root_dir = pathlib.Path(__file__).parent.parent.parent - sbom_path = root_dir / "Misc/sbom.spdx.json" + sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json" sbom_data = json.loads(sbom_path.read_bytes()) - # Make a bunch of assertions about the SBOM data to ensure it's consistent. + # Insert pip's SBOM metadata from the wheel. + discover_pip_sbom_package(sbom_data) + + # Ensure all packages in this tool are represented also in the SBOM file. assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES) + + # Make a bunch of assertions about the SBOM data to ensure it's consistent. for package in sbom_data["packages"]: # Properties and ID must be properly formed. @@ -138,17 +234,17 @@ def main() -> None: for include in sorted(files.include): # Find all the paths and then filter them through .gitignore. - paths = glob.glob(include, root_dir=root_dir, recursive=True) + paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True) paths = filter_gitignored_paths(paths) assert paths, include # Make sure that every value returns something! for path in paths: # Skip directories and excluded files - if not (root_dir / path).is_file() or path in exclude: + if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude: continue # SPDX requires SHA1 to be used for files, but we provide SHA256 too. - data = (root_dir / path).read_bytes() + data = (CPYTHON_ROOT_DIR / path).read_bytes() checksum_sha1 = hashlib.sha1(data).hexdigest() checksum_sha256 = hashlib.sha256(data).hexdigest() |