diff options
author | Seth Michael Larson <sethmichaellarson@gmail.com> | 2023-12-07 16:01:58 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-07 16:01:58 (GMT) |
commit | 21221c398f6d89b2d9295895d8a2fd71d28138fa (patch) | |
tree | 9e0093ac731152e9988c4d28259b4b7110fec79f /Tools/build | |
parent | 2d76be251d0aee89f76e6fa5a63fa1ad3f2b76cf (diff) | |
download | cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.zip cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.tar.gz cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.tar.bz2 |
gh-112302: Add Software Bill-of-Materials (SBOM) tracking for dependencies (#112303)
Diffstat (limited to 'Tools/build')
-rw-r--r-- | Tools/build/generate_sbom.py | 179 | ||||
-rw-r--r-- | Tools/build/mypy.ini | 13 |
2 files changed, 192 insertions, 0 deletions
diff --git a/Tools/build/generate_sbom.py b/Tools/build/generate_sbom.py new file mode 100644 index 0000000..0089db8 --- /dev/null +++ b/Tools/build/generate_sbom.py @@ -0,0 +1,179 @@ +"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies""" + +import re +import hashlib +import json +import glob +import pathlib +import subprocess +import typing + +# Before adding a new entry to this list, double check that +# the license expression is a valid SPDX license expression: +# See: https://spdx.org/licenses +ALLOWED_LICENSE_EXPRESSIONS = { + "MIT", + "CC0-1.0", + "Apache-2.0", + "BSD-2-Clause", +} + +# Properties which are required for our purposes. +REQUIRED_PROPERTIES_PACKAGE = frozenset([ + "SPDXID", + "name", + "versionInfo", + "downloadLocation", + "checksums", + "licenseConcluded", + "externalRefs", + "originator", + "primaryPackagePurpose", +]) + + +class PackageFiles(typing.NamedTuple): + """Structure for describing the files of a package""" + include: list[str] + exclude: list[str] | None = None + + +# SBOMS don't have a method to specify the sources of files +# so we need to do that external to the SBOM itself. Add new +# values to 'exclude' if we create new files within tracked +# directories that aren't sourced from third-party packages. +PACKAGE_TO_FILES = { + "mpdecimal": PackageFiles( + include=["Modules/_decimal/libmpdec/**"] + ), + "expat": PackageFiles( + include=["Modules/expat/**"] + ), + "pip": PackageFiles( + include=["Lib/ensurepip/_bundled/pip-23.3.1-py3-none-any.whl"] + ), + "macholib": PackageFiles( + include=["Lib/ctypes/macholib/**"], + exclude=[ + "Lib/ctypes/macholib/README.ctypes", + "Lib/ctypes/macholib/fetch_macholib", + "Lib/ctypes/macholib/fetch_macholib.bat", + ], + ), + "libb2": PackageFiles( + include=["Modules/_blake2/impl/**"] + ), + "hacl-star": PackageFiles( + include=["Modules/_hacl/**"], + exclude=[ + "Modules/_hacl/refresh.sh", + "Modules/_hacl/README.md", + "Modules/_hacl/python_hacl_namespace.h", + ] + ), +} + + +def spdx_id(value: str) -> str: + """Encode a value into characters that are valid in an SPDX ID""" + return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) + + +def filter_gitignored_paths(paths: list[str]) -> list[str]: + """ + Filter out paths excluded by the gitignore file. + The output of 'git check-ignore --non-matching --verbose' looks + like this for non-matching (included) files: + + '::<whitespace><path>' + + And looks like this for matching (excluded) files: + + '.gitignore:9:*.a Tools/lib.a' + """ + # Filter out files in gitignore. + # Non-matching files show up as '::<whitespace><path>' + git_check_ignore_proc = subprocess.run( + ["git", "check-ignore", "--verbose", "--non-matching", *paths], + check=False, + stdout=subprocess.PIPE, + ) + # 1 means matches, 0 means no matches. + assert git_check_ignore_proc.returncode in (0, 1) + + # Return the list of paths sorted + git_check_ignore_lines = git_check_ignore_proc.stdout.decode().splitlines() + return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")]) + + +def main() -> None: + root_dir = pathlib.Path(__file__).parent.parent.parent + sbom_path = root_dir / "Misc/sbom.spdx.json" + sbom_data = json.loads(sbom_path.read_bytes()) + + # Make a bunch of assertions about the SBOM data to ensure it's consistent. + assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES) + for package in sbom_data["packages"]: + + # Properties and ID must be properly formed. + assert set(package.keys()) == REQUIRED_PROPERTIES_PACKAGE + assert package["SPDXID"] == spdx_id(f"SPDXRef-PACKAGE-{package['name']}") + + # Version must be in the download and external references. + version = package["versionInfo"] + assert version in package["downloadLocation"] + assert all(version in ref["referenceLocator"] for ref in package["externalRefs"]) + + # License must be on the approved list for SPDX. + assert package["licenseConcluded"] in ALLOWED_LICENSE_EXPRESSIONS, package["licenseConcluded"] + + # Regenerate file information from current data. + sbom_files = [] + sbom_relationships = [] + + # We call 'sorted()' here a lot to avoid filesystem scan order issues. + for name, files in sorted(PACKAGE_TO_FILES.items()): + package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}") + exclude = files.exclude or () + for include in sorted(files.include): + + # Find all the paths and then filter them through .gitignore. + paths = glob.glob(include, root_dir=root_dir, recursive=True) + paths = filter_gitignored_paths(paths) + assert paths, include # Make sure that every value returns something! + + for path in paths: + # Skip directories and excluded files + if not (root_dir / path).is_file() or path in exclude: + continue + + # SPDX requires SHA1 to be used for files, but we provide SHA256 too. + data = (root_dir / path).read_bytes() + checksum_sha1 = hashlib.sha1(data).hexdigest() + checksum_sha256 = hashlib.sha256(data).hexdigest() + + file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}") + sbom_files.append({ + "SPDXID": file_spdx_id, + "fileName": path, + "checksums": [ + {"algorithm": "SHA1", "checksumValue": checksum_sha1}, + {"algorithm": "SHA256", "checksumValue": checksum_sha256}, + ], + }) + + # Tie each file back to its respective package. + sbom_relationships.append({ + "spdxElementId": package_spdx_id, + "relatedSpdxElement": file_spdx_id, + "relationshipType": "CONTAINS", + }) + + # Update the SBOM on disk + sbom_data["files"] = sbom_files + sbom_data["relationships"] = sbom_relationships + sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True)) + + +if __name__ == "__main__": + main() diff --git a/Tools/build/mypy.ini b/Tools/build/mypy.ini new file mode 100644 index 0000000..cf1dac7 --- /dev/null +++ b/Tools/build/mypy.ini @@ -0,0 +1,13 @@ +[mypy] +files = Tools/build/generate_sbom.py +pretty = True + +# Make sure Python can still be built +# using Python 3.10 for `PYTHON_FOR_REGEN`... +python_version = 3.10 + +# ...And be strict: +strict = True +strict_concatenate = True +enable_error_code = ignore-without-code,redundant-expr,truthy-bool,possibly-undefined +warn_unreachable = True |