summary | refs | log | tree | commit | diff | stats
path: root/Tools/build
diff options
context:
space:
mode:
author Seth Michael Larson <sethmichaellarson@gmail.com> 2023-12-07 16:01:58 (GMT)
committer GitHub <noreply@github.com> 2023-12-07 16:01:58 (GMT)
commit 21221c398f6d89b2d9295895d8a2fd71d28138fa (patch)
tree 9e0093ac731152e9988c4d28259b4b7110fec79f /Tools/build
parent 2d76be251d0aee89f76e6fa5a63fa1ad3f2b76cf (diff)
download cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.zip
cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.tar.gz
cpython-21221c398f6d89b2d9295895d8a2fd71d28138fa.tar.bz2
gh-112302: Add Software Bill-of-Materials (SBOM) tracking for dependencies (#112303)
Diffstat (limited to 'Tools/build')
-rw-r--r-- Tools/build/generate_sbom.py 179
-rw-r--r-- Tools/build/mypy.ini 13
2 files changed, 192 insertions, 0 deletions
diff --git a/Tools/build/generate_sbom.py b/Tools/build/generate_sbom.py
new file mode 100644
index 0000000..0089db8
--- /dev/null
+++ b/Tools/build/generate_sbom.py
@@ -0,0 +1,179 @@
+"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
+
+import re
+import hashlib
+import json
+import glob
+import pathlib
+import subprocess
+import typing
+
+# License expressions accepted for the packages listed in the SBOM.
+# Every entry must be a valid SPDX license expression; verify any
+# new value against https://spdx.org/licenses before adding it here.
+ALLOWED_LICENSE_EXPRESSIONS = {
+    "Apache-2.0",
+    "BSD-2-Clause",
+    "CC0-1.0",
+    "MIT",
+}
+
+# The properties each package entry in the SBOM is required
+# to have for our purposes.
+REQUIRED_PROPERTIES_PACKAGE = frozenset({
+    "SPDXID",
+    "name",
+    "versionInfo",
+    "downloadLocation",
+    "checksums",
+    "licenseConcluded",
+    "externalRefs",
+    "originator",
+    "primaryPackagePurpose",
+})
+
+
+class PackageFiles(typing.NamedTuple):
+    """Describes which files in the repository belong to a package"""
+
+    # Patterns selecting the package's files.
+    include: list[str]
+
+    # Paths to leave out even if selected by 'include';
+    # None (the default) means nothing is left out.
+    exclude: list[str] | None = None
+
+
+# SBOMs have no way of recording where a package's files come from,
+# so that mapping is kept here, outside of the SBOM itself. Whenever
+# a file that isn't sourced from the third-party package is created
+# inside a tracked directory, add it to that package's 'exclude'.
+PACKAGE_TO_FILES = {
+    "mpdecimal": PackageFiles(include=["Modules/_decimal/libmpdec/**"]),
+    "expat": PackageFiles(include=["Modules/expat/**"]),
+    "pip": PackageFiles(
+        include=["Lib/ensurepip/_bundled/pip-23.3.1-py3-none-any.whl"],
+    ),
+    "macholib": PackageFiles(
+        include=["Lib/ctypes/macholib/**"],
+        exclude=[
+            "Lib/ctypes/macholib/README.ctypes",
+            "Lib/ctypes/macholib/fetch_macholib",
+            "Lib/ctypes/macholib/fetch_macholib.bat",
+        ],
+    ),
+    "libb2": PackageFiles(include=["Modules/_blake2/impl/**"]),
+    "hacl-star": PackageFiles(
+        include=["Modules/_hacl/**"],
+        exclude=[
+            "Modules/_hacl/refresh.sh",
+            "Modules/_hacl/README.md",
+            "Modules/_hacl/python_hacl_namespace.h",
+        ],
+    ),
+}
+
+
+def spdx_id(value: str) -> str:
+    """Return 'value' with every run of characters that isn't valid in an SPDX ID replaced by '-'"""
+    # Only ASCII letters, digits, '.' and '-' are kept as-is.
+    invalid_run = re.compile(r"[^a-zA-Z0-9.\-]+")
+    return invalid_run.sub("-", value)
+
+
+def filter_gitignored_paths(paths: list[str]) -> list[str]:
+    """
+    Filter out paths excluded by the gitignore file.
+
+    'git check-ignore' is run from the repository root (three
+    directories above this file), so 'paths' are interpreted relative
+    to that root. Returns the paths that are not ignored, sorted.
+
+    The output of 'git check-ignore --non-matching --verbose' looks
+    like this for non-matching (included) files:
+
+        '::<whitespace><path>'
+
+    And looks like this for matching (excluded) files:
+
+        '.gitignore:9:*.a    Tools/lib.a'
+    """
+    # 'git check-ignore' fails with a fatal error when it is given
+    # no paths at all, and there is nothing to filter anyway.
+    if not paths:
+        return []
+
+    # main() globs relative to the repository root, so git has to run
+    # from there too instead of from the caller's working directory.
+    root_dir = pathlib.Path(__file__).parent.parent.parent
+    git_check_ignore_proc = subprocess.run(
+        ["git", "check-ignore", "--verbose", "--non-matching", *paths],
+        cwd=root_dir,
+        check=False,
+        stdout=subprocess.PIPE,
+    )
+    # 0 means at least one path is ignored, 1 means none of them are.
+    # Anything else (such as 128) is a fatal git error.
+    assert git_check_ignore_proc.returncode in (0, 1), git_check_ignore_proc.returncode
+
+    # Non-matching files show up as '::<whitespace><path>'. Take everything
+    # after the separator as the path so paths containing spaces stay intact.
+    included = []
+    for line in git_check_ignore_proc.stdout.decode().splitlines():
+        if not line.startswith("::"):
+            continue
+        path = line.removeprefix("::").lstrip()
+        # git may wrap a path in double quotes; drop the quotes.
+        # NOTE(review): escape sequences inside a quoted path are not decoded.
+        if len(path) >= 2 and path.startswith('"') and path.endswith('"'):
+            path = path[1:-1]
+        included.append(path)
+
+    # Return the list of paths sorted
+    return sorted(included)
+
+
+def main() -> None:
+    """
+    Regenerate the 'files' and 'relationships' sections of Misc/sbom.spdx.json.
+
+    Loads the SBOM from the repository root, asserts that its package
+    entries are consistent with PACKAGE_TO_FILES, REQUIRED_PROPERTIES_PACKAGE
+    and ALLOWED_LICENSE_EXPRESSIONS, hashes every file of every package that
+    is neither gitignored nor excluded, and writes the SBOM back to disk.
+
+    Raises AssertionError when any of the consistency checks fail.
+    """
+    root_dir = pathlib.Path(__file__).parent.parent.parent
+    sbom_path = root_dir / "Misc/sbom.spdx.json"
+    sbom_data = json.loads(sbom_path.read_bytes())
+
+    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
+    assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
+    for package in sbom_data["packages"]:
+
+        # Properties and ID must be properly formed.
+        assert set(package.keys()) == REQUIRED_PROPERTIES_PACKAGE
+        assert package["SPDXID"] == spdx_id(f"SPDXRef-PACKAGE-{package['name']}")
+
+        # Version must be in the download and external references.
+        version = package["versionInfo"]
+        assert version in package["downloadLocation"]
+        assert all(version in ref["referenceLocator"] for ref in package["externalRefs"])
+
+        # License must be on the approved list for SPDX.
+        assert package["licenseConcluded"] in ALLOWED_LICENSE_EXPRESSIONS, package["licenseConcluded"]
+
+    # Regenerate file information from current data.
+    sbom_files = []
+    sbom_relationships = []
+
+    # We call 'sorted()' here a lot to avoid filesystem scan order issues.
+    for name, files in sorted(PACKAGE_TO_FILES.items()):
+        package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
+        exclude = set(files.exclude or ())
+        for include in sorted(files.include):
+
+            # Find all the paths. Check the glob matched something before
+            # asking git, so a stale pattern is reported by name.
+            globbed = glob.glob(include, root_dir=root_dir, recursive=True)
+            assert globbed, include
+
+            # Then filter them through .gitignore.
+            tracked = filter_gitignored_paths(globbed)
+            assert tracked, include  # Make sure that every value returns something!
+
+            # Normalize to forward slashes so that the 'exclude' entries match
+            # and the SBOM is the same no matter which platform generated it.
+            # Sort again afterwards because normalizing can change the order.
+            paths = sorted(pathlib.PureWindowsPath(p).as_posix() for p in tracked)
+
+            for path in paths:
+                # Skip excluded files and directories
+                if path in exclude or not (root_dir / path).is_file():
+                    continue
+
+                # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
+                data = (root_dir / path).read_bytes()
+                checksum_sha1 = hashlib.sha1(data).hexdigest()
+                checksum_sha256 = hashlib.sha256(data).hexdigest()
+
+                file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
+                sbom_files.append({
+                    "SPDXID": file_spdx_id,
+                    "fileName": path,
+                    "checksums": [
+                        {"algorithm": "SHA1", "checksumValue": checksum_sha1},
+                        {"algorithm": "SHA256", "checksumValue": checksum_sha256},
+                    ],
+                })
+
+                # Tie each file back to its respective package.
+                sbom_relationships.append({
+                    "spdxElementId": package_spdx_id,
+                    "relatedSpdxElement": file_spdx_id,
+                    "relationshipType": "CONTAINS",
+                })
+
+    # Update the SBOM on disk
+    sbom_data["files"] = sbom_files
+    sbom_data["relationships"] = sbom_relationships
+    sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))
+
+
+# Only regenerate the SBOM when this file is run as a script, not when imported.
+if __name__ == "__main__":
+ main()
diff --git a/Tools/build/mypy.ini b/Tools/build/mypy.ini
new file mode 100644
index 0000000..cf1dac7
--- /dev/null
+++ b/Tools/build/mypy.ini
@@ -0,0 +1,13 @@
+[mypy]
+files = Tools/build/generate_sbom.py
+pretty = True
+
+# Make sure Python can still be built
+# using Python 3.10 for `PYTHON_FOR_REGEN`...
+python_version = 3.10
+
+# ...And be strict:
+strict = True
+strict_concatenate = True
+enable_error_code = ignore-without-code,redundant-expr,truthy-bool,possibly-undefined
+warn_unreachable = True