diff options
Diffstat (limited to 'Tools/build')
-rw-r--r-- | Tools/build/check_extension_modules.py | 484 | ||||
-rw-r--r-- | Tools/build/deepfreeze.py | 504 | ||||
-rw-r--r-- | Tools/build/freeze_modules.py | 733 | ||||
-rw-r--r-- | Tools/build/generate_global_objects.py | 382 | ||||
-rw-r--r-- | Tools/build/generate_levenshtein_examples.py | 70 | ||||
-rw-r--r-- | Tools/build/generate_opcode_h.py | 199 | ||||
-rwxr-xr-x | Tools/build/generate_re_casefix.py | 96 | ||||
-rwxr-xr-x | Tools/build/generate_sre_constants.py | 80 | ||||
-rw-r--r-- | Tools/build/generate_stdlib_module_names.py | 139 | ||||
-rwxr-xr-x | Tools/build/generate_token.py | 282 | ||||
-rwxr-xr-x | Tools/build/parse_html5_entities.py | 115 | ||||
-rwxr-xr-x | Tools/build/smelly.py | 173 | ||||
-rw-r--r-- | Tools/build/stable_abi.py | 757 | ||||
-rw-r--r-- | Tools/build/umarshal.py | 325 | ||||
-rw-r--r-- | Tools/build/update_file.py | 92 | ||||
-rwxr-xr-x | Tools/build/verify_ensurepip_wheels.py | 98 |
16 files changed, 4529 insertions, 0 deletions
diff --git a/Tools/build/check_extension_modules.py b/Tools/build/check_extension_modules.py new file mode 100644 index 0000000..59239c6 --- /dev/null +++ b/Tools/build/check_extension_modules.py @@ -0,0 +1,484 @@ +"""Check extension modules + +The script checks shared and built-in extension modules. It verifies that the +modules have been built and that they can be imported successfully. Missing +modules and failed imports are reported to the user. Shared extension +files are renamed on failed import. + +Module information is parsed from several sources: + +- core modules hard-coded in Modules/config.c.in +- Windows-specific modules that are hard-coded in PC/config.c +- MODULE_{name}_STATE entries in Makefile (provided through sysconfig) +- Various makesetup files: + - $(srcdir)/Modules/Setup + - Modules/Setup.[local|bootstrap|stdlib] files, which are generated + from $(srcdir)/Modules/Setup.*.in files + +See --help for more information +""" +import argparse +import collections +import enum +import logging +import os +import pathlib +import re +import sys +import sysconfig +import warnings + +from importlib._bootstrap import _load as bootstrap_load +from importlib.machinery import BuiltinImporter, ExtensionFileLoader, ModuleSpec +from importlib.util import spec_from_file_location, spec_from_loader +from typing import Iterable + +SRC_DIR = pathlib.Path(__file__).parent.parent.parent + +# core modules, hard-coded in Modules/config.c.in +CORE_MODULES = { + "_ast", + "_imp", + "_string", + "_tokenize", + "_warnings", + "builtins", + "gc", + "marshal", + "sys", +} + +# Windows-only modules +WINDOWS_MODULES = { + "_msi", + "_overlapped", + "_testconsole", + "_winapi", + "msvcrt", + "nt", + "winreg", + "winsound", +} + + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + prog="check_extension_modules", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, +) + +parser.add_argument( + "--verbose", + action="store_true", + 
help="Verbose, report builtin, shared, and unavailable modules", +) + +parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging", +) + +parser.add_argument( + "--strict", + action=argparse.BooleanOptionalAction, + help=( + "Strict check, fail when a module is missing or fails to import" + "(default: no, unless env var PYTHONSTRICTEXTENSIONBUILD is set)" + ), + default=bool(os.environ.get("PYTHONSTRICTEXTENSIONBUILD")), +) + +parser.add_argument( + "--cross-compiling", + action=argparse.BooleanOptionalAction, + help=( + "Use cross-compiling checks " + "(default: no, unless env var _PYTHON_HOST_PLATFORM is set)." + ), + default="_PYTHON_HOST_PLATFORM" in os.environ, +) + +parser.add_argument( + "--list-module-names", + action="store_true", + help="Print a list of module names to stdout and exit", +) + + +class ModuleState(enum.Enum): + # Makefile state "yes" + BUILTIN = "builtin" + SHARED = "shared" + + DISABLED = "disabled" + MISSING = "missing" + NA = "n/a" + # disabled by Setup / makesetup rule + DISABLED_SETUP = "disabled_setup" + + def __bool__(self): + return self.value in {"builtin", "shared"} + + +ModuleInfo = collections.namedtuple("ModuleInfo", "name state") + + +class ModuleChecker: + pybuilddir_txt = "pybuilddir.txt" + + setup_files = ( + # see end of configure.ac + "Modules/Setup.local", + "Modules/Setup.stdlib", + "Modules/Setup.bootstrap", + SRC_DIR / "Modules/Setup", + ) + + def __init__(self, cross_compiling: bool = False, strict: bool = False): + self.cross_compiling = cross_compiling + self.strict_extensions_build = strict + self.ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") + self.platform = sysconfig.get_platform() + self.builddir = self.get_builddir() + self.modules = self.get_modules() + + self.builtin_ok = [] + self.shared_ok = [] + self.failed_on_import = [] + self.missing = [] + self.disabled_configure = [] + self.disabled_setup = [] + self.notavailable = [] + + def check(self): + for modinfo in 
self.modules: + logger.debug("Checking '%s' (%s)", modinfo.name, self.get_location(modinfo)) + if modinfo.state == ModuleState.DISABLED: + self.disabled_configure.append(modinfo) + elif modinfo.state == ModuleState.DISABLED_SETUP: + self.disabled_setup.append(modinfo) + elif modinfo.state == ModuleState.MISSING: + self.missing.append(modinfo) + elif modinfo.state == ModuleState.NA: + self.notavailable.append(modinfo) + else: + try: + if self.cross_compiling: + self.check_module_cross(modinfo) + else: + self.check_module_import(modinfo) + except (ImportError, FileNotFoundError): + self.rename_module(modinfo) + self.failed_on_import.append(modinfo) + else: + if modinfo.state == ModuleState.BUILTIN: + self.builtin_ok.append(modinfo) + else: + assert modinfo.state == ModuleState.SHARED + self.shared_ok.append(modinfo) + + def summary(self, *, verbose: bool = False): + longest = max([len(e.name) for e in self.modules], default=0) + + def print_three_column(modinfos: list[ModuleInfo]): + names = [modinfo.name for modinfo in modinfos] + names.sort(key=str.lower) + # guarantee zip() doesn't drop anything + while len(names) % 3: + names.append("") + for l, m, r in zip(names[::3], names[1::3], names[2::3]): + print("%-*s %-*s %-*s" % (longest, l, longest, m, longest, r)) + + if verbose and self.builtin_ok: + print("The following *built-in* modules have been successfully built:") + print_three_column(self.builtin_ok) + print() + + if verbose and self.shared_ok: + print("The following *shared* modules have been successfully built:") + print_three_column(self.shared_ok) + print() + + if self.disabled_configure: + print("The following modules are *disabled* in configure script:") + print_three_column(self.disabled_configure) + print() + + if self.disabled_setup: + print("The following modules are *disabled* in Modules/Setup files:") + print_three_column(self.disabled_setup) + print() + + if verbose and self.notavailable: + print( + f"The following modules are not available on 
platform '{self.platform}':" + ) + print_three_column(self.notavailable) + print() + + if self.missing: + print("The necessary bits to build these optional modules were not found:") + print_three_column(self.missing) + print("To find the necessary bits, look in configure.ac and config.log.") + print() + + if self.failed_on_import: + print( + "Following modules built successfully " + "but were removed because they could not be imported:" + ) + print_three_column(self.failed_on_import) + print() + + if any( + modinfo.name == "_ssl" for modinfo in self.missing + self.failed_on_import + ): + print("Could not build the ssl module!") + print("Python requires a OpenSSL 1.1.1 or newer") + if sysconfig.get_config_var("OPENSSL_LDFLAGS"): + print("Custom linker flags may require --with-openssl-rpath=auto") + print() + + disabled = len(self.disabled_configure) + len(self.disabled_setup) + print( + f"Checked {len(self.modules)} modules (" + f"{len(self.builtin_ok)} built-in, " + f"{len(self.shared_ok)} shared, " + f"{len(self.notavailable)} n/a on {self.platform}, " + f"{disabled} disabled, " + f"{len(self.missing)} missing, " + f"{len(self.failed_on_import)} failed on import)" + ) + + def check_strict_build(self): + """Fail if modules are missing and it's a strict build""" + if self.strict_extensions_build and (self.failed_on_import or self.missing): + raise RuntimeError("Failed to build some stdlib modules") + + def list_module_names(self, *, all: bool = False) -> set: + names = {modinfo.name for modinfo in self.modules} + if all: + names.update(WINDOWS_MODULES) + return names + + def get_builddir(self) -> pathlib.Path: + try: + with open(self.pybuilddir_txt, encoding="utf-8") as f: + builddir = f.read() + except FileNotFoundError: + logger.error("%s must be run from the top build directory", __file__) + raise + builddir = pathlib.Path(builddir) + logger.debug("%s: %s", self.pybuilddir_txt, builddir) + return builddir + + def get_modules(self) -> list[ModuleInfo]: + """Get 
module info from sysconfig and Modules/Setup* files""" + seen = set() + modules = [] + # parsing order is important, first entry wins + for modinfo in self.get_core_modules(): + modules.append(modinfo) + seen.add(modinfo.name) + for setup_file in self.setup_files: + for modinfo in self.parse_setup_file(setup_file): + if modinfo.name not in seen: + modules.append(modinfo) + seen.add(modinfo.name) + for modinfo in self.get_sysconfig_modules(): + if modinfo.name not in seen: + modules.append(modinfo) + seen.add(modinfo.name) + logger.debug("Found %i modules in total", len(modules)) + modules.sort() + return modules + + def get_core_modules(self) -> Iterable[ModuleInfo]: + """Get hard-coded core modules""" + for name in CORE_MODULES: + modinfo = ModuleInfo(name, ModuleState.BUILTIN) + logger.debug("Found core module %s", modinfo) + yield modinfo + + def get_sysconfig_modules(self) -> Iterable[ModuleInfo]: + """Get modules defined in Makefile through sysconfig + + MODBUILT_NAMES: modules in *static* block + MODSHARED_NAMES: modules in *shared* block + MODDISABLED_NAMES: modules in *disabled* block + """ + moddisabled = set(sysconfig.get_config_var("MODDISABLED_NAMES").split()) + if self.cross_compiling: + modbuiltin = set(sysconfig.get_config_var("MODBUILT_NAMES").split()) + else: + modbuiltin = set(sys.builtin_module_names) + + for key, value in sysconfig.get_config_vars().items(): + if not key.startswith("MODULE_") or not key.endswith("_STATE"): + continue + if value not in {"yes", "disabled", "missing", "n/a"}: + raise ValueError(f"Unsupported value '{value}' for {key}") + + modname = key[7:-6].lower() + if modname in moddisabled: + # Setup "*disabled*" rule + state = ModuleState.DISABLED_SETUP + elif value in {"disabled", "missing", "n/a"}: + state = ModuleState(value) + elif modname in modbuiltin: + assert value == "yes" + state = ModuleState.BUILTIN + else: + assert value == "yes" + state = ModuleState.SHARED + + modinfo = ModuleInfo(modname, state) + 
logger.debug("Found %s in Makefile", modinfo) + yield modinfo + + def parse_setup_file(self, setup_file: pathlib.Path) -> Iterable[ModuleInfo]: + """Parse a Modules/Setup file""" + assign_var = re.compile(r"^\w+=") # EGG_SPAM=foo + # default to static module + state = ModuleState.BUILTIN + logger.debug("Parsing Setup file %s", setup_file) + with open(setup_file, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or assign_var.match(line): + continue + match line.split(): + case ["*shared*"]: + state = ModuleState.SHARED + case ["*static*"]: + state = ModuleState.BUILTIN + case ["*disabled*"]: + state = ModuleState.DISABLED + case ["*noconfig*"]: + state = None + case [*items]: + if state == ModuleState.DISABLED: + # *disabled* can disable multiple modules per line + for item in items: + modinfo = ModuleInfo(item, state) + logger.debug("Found %s in %s", modinfo, setup_file) + yield modinfo + elif state in {ModuleState.SHARED, ModuleState.BUILTIN}: + # *shared* and *static*, first item is the name of the module. 
+ modinfo = ModuleInfo(items[0], state) + logger.debug("Found %s in %s", modinfo, setup_file) + yield modinfo + + def get_spec(self, modinfo: ModuleInfo) -> ModuleSpec: + """Get ModuleSpec for builtin or extension module""" + if modinfo.state == ModuleState.SHARED: + location = os.fspath(self.get_location(modinfo)) + loader = ExtensionFileLoader(modinfo.name, location) + return spec_from_file_location(modinfo.name, location, loader=loader) + elif modinfo.state == ModuleState.BUILTIN: + return spec_from_loader(modinfo.name, loader=BuiltinImporter) + else: + raise ValueError(modinfo) + + def get_location(self, modinfo: ModuleInfo) -> pathlib.Path: + """Get shared library location in build directory""" + if modinfo.state == ModuleState.SHARED: + return self.builddir / f"{modinfo.name}{self.ext_suffix}" + else: + return None + + def _check_file(self, modinfo: ModuleInfo, spec: ModuleSpec): + """Check that the module file is present and not empty""" + if spec.loader is BuiltinImporter: + return + try: + st = os.stat(spec.origin) + except FileNotFoundError: + logger.error("%s (%s) is missing", modinfo.name, spec.origin) + raise + if not st.st_size: + raise ImportError(f"{spec.origin} is an empty file") + + def check_module_import(self, modinfo: ModuleInfo): + """Attempt to import module and report errors""" + spec = self.get_spec(modinfo) + self._check_file(modinfo, spec) + try: + with warnings.catch_warnings(): + # ignore deprecation warning from deprecated modules + warnings.simplefilter("ignore", DeprecationWarning) + bootstrap_load(spec) + except ImportError as e: + logger.error("%s failed to import: %s", modinfo.name, e) + raise + except Exception as e: + logger.exception("Importing extension '%s' failed!", modinfo.name) + raise + + def check_module_cross(self, modinfo: ModuleInfo): + """Sanity check for cross compiling""" + spec = self.get_spec(modinfo) + self._check_file(modinfo, spec) + + def rename_module(self, modinfo: ModuleInfo) -> None: + """Rename module 
file""" + if modinfo.state == ModuleState.BUILTIN: + logger.error("Cannot mark builtin module '%s' as failed!", modinfo.name) + return + + failed_name = f"{modinfo.name}_failed{self.ext_suffix}" + builddir_path = self.get_location(modinfo) + if builddir_path.is_symlink(): + symlink = builddir_path + module_path = builddir_path.resolve().relative_to(os.getcwd()) + failed_path = module_path.parent / failed_name + else: + symlink = None + module_path = builddir_path + failed_path = self.builddir / failed_name + + # remove old failed file + failed_path.unlink(missing_ok=True) + # remove symlink + if symlink is not None: + symlink.unlink(missing_ok=True) + # rename shared extension file + try: + module_path.rename(failed_path) + except FileNotFoundError: + logger.debug("Shared extension file '%s' does not exist.", module_path) + else: + logger.debug("Rename '%s' -> '%s'", module_path, failed_path) + + +def main(): + args = parser.parse_args() + if args.debug: + args.verbose = True + logging.basicConfig( + level=logging.DEBUG if args.debug else logging.INFO, + format="[%(levelname)s] %(message)s", + ) + + checker = ModuleChecker( + cross_compiling=args.cross_compiling, + strict=args.strict, + ) + if args.list_module_names: + names = checker.list_module_names(all=True) + for name in sorted(names): + print(name) + else: + checker.check() + checker.summary(verbose=args.verbose) + try: + checker.check_strict_build() + except RuntimeError as e: + parser.exit(1, f"\nError: {e}\n") + + +if __name__ == "__main__": + main() diff --git a/Tools/build/deepfreeze.py b/Tools/build/deepfreeze.py new file mode 100644 index 0000000..28ac2b1 --- /dev/null +++ b/Tools/build/deepfreeze.py @@ -0,0 +1,504 @@ +"""Deep freeze + +The script may be executed by _bootstrap_python interpreter. +Shared library extension modules are not available in that case. +On Windows, and in cross-compilation cases, it is executed +by Python 3.10, and 3.11 features are not available. 
+""" +import argparse +import ast +import builtins +import collections +import contextlib +import os +import re +import time +import types +from typing import Dict, FrozenSet, TextIO, Tuple + +import umarshal +from generate_global_objects import get_identifiers_and_strings + +verbose = False +identifiers, strings = get_identifiers_and_strings() + +# This must be kept in sync with opcode.py +RESUME = 151 + +def isprintable(b: bytes) -> bool: + return all(0x20 <= c < 0x7f for c in b) + + +def make_string_literal(b: bytes) -> str: + res = ['"'] + if isprintable(b): + res.append(b.decode("ascii").replace("\\", "\\\\").replace("\"", "\\\"")) + else: + for i in b: + res.append(f"\\x{i:02x}") + res.append('"') + return "".join(res) + + +CO_FAST_LOCAL = 0x20 +CO_FAST_CELL = 0x40 +CO_FAST_FREE = 0x80 + + +def get_localsplus(code: types.CodeType): + a = collections.defaultdict(int) + for name in code.co_varnames: + a[name] |= CO_FAST_LOCAL + for name in code.co_cellvars: + a[name] |= CO_FAST_CELL + for name in code.co_freevars: + a[name] |= CO_FAST_FREE + return tuple(a.keys()), bytes(a.values()) + + +def get_localsplus_counts(code: types.CodeType, + names: Tuple[str, ...], + kinds: bytes) -> Tuple[int, int, int, int]: + nlocals = 0 + nplaincellvars = 0 + ncellvars = 0 + nfreevars = 0 + assert len(names) == len(kinds) + for name, kind in zip(names, kinds): + if kind & CO_FAST_LOCAL: + nlocals += 1 + if kind & CO_FAST_CELL: + ncellvars += 1 + elif kind & CO_FAST_CELL: + ncellvars += 1 + nplaincellvars += 1 + elif kind & CO_FAST_FREE: + nfreevars += 1 + assert nlocals == len(code.co_varnames) == code.co_nlocals, \ + (nlocals, len(code.co_varnames), code.co_nlocals) + assert ncellvars == len(code.co_cellvars) + assert nfreevars == len(code.co_freevars) + assert len(names) == nlocals + nplaincellvars + nfreevars + return nlocals, nplaincellvars, ncellvars, nfreevars + + +PyUnicode_1BYTE_KIND = 1 +PyUnicode_2BYTE_KIND = 2 +PyUnicode_4BYTE_KIND = 4 + + +def 
analyze_character_width(s: str) -> Tuple[int, bool]: + maxchar = ' ' + for c in s: + maxchar = max(maxchar, c) + ascii = False + if maxchar <= '\xFF': + kind = PyUnicode_1BYTE_KIND + ascii = maxchar <= '\x7F' + elif maxchar <= '\uFFFF': + kind = PyUnicode_2BYTE_KIND + else: + kind = PyUnicode_4BYTE_KIND + return kind, ascii + + +def removesuffix(base: str, suffix: str) -> str: + if base.endswith(suffix): + return base[:len(base) - len(suffix)] + return base + +class Printer: + + def __init__(self, file: TextIO) -> None: + self.level = 0 + self.file = file + self.cache: Dict[tuple[type, object, str], str] = {} + self.hits, self.misses = 0, 0 + self.patchups: list[str] = [] + self.deallocs: list[str] = [] + self.interns: list[str] = [] + self.write('#include "Python.h"') + self.write('#include "internal/pycore_gc.h"') + self.write('#include "internal/pycore_code.h"') + self.write('#include "internal/pycore_frame.h"') + self.write('#include "internal/pycore_long.h"') + self.write("") + + @contextlib.contextmanager + def indent(self) -> None: + save_level = self.level + try: + self.level += 1 + yield + finally: + self.level = save_level + + def write(self, arg: str) -> None: + self.file.writelines((" "*self.level, arg, "\n")) + + @contextlib.contextmanager + def block(self, prefix: str, suffix: str = "") -> None: + self.write(prefix + " {") + with self.indent(): + yield + self.write("}" + suffix) + + def object_head(self, typename: str) -> None: + with self.block(".ob_base =", ","): + self.write(f".ob_refcnt = 999999999,") + self.write(f".ob_type = &{typename},") + + def object_var_head(self, typename: str, size: int) -> None: + with self.block(".ob_base =", ","): + self.object_head(typename) + self.write(f".ob_size = {size},") + + def field(self, obj: object, name: str) -> None: + self.write(f".{name} = {getattr(obj, name)},") + + def generate_bytes(self, name: str, b: bytes) -> str: + if b == b"": + return "(PyObject *)&_Py_SINGLETON(bytes_empty)" + if len(b) == 1: + 
return f"(PyObject *)&_Py_SINGLETON(bytes_characters[{b[0]}])" + self.write("static") + with self.indent(): + with self.block("struct"): + self.write("PyObject_VAR_HEAD") + self.write("Py_hash_t ob_shash;") + self.write(f"char ob_sval[{len(b) + 1}];") + with self.block(f"{name} =", ";"): + self.object_var_head("PyBytes_Type", len(b)) + self.write(".ob_shash = -1,") + self.write(f".ob_sval = {make_string_literal(b)},") + return f"& {name}.ob_base.ob_base" + + def generate_unicode(self, name: str, s: str) -> str: + if s in strings: + return f"&_Py_STR({strings[s]})" + if s in identifiers: + return f"&_Py_ID({s})" + if re.match(r'\A[A-Za-z0-9_]+\Z', s): + name = f"const_str_{s}" + kind, ascii = analyze_character_width(s) + if kind == PyUnicode_1BYTE_KIND: + datatype = "uint8_t" + elif kind == PyUnicode_2BYTE_KIND: + datatype = "uint16_t" + else: + datatype = "uint32_t" + self.write("static") + with self.indent(): + with self.block("struct"): + if ascii: + self.write("PyASCIIObject _ascii;") + else: + self.write("PyCompactUnicodeObject _compact;") + self.write(f"{datatype} _data[{len(s)+1}];") + with self.block(f"{name} =", ";"): + if ascii: + with self.block("._ascii =", ","): + self.object_head("PyUnicode_Type") + self.write(f".length = {len(s)},") + self.write(".hash = -1,") + with self.block(".state =", ","): + self.write(".kind = 1,") + self.write(".compact = 1,") + self.write(".ascii = 1,") + self.write(f"._data = {make_string_literal(s.encode('ascii'))},") + return f"& {name}._ascii.ob_base" + else: + with self.block("._compact =", ","): + with self.block("._base =", ","): + self.object_head("PyUnicode_Type") + self.write(f".length = {len(s)},") + self.write(".hash = -1,") + with self.block(".state =", ","): + self.write(f".kind = {kind},") + self.write(".compact = 1,") + self.write(".ascii = 0,") + utf8 = s.encode('utf-8') + self.write(f'.utf8 = {make_string_literal(utf8)},') + self.write(f'.utf8_length = {len(utf8)},') + with self.block(f"._data =", ","): + 
for i in range(0, len(s), 16): + data = s[i:i+16] + self.write(", ".join(map(str, map(ord, data))) + ",") + return f"& {name}._compact._base.ob_base" + + + def generate_code(self, name: str, code: types.CodeType) -> str: + # The ordering here matches PyCode_NewWithPosOnlyArgs() + # (but see below). + co_consts = self.generate(name + "_consts", code.co_consts) + co_names = self.generate(name + "_names", code.co_names) + co_filename = self.generate(name + "_filename", code.co_filename) + co_name = self.generate(name + "_name", code.co_name) + co_qualname = self.generate(name + "_qualname", code.co_qualname) + co_linetable = self.generate(name + "_linetable", code.co_linetable) + co_exceptiontable = self.generate(name + "_exceptiontable", code.co_exceptiontable) + # These fields are not directly accessible + localsplusnames, localspluskinds = get_localsplus(code) + co_localsplusnames = self.generate(name + "_localsplusnames", localsplusnames) + co_localspluskinds = self.generate(name + "_localspluskinds", localspluskinds) + # Derived values + nlocals, nplaincellvars, ncellvars, nfreevars = \ + get_localsplus_counts(code, localsplusnames, localspluskinds) + co_code_adaptive = make_string_literal(code.co_code) + self.write("static") + with self.indent(): + self.write(f"struct _PyCode_DEF({len(code.co_code)})") + with self.block(f"{name} =", ";"): + self.object_var_head("PyCode_Type", len(code.co_code) // 2) + # But the ordering here must match that in cpython/code.h + # (which is a pain because we tend to reorder those for perf) + # otherwise MSVC doesn't like it. 
+ self.write(f".co_consts = {co_consts},") + self.write(f".co_names = {co_names},") + self.write(f".co_exceptiontable = {co_exceptiontable},") + self.field(code, "co_flags") + self.write(".co_warmup = QUICKENING_INITIAL_WARMUP_VALUE,") + self.write("._co_linearray_entry_size = 0,") + self.field(code, "co_argcount") + self.field(code, "co_posonlyargcount") + self.field(code, "co_kwonlyargcount") + self.write(f".co_framesize = {code.co_stacksize + len(localsplusnames)} + FRAME_SPECIALS_SIZE,") + self.field(code, "co_stacksize") + self.field(code, "co_firstlineno") + self.write(f".co_nlocalsplus = {len(localsplusnames)},") + self.field(code, "co_nlocals") + self.write(f".co_nplaincellvars = {nplaincellvars},") + self.write(f".co_ncellvars = {ncellvars},") + self.write(f".co_nfreevars = {nfreevars},") + self.write(f".co_localsplusnames = {co_localsplusnames},") + self.write(f".co_localspluskinds = {co_localspluskinds},") + self.write(f".co_filename = {co_filename},") + self.write(f".co_name = {co_name},") + self.write(f".co_qualname = {co_qualname},") + self.write(f".co_linetable = {co_linetable},") + self.write(f"._co_cached = NULL,") + self.write("._co_linearray = NULL,") + self.write(f".co_code_adaptive = {co_code_adaptive},") + for i, op in enumerate(code.co_code[::2]): + if op == RESUME: + self.write(f"._co_firsttraceable = {i},") + break + name_as_code = f"(PyCodeObject *)&{name}" + self.deallocs.append(f"_PyStaticCode_Dealloc({name_as_code});") + self.interns.append(f"_PyStaticCode_InternStrings({name_as_code})") + return f"& {name}.ob_base.ob_base" + + def generate_tuple(self, name: str, t: Tuple[object, ...]) -> str: + if len(t) == 0: + return f"(PyObject *)& _Py_SINGLETON(tuple_empty)" + items = [self.generate(f"{name}_{i}", it) for i, it in enumerate(t)] + self.write("static") + with self.indent(): + with self.block("struct"): + self.write("PyGC_Head _gc_head;") + with self.block("struct", "_object;"): + self.write("PyObject_VAR_HEAD") + if t: + 
self.write(f"PyObject *ob_item[{len(t)}];") + with self.block(f"{name} =", ";"): + with self.block("._object =", ","): + self.object_var_head("PyTuple_Type", len(t)) + if items: + with self.block(f".ob_item =", ","): + for item in items: + self.write(item + ",") + return f"& {name}._object.ob_base.ob_base" + + def _generate_int_for_bits(self, name: str, i: int, digit: int) -> None: + sign = -1 if i < 0 else 0 if i == 0 else +1 + i = abs(i) + digits: list[int] = [] + while i: + i, rem = divmod(i, digit) + digits.append(rem) + self.write("static") + with self.indent(): + with self.block("struct"): + self.write("PyObject_VAR_HEAD") + self.write(f"digit ob_digit[{max(1, len(digits))}];") + with self.block(f"{name} =", ";"): + self.object_var_head("PyLong_Type", sign*len(digits)) + if digits: + ds = ", ".join(map(str, digits)) + self.write(f".ob_digit = {{ {ds} }},") + + def generate_int(self, name: str, i: int) -> str: + if -5 <= i <= 256: + return f"(PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + {i}]" + if i >= 0: + name = f"const_int_{i}" + else: + name = f"const_int_negative_{abs(i)}" + if abs(i) < 2**15: + self._generate_int_for_bits(name, i, 2**15) + else: + connective = "if" + for bits_in_digit in 15, 30: + self.write(f"#{connective} PYLONG_BITS_IN_DIGIT == {bits_in_digit}") + self._generate_int_for_bits(name, i, 2**bits_in_digit) + connective = "elif" + self.write("#else") + self.write('#error "PYLONG_BITS_IN_DIGIT should be 15 or 30"') + self.write("#endif") + # If neither clause applies, it won't compile + return f"& {name}.ob_base.ob_base" + + def generate_float(self, name: str, x: float) -> str: + with self.block(f"static PyFloatObject {name} =", ";"): + self.object_head("PyFloat_Type") + self.write(f".ob_fval = {x},") + return f"&{name}.ob_base" + + def generate_complex(self, name: str, z: complex) -> str: + with self.block(f"static PyComplexObject {name} =", ";"): + self.object_head("PyComplex_Type") + self.write(f".cval = {{ {z.real}, {z.imag} }},") 
+ return f"&{name}.ob_base" + + def generate_frozenset(self, name: str, fs: FrozenSet[object]) -> str: + try: + fs = sorted(fs) + except TypeError: + # frozen set with incompatible types, fallback to repr() + fs = sorted(fs, key=repr) + ret = self.generate_tuple(name, tuple(fs)) + self.write("// TODO: The above tuple should be a frozenset") + return ret + + def generate_file(self, module: str, code: object)-> None: + module = module.replace(".", "_") + self.generate(f"{module}_toplevel", code) + with self.block(f"static void {module}_do_patchups(void)"): + for p in self.patchups: + self.write(p) + self.patchups.clear() + self.write(EPILOGUE.replace("%%NAME%%", module)) + + def generate(self, name: str, obj: object) -> str: + # Use repr() in the key to distinguish -0.0 from +0.0 + key = (type(obj), obj, repr(obj)) + if key in self.cache: + self.hits += 1 + # print(f"Cache hit {key!r:.40}: {self.cache[key]!r:.40}") + return self.cache[key] + self.misses += 1 + if isinstance(obj, (types.CodeType, umarshal.Code)) : + val = self.generate_code(name, obj) + elif isinstance(obj, tuple): + val = self.generate_tuple(name, obj) + elif isinstance(obj, str): + val = self.generate_unicode(name, obj) + elif isinstance(obj, bytes): + val = self.generate_bytes(name, obj) + elif obj is True: + return "Py_True" + elif obj is False: + return "Py_False" + elif isinstance(obj, int): + val = self.generate_int(name, obj) + elif isinstance(obj, float): + val = self.generate_float(name, obj) + elif isinstance(obj, complex): + val = self.generate_complex(name, obj) + elif isinstance(obj, frozenset): + val = self.generate_frozenset(name, obj) + elif obj is builtins.Ellipsis: + return "Py_Ellipsis" + elif obj is None: + return "Py_None" + else: + raise TypeError( + f"Cannot generate code for {type(obj).__name__} object") + # print(f"Cache store {key!r:.40}: {val!r:.40}") + self.cache[key] = val + return val + + +EPILOGUE = """ +PyObject * +_Py_get_%%NAME%%_toplevel(void) +{ + 
%%NAME%%_do_patchups(); + return Py_NewRef((PyObject *) &%%NAME%%_toplevel); +} +""" + +FROZEN_COMMENT_C = "/* Auto-generated by Programs/_freeze_module.c */" +FROZEN_COMMENT_PY = "/* Auto-generated by Programs/_freeze_module.py */" + +FROZEN_DATA_LINE = r"\s*(\d+,\s*)+\s*" + + +def is_frozen_header(source: str) -> bool: + return source.startswith((FROZEN_COMMENT_C, FROZEN_COMMENT_PY)) + + +def decode_frozen_data(source: str) -> types.CodeType: + lines = source.splitlines() + while lines and re.match(FROZEN_DATA_LINE, lines[0]) is None: + del lines[0] + while lines and re.match(FROZEN_DATA_LINE, lines[-1]) is None: + del lines[-1] + values: Tuple[int, ...] = ast.literal_eval("".join(lines).strip()) + data = bytes(values) + return umarshal.loads(data) + + +def generate(args: list[str], output: TextIO) -> None: + printer = Printer(output) + for arg in args: + file, modname = arg.rsplit(':', 1) + with open(file, "r", encoding="utf8") as fd: + source = fd.read() + if is_frozen_header(source): + code = decode_frozen_data(source) + else: + code = compile(fd.read(), f"<frozen {modname}>", "exec") + printer.generate_file(modname, code) + with printer.block(f"void\n_Py_Deepfreeze_Fini(void)"): + for p in printer.deallocs: + printer.write(p) + with printer.block(f"int\n_Py_Deepfreeze_Init(void)"): + for p in printer.interns: + with printer.block(f"if ({p} < 0)"): + printer.write("return -1;") + printer.write("return 0;") + if verbose: + print(f"Cache hits: {printer.hits}, misses: {printer.misses}") + + +parser = argparse.ArgumentParser() +parser.add_argument("-o", "--output", help="Defaults to deepfreeze.c", default="deepfreeze.c") +parser.add_argument("-v", "--verbose", action="store_true", help="Print diagnostics") +parser.add_argument('args', nargs="+", help="Input file and module name (required) in file:modname format") + +@contextlib.contextmanager +def report_time(label: str): + t0 = time.time() + try: + yield + finally: + t1 = time.time() + if verbose: + 
print(f"{label}: {t1-t0:.3f} sec") + + +def main() -> None: + global verbose + args = parser.parse_args() + verbose = args.verbose + output = args.output + with open(output, "w", encoding="utf-8") as file: + with report_time("generate"): + generate(args.args, file) + if verbose: + print(f"Wrote {os.path.getsize(output)} bytes to {output}") + + +if __name__ == "__main__": + main() diff --git a/Tools/build/freeze_modules.py b/Tools/build/freeze_modules.py new file mode 100644 index 0000000..810224b --- /dev/null +++ b/Tools/build/freeze_modules.py @@ -0,0 +1,733 @@ +"""Freeze modules and regen related files (e.g. Python/frozen.c). + +See the notes at the top of Python/frozen.c for more info. +""" + +from collections import namedtuple +import hashlib +import os +import ntpath +import posixpath +import argparse +from update_file import updating_file_with_tmpfile + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +ROOT_DIR = os.path.abspath(ROOT_DIR) +FROZEN_ONLY = os.path.join(ROOT_DIR, 'Tools', 'freeze', 'flag.py') + +STDLIB_DIR = os.path.join(ROOT_DIR, 'Lib') +# If FROZEN_MODULES_DIR or DEEPFROZEN_MODULES_DIR is changed then the +# .gitattributes and .gitignore files need to be updated. +FROZEN_MODULES_DIR = os.path.join(ROOT_DIR, 'Python', 'frozen_modules') +DEEPFROZEN_MODULES_DIR = os.path.join(ROOT_DIR, 'Python', 'deepfreeze') + +FROZEN_FILE = os.path.join(ROOT_DIR, 'Python', 'frozen.c') +MAKEFILE = os.path.join(ROOT_DIR, 'Makefile.pre.in') +PCBUILD_PROJECT = os.path.join(ROOT_DIR, 'PCbuild', '_freeze_module.vcxproj') +PCBUILD_FILTERS = os.path.join(ROOT_DIR, 'PCbuild', '_freeze_module.vcxproj.filters') +PCBUILD_PYTHONCORE = os.path.join(ROOT_DIR, 'PCbuild', 'pythoncore.vcxproj') + + +OS_PATH = 'ntpath' if os.name == 'nt' else 'posixpath' + +# These are modules that get frozen. +TESTS_SECTION = 'Test module' +FROZEN = [ + # See parse_frozen_spec() for the format. + # In cases where the frozenid is duplicated, the first one is re-used. 
+ ('import system', [ + # These frozen modules are necessary for bootstrapping + # the import system. + 'importlib._bootstrap : _frozen_importlib', + 'importlib._bootstrap_external : _frozen_importlib_external', + # This module is important because some Python builds rely + # on a builtin zip file instead of a filesystem. + 'zipimport', + ]), + ('stdlib - startup, without site (python -S)', [ + 'abc', + 'codecs', + # For now we do not freeze the encodings, due to the noise all + # those extra modules add to the text printed during the build. + # (See https://github.com/python/cpython/pull/28398#pullrequestreview-756856469.) + #'<encodings.*>', + 'io', + ]), + ('stdlib - startup, with site', [ + '_collections_abc', + '_sitebuiltins', + 'genericpath', + 'ntpath', + 'posixpath', + # We must explicitly mark os.path as a frozen module + # even though it will never be imported. + f'{OS_PATH} : os.path', + 'os', + 'site', + 'stat', + ]), + ('runpy - run module with -m', [ + "importlib.util", + "importlib.machinery", + "runpy", + ]), + (TESTS_SECTION, [ + '__hello__', + '__hello__ : __hello_alias__', + '__hello__ : <__phello_alias__>', + '__hello__ : __phello_alias__.spam', + '<__phello__.**.*>', + f'frozen_only : __hello_only__ = {FROZEN_ONLY}', + ]), +] +BOOTSTRAP = { + 'importlib._bootstrap', + 'importlib._bootstrap_external', + 'zipimport', +} + + +####################################### +# platform-specific helpers + +if os.path is posixpath: + relpath_for_posix_display = os.path.relpath + + def relpath_for_windows_display(path, base): + return ntpath.relpath( + ntpath.join(*path.split(os.path.sep)), + ntpath.join(*base.split(os.path.sep)), + ) + +else: + relpath_for_windows_display = ntpath.relpath + + def relpath_for_posix_display(path, base): + return posixpath.relpath( + posixpath.join(*path.split(os.path.sep)), + posixpath.join(*base.split(os.path.sep)), + ) + + +####################################### +# specs + +def parse_frozen_specs(): + seen = {} + for 
def _parse_spec(spec, knownids=None, section=None):
    """Parse one frozen-module spec string.

    Returns a pair ``(info, submodules)``.  ``info`` is the tuple
    ``(frozenid, pyfile, modname, ispkg, section)`` for the module named by
    the spec.  ``submodules`` is ``None``, or an iterator of further info
    tuples when the spec expands to a package whose submodules were
    requested by a pattern.

    Supported formats:

      frozenid
      frozenid : modname
      frozenid : modname = pyfile

    "frozenid" and "modname" must be valid module names (dot-separated
    identifiers).  If "modname" is not provided then "frozenid" is used.
    If "pyfile" is not provided then the filename of the module
    corresponding to "frozenid" is used.

    Angle brackets around a frozenid (e.g. "<encodings>") indicate
    it is a package.  This also means it must be an actual module
    (i.e. "pyfile" cannot have been provided).  Such values can have
    patterns to expand submodules:

      <encodings.*>    - also freeze all direct submodules
      <encodings.**.*> - also freeze the full submodule tree

    As with "frozenid", angle brackets around "modname" indicate
    it is a package.  In this case patterns in "modname" are not
    supported and "frozenid" must be a plain module name (no brackets).
    If "pyfile" is given it must not be a directory; if it is omitted
    and "frozenid" is not already known, it is resolved from "frozenid".

    *knownids* is the collection of frozen IDs seen so far (may be None or
    empty); *section* is passed through unchanged into every info tuple.
    """
    frozenid, _, remainder = spec.partition(':')
    modname, _, pyfile = remainder.partition('=')
    frozenid = frozenid.strip()
    modname = modname.strip()
    pyfile = pyfile.strip()

    submodules = None
    if modname.startswith('<') and modname.endswith('>'):
        assert check_modname(frozenid), spec
        modname = modname[1:-1]
        assert check_modname(modname), spec
        # Guard against knownids=None (the default), consistent with the
        # other branches below; "in None" would raise TypeError.
        if knownids and frozenid in knownids:
            pass
        elif pyfile:
            assert not os.path.isdir(pyfile), spec
        else:
            pyfile = _resolve_module(frozenid, ispkg=False)
        ispkg = True
    elif pyfile:
        assert check_modname(frozenid), spec
        assert not knownids or frozenid not in knownids, spec
        assert check_modname(modname), spec
        assert not os.path.isdir(pyfile), spec
        ispkg = False
    elif knownids and frozenid in knownids:
        assert check_modname(frozenid), spec
        assert check_modname(modname), spec
        ispkg = False
    else:
        assert not modname or check_modname(modname), spec
        resolved = iter(resolve_modules(frozenid))
        # The first resolved entry is the module itself; any remaining
        # entries are its submodules.
        frozenid, pyfile, ispkg = next(resolved)
        if not modname:
            modname = frozenid
        if ispkg:
            pkgid = frozenid
            pkgname = modname
            # Maps a source file to the frozen ID that first claimed it, so
            # a repeated file re-uses that ID instead of being frozen twice.
            pkgfiles = {pyfile: pkgid}
            def iter_subs():
                for frozenid, pyfile, ispkg in resolved:
                    if pkgname:
                        modname = frozenid.replace(pkgid, pkgname, 1)
                    else:
                        modname = frozenid
                    if pyfile:
                        if pyfile in pkgfiles:
                            frozenid = pkgfiles[pyfile]
                            pyfile = None
                        elif ispkg:
                            pkgfiles[pyfile] = frozenid
                    yield frozenid, pyfile, modname, ispkg, section
            submodules = iter_subs()

    info = (frozenid, pyfile or None, modname, ispkg, section)
    return info, submodules
def resolve_frozen_file(frozenid, destdir):
    """Return the header filename for the given frozen ID.

    *frozenid* is either a string or an object exposing a ``frozenid``
    attribute; anything else raises ValueError.  The result is
    ``<frozenid>.h``, joined onto *destdir* when *destdir* is truthy.

    For stdlib modules the ID will always be the full name
    of the source module.
    """
    if isinstance(frozenid, str):
        ident = frozenid
    else:
        try:
            ident = frozenid.frozenid
        except AttributeError:
            raise ValueError(f'unsupported frozenid {frozenid!r}')
    # All frozen modules share one naming convention: "<id>.h".
    header = f'{ident}.h'
    return os.path.join(destdir, header) if destdir else header
def check_modname(modname):
    """Return True if every dot-separated part of *modname* is an identifier."""
    for part in modname.split('.'):
        if not part.isidentifier():
            return False
    return True
+ if os.path.exists(pyfile): + yield modname, pyfile, True + if recursive: + yield from _iter_submodules(modname, entry.path) + + return _iter_submodules(pkgname, pkgdir) + + +def _resolve_modname_matcher(match, rootdir=None): + if isinstance(match, str): + if match.startswith('**.'): + recursive = True + pat = match[3:] + assert match + else: + recursive = False + pat = match + + if pat == '*': + def match_modname(modname): + return True, recursive + else: + raise NotImplementedError(match) + elif callable(match): + match_modname = match(rootdir) + else: + raise ValueError(f'unsupported matcher {match!r}') + return match_modname + + +def _resolve_module(modname, pathentry=STDLIB_DIR, ispkg=False): + assert pathentry, pathentry + pathentry = os.path.normpath(pathentry) + assert os.path.isabs(pathentry) + if ispkg: + return os.path.join(pathentry, *modname.split('.'), '__init__.py') + return os.path.join(pathentry, *modname.split('.')) + '.py' + + +####################################### +# regenerating dependent files + +def find_marker(lines, marker, file): + for pos, line in enumerate(lines): + if marker in line: + return pos + raise Exception(f"Can't find {marker!r} in file {file}") + + +def replace_block(lines, start_marker, end_marker, replacements, file): + start_pos = find_marker(lines, start_marker, file) + end_pos = find_marker(lines, end_marker, file) + if end_pos <= start_pos: + raise Exception(f"End marker {end_marker!r} " + f"occurs before start marker {start_marker!r} " + f"in file {file}") + replacements = [line.rstrip() + '\n' for line in replacements] + return lines[:start_pos + 1] + replacements + lines[end_pos:] + + +def regen_frozen(modules, frozen_modules: bool): + headerlines = [] + parentdir = os.path.dirname(FROZEN_FILE) + if frozen_modules: + for src in _iter_sources(modules): + # Adding a comment to separate sections here doesn't add much, + # so we don't. 
+ header = relpath_for_posix_display(src.frozenfile, parentdir) + headerlines.append(f'#include "{header}"') + + externlines = [] + bootstraplines = [] + stdliblines = [] + testlines = [] + aliaslines = [] + indent = ' ' + lastsection = None + for mod in modules: + if mod.isbootstrap: + lines = bootstraplines + elif mod.section == TESTS_SECTION: + lines = testlines + else: + lines = stdliblines + if mod.section != lastsection: + if lastsection is not None: + lines.append('') + lines.append(f'/* {mod.section} */') + lastsection = mod.section + + # Also add a extern declaration for the corresponding + # deepfreeze-generated function. + orig_name = mod.source.id + code_name = orig_name.replace(".", "_") + get_code_name = "_Py_get_%s_toplevel" % code_name + externlines.append("extern PyObject *%s(void);" % get_code_name) + + symbol = mod.symbol + pkg = 'true' if mod.ispkg else 'false' + if not frozen_modules: + line = ('{"%s", NULL, 0, %s, GET_CODE(%s)},' + ) % (mod.name, pkg, code_name) + else: + line = ('{"%s", %s, (int)sizeof(%s), %s, GET_CODE(%s)},' + ) % (mod.name, symbol, symbol, pkg, code_name) + lines.append(line) + + if mod.isalias: + if not mod.orig: + entry = '{"%s", NULL},' % (mod.name,) + elif mod.source.ispkg: + entry = '{"%s", "<%s"},' % (mod.name, mod.orig) + else: + entry = '{"%s", "%s"},' % (mod.name, mod.orig) + aliaslines.append(indent + entry) + + for lines in (bootstraplines, stdliblines, testlines): + # TODO: Is this necessary any more? + if not lines[0]: + del lines[0] + for i, line in enumerate(lines): + if line: + lines[i] = indent + line + + print(f'# Updating {os.path.relpath(FROZEN_FILE)}') + with updating_file_with_tmpfile(FROZEN_FILE) as (infile, outfile): + lines = infile.readlines() + # TODO: Use more obvious markers, e.g. 
+ # $START GENERATED FOOBAR$ / $END GENERATED FOOBAR$ + lines = replace_block( + lines, + "/* Includes for frozen modules: */", + "/* End includes */", + headerlines, + FROZEN_FILE, + ) + lines = replace_block( + lines, + "/* Start extern declarations */", + "/* End extern declarations */", + externlines, + FROZEN_FILE, + ) + lines = replace_block( + lines, + "static const struct _frozen bootstrap_modules[] =", + "/* bootstrap sentinel */", + bootstraplines, + FROZEN_FILE, + ) + lines = replace_block( + lines, + "static const struct _frozen stdlib_modules[] =", + "/* stdlib sentinel */", + stdliblines, + FROZEN_FILE, + ) + lines = replace_block( + lines, + "static const struct _frozen test_modules[] =", + "/* test sentinel */", + testlines, + FROZEN_FILE, + ) + lines = replace_block( + lines, + "const struct _module_alias aliases[] =", + "/* aliases sentinel */", + aliaslines, + FROZEN_FILE, + ) + outfile.writelines(lines) + + +def regen_makefile(modules): + pyfiles = [] + frozenfiles = [] + rules = [''] + deepfreezerules = ["Python/deepfreeze/deepfreeze.c: $(DEEPFREEZE_DEPS)", + "\t$(PYTHON_FOR_FREEZE) $(srcdir)/Tools/build/deepfreeze.py \\"] + for src in _iter_sources(modules): + frozen_header = relpath_for_posix_display(src.frozenfile, ROOT_DIR) + frozenfiles.append(f'\t\t{frozen_header} \\') + + pyfile = relpath_for_posix_display(src.pyfile, ROOT_DIR) + pyfiles.append(f'\t\t{pyfile} \\') + + if src.isbootstrap: + freezecmd = '$(FREEZE_MODULE_BOOTSTRAP)' + freezedep = '$(FREEZE_MODULE_BOOTSTRAP_DEPS)' + else: + freezecmd = '$(FREEZE_MODULE)' + freezedep = '$(FREEZE_MODULE_DEPS)' + + freeze = (f'{freezecmd} {src.frozenid} ' + f'$(srcdir)/{pyfile} {frozen_header}') + rules.extend([ + f'{frozen_header}: {pyfile} {freezedep}', + f'\t{freeze}', + '', + ]) + deepfreezerules.append(f"\t{frozen_header}:{src.frozenid} \\") + deepfreezerules.append('\t-o Python/deepfreeze/deepfreeze.c') + pyfiles[-1] = pyfiles[-1].rstrip(" \\") + frozenfiles[-1] = frozenfiles[-1].rstrip(" 
def regen_pcbuild(modules):
    """Regenerate the frozen-module sections of the PCbuild project files.

    For every distinct source in *modules* this builds the project and
    filter entries plus the deepfreeze command-line arguments, then rewrites
    the marked blocks in PCBUILD_PROJECT (frozen modules and deepfreeze
    rule), PCBUILD_FILTERS and PCBUILD_PYTHONCORE.
    """
    # NOTE(review): the leading whitespace inside the XML string literals
    # below is copied as it appears in the reviewed view, where runs of
    # spaces look collapsed -- confirm against the checked-in file.
    projlines = []
    filterlines = []
    corelines = []
    deepfreezerules = ['\t<Exec Command=\'$(PythonForBuild) "$(PySourcePath)Tools\\build\\deepfreeze.py" ^']
    for src in _iter_sources(modules):
        pyfile = relpath_for_windows_display(src.pyfile, ROOT_DIR)
        header = relpath_for_windows_display(src.frozenfile, ROOT_DIR)
        intfile = ntpath.splitext(ntpath.basename(header))[0] + '.g.h'
        projlines.append(f' <None Include="..\\{pyfile}">')
        projlines.append(f' <ModName>{src.frozenid}</ModName>')
        projlines.append(f' <IntFile>$(IntDir){intfile}</IntFile>')
        projlines.append(f' <OutFile>$(PySourcePath){header}</OutFile>')
        projlines.append(f' </None>')

        filterlines.append(f' <None Include="..\\{pyfile}">')
        filterlines.append(' <Filter>Python Files</Filter>')
        filterlines.append(' </None>')
        deepfreezerules.append(f'\t\t "$(PySourcePath){header}:{src.frozenid}" ^')
    deepfreezerules.append('\t\t "-o" "$(PySourcePath)Python\\deepfreeze\\deepfreeze.c"\'/>' )

    corelines.append(f' <ClCompile Include="..\\Python\\deepfreeze\\deepfreeze.c" />')

    print(f'# Updating {os.path.relpath(PCBUILD_PROJECT)}')
    with updating_file_with_tmpfile(PCBUILD_PROJECT) as (infile, outfile):
        lines = infile.readlines()
        lines = replace_block(
            lines,
            '<!-- BEGIN frozen modules -->',
            '<!-- END frozen modules -->',
            projlines,
            PCBUILD_PROJECT,
        )
        outfile.writelines(lines)
    with updating_file_with_tmpfile(PCBUILD_PROJECT) as (infile, outfile):
        lines = infile.readlines()
        lines = replace_block(
            lines,
            '<!-- BEGIN deepfreeze rule -->',
            '<!-- END deepfreeze rule -->',
            deepfreezerules,
            PCBUILD_PROJECT,
        )
        outfile.writelines(lines)
    print(f'# Updating {os.path.relpath(PCBUILD_FILTERS)}')
    with updating_file_with_tmpfile(PCBUILD_FILTERS) as (infile, outfile):
        lines = infile.readlines()
        lines = replace_block(
            lines,
            '<!-- BEGIN frozen modules -->',
            '<!-- END frozen modules -->',
            filterlines,
            PCBUILD_FILTERS,
        )
        outfile.writelines(lines)
    print(f'# Updating {os.path.relpath(PCBUILD_PYTHONCORE)}')
    with updating_file_with_tmpfile(PCBUILD_PYTHONCORE) as (infile, outfile):
        lines = infile.readlines()
        lines = replace_block(
            lines,
            '<!-- BEGIN deepfreeze -->',
            '<!-- END deepfreeze -->',
            corelines,
            # Name the file actually being edited, so a missing-marker
            # error points at pythoncore.vcxproj and not the filters file.
            PCBUILD_PYTHONCORE,
        )
        outfile.writelines(lines)
+ regen_makefile(modules) + regen_pcbuild(modules) + regen_frozen(modules, frozen_modules) + + +if __name__ == '__main__': + main() diff --git a/Tools/build/generate_global_objects.py b/Tools/build/generate_global_objects.py new file mode 100644 index 0000000..dd67cfe --- /dev/null +++ b/Tools/build/generate_global_objects.py @@ -0,0 +1,382 @@ +import contextlib +import io +import os.path +import re + +SCRIPT_NAME = 'Tools/build/generate_global_objects.py' +__file__ = os.path.abspath(__file__) +ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +INTERNAL = os.path.join(ROOT, 'Include', 'internal') + + +IGNORED = { + 'ACTION', # Python/_warnings.c + 'ATTR', # Python/_warnings.c and Objects/funcobject.c + 'DUNDER', # Objects/typeobject.c + 'RDUNDER', # Objects/typeobject.c + 'SPECIAL', # Objects/weakrefobject.c + 'NAME', # Objects/typeobject.c +} +IDENTIFIERS = [ + # from ADD() Python/_warnings.c + 'default', + 'ignore', + + # from GET_WARNINGS_ATTR() in Python/_warnings.c + 'WarningMessage', + '_showwarnmsg', + '_warn_unawaited_coroutine', + 'defaultaction', + 'filters', + 'onceregistry', + + # from WRAP_METHOD() in Objects/weakrefobject.c + '__bytes__', + '__reversed__', + + # from COPY_ATTR() in Objects/funcobject.c + '__module__', + '__name__', + '__qualname__', + '__doc__', + '__annotations__', + + # from SLOT* in Objects/typeobject.c + '__abs__', + '__add__', + '__aiter__', + '__and__', + '__anext__', + '__await__', + '__bool__', + '__call__', + '__contains__', + '__del__', + '__delattr__', + '__delete__', + '__delitem__', + '__eq__', + '__float__', + '__floordiv__', + '__ge__', + '__get__', + '__getattr__', + '__getattribute__', + '__getitem__', + '__gt__', + '__hash__', + '__iadd__', + '__iand__', + '__ifloordiv__', + '__ilshift__', + '__imatmul__', + '__imod__', + '__imul__', + '__index__', + '__init__', + '__int__', + '__invert__', + '__ior__', + '__ipow__', + '__irshift__', + '__isub__', + '__iter__', + '__itruediv__', + '__ixor__', + 
def iter_files():
    """Yield the path of every ``.c`` and ``.h`` file under the C source dirs.

    The directories walked (relative to ROOT) are Modules, Objects, Parser,
    PC, Programs and Python.
    """
    c_suffixes = ('.c', '.h')
    for topdir in ('Modules', 'Objects', 'Parser', 'PC', 'Programs', 'Python'):
        start = os.path.join(ROOT, topdir)
        for dirpath, _subdirs, filenames in os.walk(start):
            for filename in filenames:
                if filename.endswith(c_suffixes):
                    yield os.path.join(dirpath, filename)
def iter_to_marker(lines, marker):
    """Yield items from *lines* until one equals *marker* (ignoring trailing whitespace).

    The marker line itself is consumed but not yielded, so when *lines* is
    an iterator the caller can keep reading from just after the marker.
    """
    for current in lines:
        if current.rstrip() != marker:
            yield current
            continue
        return
+ with open(filename) as infile: + orig = infile.read() + lines = iter(orig.rstrip().splitlines()) + before = '\n'.join(iter_to_marker(lines, START)) + for _ in iter_to_marker(lines, END): + pass + after = '\n'.join(lines) + + # Generate the file. + with open_for_changes(filename, orig) as outfile: + printer = Printer(outfile) + printer.write(before) + printer.write(START) + with printer.block('struct _Py_global_strings', ';'): + with printer.block('struct', ' literals;'): + for literal, name in sorted(strings.items(), key=lambda x: x[1]): + printer.write(f'STRUCT_FOR_STR({name}, "{literal}")') + outfile.write('\n') + with printer.block('struct', ' identifiers;'): + for name in sorted(identifiers): + assert name.isidentifier(), name + printer.write(f'STRUCT_FOR_ID({name})') + with printer.block('struct', ' ascii[128];'): + printer.write("PyASCIIObject _ascii;") + printer.write("uint8_t _data[2];") + with printer.block('struct', ' latin1[128];'): + printer.write("PyCompactUnicodeObject _latin1;") + printer.write("uint8_t _data[2];") + printer.write(END) + printer.write(after) + + +def generate_runtime_init(identifiers, strings): + # First get some info from the declarations. + nsmallposints = None + nsmallnegints = None + with open(os.path.join(INTERNAL, 'pycore_global_objects.h')) as infile: + for line in infile: + if line.startswith('#define _PY_NSMALLPOSINTS'): + nsmallposints = int(line.split()[-1]) + elif line.startswith('#define _PY_NSMALLNEGINTS'): + nsmallnegints = int(line.split()[-1]) + break + else: + raise NotImplementedError + assert nsmallposints and nsmallnegints + + # Then target the runtime initializer. + filename = os.path.join(INTERNAL, 'pycore_runtime_init_generated.h') + + # Read the non-generated part of the file. 
+ with open(filename) as infile: + orig = infile.read() + lines = iter(orig.rstrip().splitlines()) + before = '\n'.join(iter_to_marker(lines, START)) + for _ in iter_to_marker(lines, END): + pass + after = '\n'.join(lines) + + # Generate the file. + with open_for_changes(filename, orig) as outfile: + immortal_objects = [] + printer = Printer(outfile) + printer.write(before) + printer.write(START) + with printer.block('#define _Py_global_objects_INIT', continuation=True): + with printer.block('.singletons =', ','): + # Global int objects. + with printer.block('.small_ints =', ','): + for i in range(-nsmallnegints, nsmallposints): + printer.write(f'_PyLong_DIGIT_INIT({i}),') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + {i}]') + printer.write('') + # Global bytes objects. + printer.write('.bytes_empty = _PyBytes_SIMPLE_INIT(0, 0),') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(bytes_empty)') + with printer.block('.bytes_characters =', ','): + for i in range(256): + printer.write(f'_PyBytes_CHAR_INIT({i}),') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(bytes_characters)[{i}]') + printer.write('') + # Global strings. 
+ with printer.block('.strings =', ','): + with printer.block('.literals =', ','): + for literal, name in sorted(strings.items(), key=lambda x: x[1]): + printer.write(f'INIT_STR({name}, "{literal}"),') + immortal_objects.append(f'(PyObject *)&_Py_STR({name})') + with printer.block('.identifiers =', ','): + for name in sorted(identifiers): + assert name.isidentifier(), name + printer.write(f'INIT_ID({name}),') + immortal_objects.append(f'(PyObject *)&_Py_ID({name})') + with printer.block('.ascii =', ','): + for i in range(128): + printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]') + with printer.block('.latin1 =', ','): + for i in range(128, 256): + utf8 = ['"'] + for c in chr(i).encode('utf-8'): + utf8.append(f"\\x{c:02x}") + utf8.append('"') + printer.write(f'_PyUnicode_LATIN1_INIT("\\x{i:02x}", {"".join(utf8)}),') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).latin1[{i} - 128]') + printer.write('') + with printer.block('.tuple_empty =', ','): + printer.write('.ob_base = _PyVarObject_IMMORTAL_INIT(&PyTuple_Type, 0)') + immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(tuple_empty)') + printer.write('') + printer.write("static inline void") + with printer.block("_PyUnicode_InitStaticStrings(void)"): + printer.write(f'PyObject *string;') + for i in sorted(identifiers): + # This use of _Py_ID() is ignored by iter_global_strings() + # since iter_files() ignores .h files. 
def get_identifiers_and_strings() -> 'tuple[set[str], dict[str, str]]':
    """Collect the global identifiers and string literals used in the sources.

    Returns ``(identifiers, strings)``: *identifiers* starts from
    IDENTIFIERS and gains every scanned name that is not in IGNORED;
    *strings* maps each declared literal to the variable name it was
    declared under.

    Raises ValueError if the same literal is declared under two different
    names.
    """
    identifiers = set(IDENTIFIERS)
    strings = {}
    for name, string, *_ in iter_global_strings():
        if string is None:
            if name not in IGNORED:
                identifiers.add(name)
        else:
            if string not in strings:
                strings[string] = name
            elif name != strings[string]:
                # "strings" is keyed by the literal, not by the variable
                # name; indexing it with "name" would raise KeyError and
                # hide this error.
                raise ValueError(
                    f'string mismatch for {name!r} '
                    f'({string!r} is already declared as {strings[string]!r})')
    return identifiers, strings
+_MOVE_COST = 2 +_CASE_COST = 1 + + +def _substitution_cost(ch_a, ch_b): + if ch_a == ch_b: + return 0 + if ch_a.lower() == ch_b.lower(): + return _CASE_COST + return _MOVE_COST + + +@cache +def levenshtein(a, b): + if not a or not b: + return (len(a) + len(b)) * _MOVE_COST + option1 = levenshtein(a[:-1], b[:-1]) + _substitution_cost(a[-1], b[-1]) + option2 = levenshtein(a[:-1], b) + _MOVE_COST + option3 = levenshtein(a, b[:-1]) + _MOVE_COST + return min(option1, option2, option3) + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('output_path', metavar='FILE', type=str) + parser.add_argument('--overwrite', dest='overwrite', action='store_const', + const=True, default=False, + help='overwrite an existing test file') + + args = parser.parse_args() + output_path = os.path.realpath(args.output_path) + if not args.overwrite and os.path.isfile(output_path): + print(f"{output_path} already exists, skipping regeneration.") + print( + "To force, add --overwrite to the invocation of this tool or" + " delete the existing file." + ) + return + + examples = set() + # Create a lot of non-empty examples, which should end up with a Gauss-like + # distribution for even costs (moves) and odd costs (case substitutions). + while len(examples) < 9990: + a = ''.join(choices("abcABC", k=randrange(1, 10))) + b = ''.join(choices("abcABC", k=randrange(1, 10))) + expected = levenshtein(a, b) + examples.add((a, b, expected)) + # Create one empty case each for strings between 0 and 9 in length. 
UINT32_MASK = (1<<32)-1

def write_int_array_from_ops(name, ops, out):
    """Write a 9-word ``uint32_t`` bitset array named *name* to *out*.

    Bit ``op`` is set for every value in *ops*; words are emitted least
    significant first.  Asserts that no set bit falls beyond the ninth
    32-bit word.
    """
    # NOTE(review): the leading whitespace of the per-word literal is copied
    # as it appears in the reviewed view, where runs of spaces look
    # collapsed -- confirm against the checked-in file.
    bitset = 0
    for opcode in ops:
        bitset |= 1 << opcode
    out.write(f"static const uint32_t {name}[9] = {{\n")
    words_left = 9
    while words_left:
        word = bitset & UINT32_MASK
        out.write(f" {word}U,\n")
        bitset >>= 32
        words_left -= 1
    assert bitset == 0
    out.write(f"}};\n")
= fp.read() + exec(code, opcode) + opmap = opcode['opmap'] + opname = opcode['opname'] + hasarg = opcode['hasarg'] + hasconst = opcode['hasconst'] + hasjrel = opcode['hasjrel'] + hasjabs = opcode['hasjabs'] + is_pseudo = opcode['is_pseudo'] + _pseudo_ops = opcode['_pseudo_ops'] + + HAVE_ARGUMENT = opcode["HAVE_ARGUMENT"] + MIN_PSEUDO_OPCODE = opcode["MIN_PSEUDO_OPCODE"] + MAX_PSEUDO_OPCODE = opcode["MAX_PSEUDO_OPCODE"] + + NUM_OPCODES = len(opname) + used = [ False ] * len(opname) + next_op = 1 + + for name, op in opmap.items(): + used[op] = True + + specialized_opmap = {} + opname_including_specialized = opname.copy() + for name in opcode['_specialized_instructions']: + while used[next_op]: + next_op += 1 + specialized_opmap[name] = next_op + opname_including_specialized[next_op] = name + used[next_op] = True + specialized_opmap['DO_TRACING'] = 255 + opname_including_specialized[255] = 'DO_TRACING' + used[255] = True + + with (open(outfile, 'w') as fobj, open(internaloutfile, 'w') as iobj): + fobj.write(header) + iobj.write(internal_header) + + for name in opname: + if name in opmap: + op = opmap[name] + if op == HAVE_ARGUMENT: + fobj.write(DEFINE.format("HAVE_ARGUMENT", HAVE_ARGUMENT)) + if op == MIN_PSEUDO_OPCODE: + fobj.write(DEFINE.format("MIN_PSEUDO_OPCODE", MIN_PSEUDO_OPCODE)) + + fobj.write(DEFINE.format(name, op)) + + if op == MAX_PSEUDO_OPCODE: + fobj.write(DEFINE.format("MAX_PSEUDO_OPCODE", MAX_PSEUDO_OPCODE)) + + + for name, op in specialized_opmap.items(): + fobj.write(DEFINE.format(name, op)) + + iobj.write("\nextern const uint8_t _PyOpcode_Caches[256];\n") + iobj.write("\nextern const uint8_t _PyOpcode_Deopt[256];\n") + iobj.write("\n#ifdef NEED_OPCODE_TABLES\n") + write_int_array_from_ops("_PyOpcode_RelativeJump", opcode['hasjrel'], iobj) + write_int_array_from_ops("_PyOpcode_Jump", opcode['hasjrel'] + opcode['hasjabs'], iobj) + + iobj.write("\nconst uint8_t _PyOpcode_Caches[256] = {\n") + for i, entries in 
enumerate(opcode["_inline_cache_entries"]): + if entries: + iobj.write(f" [{opname[i]}] = {entries},\n") + iobj.write("};\n") + + deoptcodes = {} + for basic, op in opmap.items(): + if not is_pseudo(op): + deoptcodes[basic] = basic + for basic, family in opcode["_specializations"].items(): + for specialized in family: + deoptcodes[specialized] = basic + iobj.write("\nconst uint8_t _PyOpcode_Deopt[256] = {\n") + for opt, deopt in sorted(deoptcodes.items()): + iobj.write(f" [{opt}] = {deopt},\n") + iobj.write("};\n") + iobj.write("#endif // NEED_OPCODE_TABLES\n") + + fobj.write("\n") + fobj.write("#define HAS_ARG(op) ((((op) >= HAVE_ARGUMENT) && (!IS_PSEUDO_OPCODE(op)))\\") + for op in _pseudo_ops: + if opmap[op] in hasarg: + fobj.write(f"\n || ((op) == {op}) \\") + fobj.write("\n )\n") + + fobj.write("\n") + fobj.write("#define HAS_CONST(op) (false\\") + for op in hasconst: + fobj.write(f"\n || ((op) == {opname[op]}) \\") + fobj.write("\n )\n") + + fobj.write("\n") + for i, (op, _) in enumerate(opcode["_nb_ops"]): + fobj.write(DEFINE.format(op, i)) + + iobj.write("\n") + iobj.write("#ifdef Py_DEBUG\n") + iobj.write(f"static const char *const _PyOpcode_OpName[{NUM_OPCODES}] = {{\n") + for op, name in enumerate(opname_including_specialized): + if name[0] != "<": + op = name + iobj.write(f''' [{op}] = "{name}",\n''') + iobj.write("};\n") + iobj.write("#endif\n") + + iobj.write("\n") + iobj.write("#define EXTRA_CASES \\\n") + for i, flag in enumerate(used): + if not flag: + iobj.write(f" case {i}: \\\n") + iobj.write(" ;\n") + + fobj.write(footer) + iobj.write(internal_footer) + + + print(f"{outfile} regenerated from {opcode_py}") + + +if __name__ == '__main__': + main(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/Tools/build/generate_re_casefix.py b/Tools/build/generate_re_casefix.py new file mode 100755 index 0000000..b57ac07 --- /dev/null +++ b/Tools/build/generate_re_casefix.py @@ -0,0 +1,96 @@ +#! 
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'


def update_file(file, content):
    """Write *content* to *file* unless it already contains exactly that text.

    Returns False when the file was left untouched and True when it was
    (re)written.  A file that cannot be read or decoded is simply rewritten.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        # Missing/unreadable file or undecodable bytes: fall through and write.
        pass
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True


# Skeleton of the generated module; the single %s receives the dict items.
# This is an f-string (for SCRIPT_NAME), so the literal braces of the
# generated dict must be doubled: a bare "{ %s }" would be parsed as a
# replacement field and make this script a SyntaxError.
re_casefix_template = f"""\
# Auto-generated by {SCRIPT_NAME}.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {{
%s
}}
"""


def uname(i):
    """Return the Unicode name of code point *i*, or 'U+XXXX' if unnamed."""
    return unicodedata.name(chr(i), r'U+%04X' % i)


class hexint(int):
    """int whose repr is zero-padded hexadecimal (e.g. 0x0069)."""

    def __repr__(self):
        return '%#06x' % self


def alpha(i):
    """Return chr(i) if alphabetic, else its ascii() escape without quotes."""
    c = chr(i)
    return c if c.isalpha() else ascii(c)[1:-1]
def update_file(file, content):
    """Write *content* to *file* unless it already contains exactly that text.

    Returns False when the existing file already matches (it is not touched)
    and True when the file was created or rewritten.

    The file is always read and written as UTF-8 so the generated header does
    not depend on the locale of the machine running the script; this matches
    the update_file() helper in generate_re_casefix.py.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        # Missing or unreadable file, or bytes that do not decode
        # (UnicodeDecodeError is a ValueError): regenerate it.
        pass
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True
def main(
    infile="Lib/re/_constants.py",
    outfile_constants="Modules/_sre/sre_constants.h",
    outfile_targets="Modules/_sre/sre_targets.h",
):
    """Regenerate the two _sre C headers from the Python constants module.

    *infile* is executed and its namespace supplies MAGIC, OPCODES, ATCODES,
    CHCODES and every SRE_FLAG_*/SRE_INFO_* name.  The rendered text is passed
    to update_file() for *outfile_constants* and *outfile_targets*.
    """
    namespace = {}
    with open(infile) as source:
        code = source.read()
    exec(code, namespace)

    def numbered_defines(codes, prefix):
        # "#define <prefix>_<code> <number>" for each code, in ascending order.
        return ["#define %s_%s %d\n" % (prefix, code, code)
                for code in sorted(codes)]

    def prefixed_defines(mapping, prefix):
        # "#define <name> <number>" for names starting with prefix,
        # ordered by (number, name).
        pairs = sorted(
            (number, label)
            for label, number in mapping.items()
            if label.startswith(prefix)
        )
        return ["#define %s %d\n" % (label, number) for number, label in pairs]

    def goto_targets(codes, prefix):
        # One computed-goto label per opcode; opcodes must be 0..N-1.
        lines = []
        for position, code in enumerate(sorted(codes)):
            assert position == code
            lines.append(f" &&{prefix}_{code},\n")
        return lines

    constants = [
        sre_constants_header,
        "#define SRE_MAGIC %d\n" % namespace["MAGIC"],
        *numbered_defines(namespace["OPCODES"], "SRE_OP"),
        *numbered_defines(namespace["ATCODES"], "SRE"),
        *numbered_defines(namespace["CHCODES"], "SRE"),
        *prefixed_defines(namespace, "SRE_FLAG_"),
        *prefixed_defines(namespace, "SRE_INFO_"),
    ]
    update_file(outfile_constants, ''.join(constants))

    opcodes = namespace["OPCODES"]
    targets = [
        sre_constants_header,
        f"static void *sre_targets[{len(opcodes)}] = {{\n",
        *goto_targets(opcodes, "TARGET_SRE_OP"),
        "};\n",
    ]
    update_file(outfile_targets, ''.join(targets))
# Packages in Lib/
def list_packages(names):
    """Add to *names* each Lib/ sub-directory that holds a ``.py`` file.

    Entries listed in IGNORE and entries that are not directories are
    skipped.  Only the directory's immediate children are inspected.
    """
    for entry in os.listdir(STDLIB_PATH):
        if entry in IGNORE:
            continue
        candidate_dir = os.path.join(STDLIB_PATH, entry)
        if os.path.isdir(candidate_dir):
            has_python_source = False
            for child in os.listdir(candidate_dir):
                if child.endswith(".py"):
                    has_python_source = True
                    break
            if has_python_source:
                names.add(entry)
def write_modules(fp, names):
    """Print the C array ``_Py_stdlib_module_names`` to *fp*.

    A two-line comment header and a blank line come first, then one quoted,
    comma-terminated entry per module name in sorted order.
    """
    banner = (
        f"// Auto-generated by {SCRIPT_NAME}.",
        "// List used to create sys.stdlib_module_names.",
    )
    for comment_line in banner:
        print(comment_line, file=fp)
    print(file=fp)

    print("static const char* _Py_stdlib_module_names[] = {", file=fp)
    for module_name in sorted(names):
        print(f'"{module_name}",', file=fp)
    print("};", file=fp)
def load_tokens(path):
    """Parse a Grammar/Tokens style file.

    Each non-empty line, after removing anything from ``#`` onwards, is
    ``NAME`` optionally followed by a quoted string literal.  Tokens are
    numbered from 0 in file order.

    Returns ``(tok_names, ERRORTOKEN, string_to_tok)``:
      tok_names      list of token names, index == token value
      ERRORTOKEN     value of the token named 'ERRORTOKEN', or None if absent
      string_to_tok  dict mapping each literal string to its token value

    Raises ValueError or SyntaxError if the second field is not a Python
    literal.
    """
    # Local import: the literal is parsed, never executed, so a stray
    # expression in the data file cannot run code the way eval() would.
    import ast

    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            # Drop the comment first, then surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            if len(fields) > 1:
                string_to_tok[ast.literal_eval(fields[1])] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok
def generate_chars_to_token(mapping, n=1):
    """Render a (possibly nested) C ``switch`` over character ``c<n>``.

    *mapping* maps a character either to a token name (emitted as
    ``return <name>;``) or to another mapping, which is rendered recursively
    as a switch on the next character followed by ``break;``.  Cases are
    emitted in sorted character order.  Returns the C source as one string.
    """
    pad = ' ' * n
    pieces = [pad, 'switch (c%d) {\n' % (n,)]
    for char in sorted(mapping):
        target = mapping[char]
        pieces.append(pad)
        if not isinstance(target, dict):
            # Leaf: this character completes a token.
            pieces.append("case '%s': return %s;\n" % (char, target))
            continue
        # Inner node: dispatch on the following character.
        pieces.extend((
            "case '%s':\n" % (char,),
            generate_chars_to_token(target, n + 1),
            pad,
            ' break;\n',
        ))
    pieces.extend((pad, '}\n'))
    return ''.join(pieces)
def make_py(infile, outfile='Lib/token.py'):
    """Regenerate the Python token module *outfile* from *infile*.

    Emits one ``NAME = value`` line per token, with an explanatory comment
    inserted at the ERRORTOKEN position, plus the sorted exact-token-type
    entries.  Prints a notice when update_file() reports a change.
    """
    tok_names, errortoken_index, string_to_tok = load_tokens(infile)

    constant_lines = ['%s = %d' % (name, number)
                      for number, name in enumerate(tok_names)]
    constant_lines.insert(
        errortoken_index,
        "# These aren't used by the C tokenizer but are needed for tokenize.py")

    exact_type_lines = [
        ' %r: %s,' % (text, tok_names[number])
        for text, number in sorted(string_to_tok.items())
    ]

    rendered = token_py_template % (
        '\n'.join(constant_lines),
        len(tok_names),
        NT_OFFSET,
        '\n'.join(exact_type_lines),
    )
    regenerated = update_file(outfile, rendered)
    if regenerated:
        print("%s regenerated from %s" % (outfile, infile))
def compare_dicts(old, new):
    """Print which keys were added, removed or changed between two dicts.

    Each non-empty category gets a count line followed by one line per key
    in sorted order.  Nothing is printed when the dicts are equal.
    """
    def report(verb, keys, source):
        # Shared layout for the "added" and "removed" sections.
        if keys:
            print('{} entitie(s) have been {}:'.format(len(keys), verb))
            for key in sorted(keys):
                print(' {!r}: {!r}'.format(key, source[key]))

    report('added', new.keys() - old.keys(), new)
    report('removed', old.keys() - new.keys(), old)

    modified = {
        (key, old[key], new[key])
        for key in old.keys() & new.keys()
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for triple in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(*triple))
if __name__ == '__main__':
    # without args print a diff between html.entities.html5 and new_html5
    # with --create print the new html5 dict
    # with --patch patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        with open(fname) as f1, open(temp_fname, 'w') as f2:
            skip = False
            for line in f1:
                if line.startswith(HTML5_SECTION_START):
                    # Emit the fresh section in place of the old one.
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # Swap the patched copy in with a single rename-over instead of
        # remove + rename, so there is no moment when entities.py is missing.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            # The patched file is entities.py (see fname above), not .html.
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))
def is_local_symbol_type(symtype):
    """Return True if an ``nm`` symbol type should be ignored.

    Ignored types are:
    - lowercase types other than the special global ones "u", "v" and "w"
      (lowercase usually means the symbol is local);
    - the initialized data section (d and D) and the BSS data section
      (b and B), e.g. "__bss_start (type: B)" and "_edata (type: D)".
    """
    lowercase_local = symtype.islower() and symtype not in "uvw"
    data_or_bss = symtype in "bBdD"
    return lowercase_local or data_or_bss
def check_extensions():
    """Check every built ``.so`` extension module for smelly symbols.

    The build directory is read from ``pybuilddir.txt``, looked up next to
    ``pyconfig.h``.  Each ``*.so`` file in it, except those whose name
    contains IGNORED_EXTENSION, is passed to check_library() with
    ``dynamic=True``.

    Returns the total number of smelly symbols found, or True when
    ``pybuilddir.txt`` does not exist.
    """
    print(__file__)
    # This assumes pybuilddir.txt is in same directory as pyconfig.h.
    # In the case of out-of-tree builds, we can't assume pybuilddir.txt is
    # in the source folder.
    config_dir = os.path.dirname(sysconfig.get_config_h_filename())
    filename = os.path.join(config_dir, "pybuilddir.txt")
    try:
        with open(filename, encoding="utf-8") as fp:
            # readline() keeps the line terminator; strip it so a trailing
            # newline in the file cannot end up in the directory path.
            pybuilddir = fp.readline().strip()
    except FileNotFoundError:
        print(f"Cannot check extensions because {filename} does not exist")
        # NOTE(review): main() adds this result to its symbol count, so True
        # counts as one smelly symbol and fails the run. Presumably
        # intentional; confirm before changing.
        return True

    print(f"Check extension modules from {pybuilddir} directory")
    builddir = os.path.join(config_dir, pybuilddir)
    nsymbol = 0
    for name in os.listdir(builddir):
        if not name.endswith(".so"):
            continue
        if IGNORED_EXTENSION in name:
            print()
            print(f"Ignore extension: {name}")
            continue

        print()
        filename = os.path.join(builddir, name)
        nsymbol += check_library(filename, dynamic=True)

    return nsymbol
raise Exception("failed to get LDLIBRARY variable from sysconfig") + if LDLIBRARY != LIBRARY: + print() + nsymbol += check_library(LDLIBRARY, dynamic=True) + + # Check extension modules like _ssl.cpython-310d-x86_64-linux-gnu.so + nsymbol += check_extensions() + + if nsymbol: + print() + print(f"ERROR: Found {nsymbol} smelly symbols in total!") + sys.exit(1) + + print() + print(f"OK: all exported symbols of all libraries " + f"are prefixed with {' or '.join(map(repr, ALLOWED_PREFIXES))}") + + +if __name__ == "__main__": + main() diff --git a/Tools/build/stable_abi.py b/Tools/build/stable_abi.py new file mode 100644 index 0000000..88db93e --- /dev/null +++ b/Tools/build/stable_abi.py @@ -0,0 +1,757 @@ +"""Check the stable ABI manifest or generate files from it + +By default, the tool only checks existing files/libraries. +Pass --generate to recreate auto-generated files instead. + +For actions that take a FILENAME, the filename can be left out to use a default +(relative to the manifest file, as they appear in the CPython codebase). +""" + +from functools import partial +from pathlib import Path +import dataclasses +import subprocess +import sysconfig +import argparse +import textwrap +import tomllib +import difflib +import pprint +import sys +import os +import os.path +import io +import re +import csv + +SCRIPT_NAME = 'Tools/build/stable_abi.py' +MISSING = object() + +EXCLUDED_HEADERS = { + "bytes_methods.h", + "cellobject.h", + "classobject.h", + "code.h", + "compile.h", + "datetime.h", + "dtoa.h", + "frameobject.h", + "genobject.h", + "longintrepr.h", + "parsetok.h", + "pyatomic.h", + "pytime.h", + "token.h", + "ucnhash.h", +} +MACOS = (sys.platform == "darwin") +UNIXY = MACOS or (sys.platform == "linux") # XXX should this be "not Windows"? + + +# The stable ABI manifest (Misc/stable_abi.toml) exists only to fill the +# following dataclasses. 
class Manifest:
    """Collection of `ABIItem`s forming the stable ABI/limited API."""

    def __init__(self):
        # Maps item name -> item, in insertion order.
        self.contents = {}

    def add(self, item):
        """Register *item*; raise ValueError if its name is already taken."""
        key = item.name
        if key in self.contents:
            # We assume that stable ABI items do not share names,
            # even if they're different kinds (e.g. function vs. macro).
            raise ValueError(f'duplicate ABI item {item.name}')
        self.contents[key] = item

    def select(self, kinds, *, include_abi_only=True, ifdef=None):
        """Yield selected items of the manifest, sorted by name

        kinds: set of requested kinds, e.g. {'function', 'macro'}
        include_abi_only: if True (default), include all items of the
            stable ABI.
            If False, include only items from the limited API
            (i.e. items people should use today)
        ifdef: set of feature macros (e.g. {'HAVE_FORK', 'MS_WINDOWS'}).
            If None (default), items are not filtered by this. (This is
            different from the empty set, which filters out all such
            conditional items.)
        """
        for key in sorted(self.contents):
            item = self.contents[key]
            if item.kind not in kinds:
                continue
            abi_only_excluded = item.abi_only and not include_abi_only
            if abi_only_excluded:
                continue
            if ifdef is None or item.ifdef is None or item.ifdef in ifdef:
                yield item

    def dump(self):
        """Yield lines to recreate the manifest file (sans comments/newlines)"""
        skipped = {'name', 'value', 'kind'}
        for item in self.contents.values():
            item_fields = dataclasses.fields(item)
            yield f"[{item.kind}.{item.name}]"
            for field in item_fields:
                if field.name in skipped:
                    continue
                value = getattr(item, field.name)
                if value == field.default:
                    # Defaults are implied and never written out.
                    continue
                if value is True:
                    yield f" {field.name} = true"
                elif value:
                    yield f" {field.name} = {value!r}"
# Registry of every function decorated with @generator, in definition order.
generators = []


def generator(var_name, default_path):
    """Decorates a file generator: function that writes to a file

    The decorated function gains three attributes -- ``var_name``,
    ``arg_name`` (``var_name`` as a ``--dashed-option``) and
    ``default_path`` -- and is appended to ``generators``.  The function
    itself is returned unchanged.
    """
    def _register(func):
        func.var_name = var_name
        func.arg_name = '--' + '-'.join(var_name.split('_'))
        func.default_path = default_path
        generators.append(func)
        return func
    return _register
#name ",DATA")) + """ + write(textwrap.dedent(content)) + + def sort_key(item): + return item.name.lower() + + windows_feature_macros = { + item.name for item in manifest.select({'feature_macro'}) if item.windows + } + for item in sorted( + manifest.select( + {'function'}, + include_abi_only=True, + ifdef=windows_feature_macros), + key=sort_key): + write(f'EXPORT_FUNC({item.name})') + + write() + + for item in sorted( + manifest.select( + {'data'}, + include_abi_only=True, + ifdef=windows_feature_macros), + key=sort_key): + write(f'EXPORT_DATA({item.name})') + +REST_ROLES = { + 'function': 'function', + 'data': 'var', + 'struct': 'type', + 'macro': 'macro', + # 'const': 'const', # all undocumented + 'typedef': 'type', +} + +@generator("doc_list", 'Doc/data/stable_abi.dat') +def gen_doc_annotations(manifest, args, outfile): + """Generate/check the stable ABI list for documentation annotations""" + writer = csv.DictWriter( + outfile, + ['role', 'name', 'added', 'ifdef_note', 'struct_abi_kind'], + lineterminator='\n') + writer.writeheader() + for item in manifest.select(REST_ROLES.keys(), include_abi_only=False): + if item.ifdef: + ifdef_note = manifest.contents[item.ifdef].doc + else: + ifdef_note = None + row = { + 'role': REST_ROLES[item.kind], + 'name': item.name, + 'added': item.added, + 'ifdef_note': ifdef_note} + rows = [row] + if item.kind == 'struct': + row['struct_abi_kind'] = item.struct_abi_kind + for member_name in item.members or (): + rows.append({ + 'role': 'member', + 'name': f'{item.name}.{member_name}', + 'added': item.added}) + writer.writerows(rows) + +@generator("ctypes_test", 'Lib/test/test_stable_abi_ctypes.py') +def gen_ctypes_test(manifest, args, outfile): + """Generate/check the ctypes-based test for exported symbols""" + write = partial(print, file=outfile) + write(textwrap.dedent(''' + # Generated by Tools/scripts/stable_abi.py + + """Test that all symbols of the Stable ABI are accessible using ctypes + """ + + import sys + import unittest 
+ from test.support.import_helper import import_module + from _testcapi import get_feature_macros + + feature_macros = get_feature_macros() + ctypes_test = import_module('ctypes') + + class TestStableABIAvailability(unittest.TestCase): + def test_available_symbols(self): + + for symbol_name in SYMBOL_NAMES: + with self.subTest(symbol_name): + ctypes_test.pythonapi[symbol_name] + + def test_feature_macros(self): + self.assertEqual( + set(get_feature_macros()), EXPECTED_FEATURE_MACROS) + + # The feature macros for Windows are used in creating the DLL + # definition, so they must be known on all platforms. + # If we are on Windows, we check that the hardcoded data matches + # the reality. + @unittest.skipIf(sys.platform != "win32", "Windows specific test") + def test_windows_feature_macros(self): + for name, value in WINDOWS_FEATURE_MACROS.items(): + if value != 'maybe': + with self.subTest(name): + self.assertEqual(feature_macros[name], value) + + SYMBOL_NAMES = ( + ''')) + items = manifest.select( + {'function', 'data'}, + include_abi_only=True, + ) + optional_items = {} + for item in items: + if item.name in ( + # Some symbols aren't exported on all platforms. 
@generator("testcapi_feature_macros", 'Modules/_testcapi_feature_macros.inc')
def gen_testcapi_feature_macros(manifest, args, outfile):
    """Generate/check the C code that reports Stable ABI feature macros"""
    # The docstring above doubles as the --help text for this generator's
    # command-line option, so it is kept to a single line.
    write = partial(print, file=outfile)
    # NOTE(review): this header still names Tools/scripts/ although the
    # script lives under Tools/build/.  Correcting it changes the generated
    # file, so it has to be done together with a regeneration.
    write('// Generated by Tools/scripts/stable_abi.py')
    write()
    write('// Add an entry in dict `result` for each Stable ABI feature macro.')
    write()
    for macro in manifest.select({'feature_macro'}):
        name = macro.name
        # One #ifdef/#else pair per macro: store Py_True when the macro is
        # defined at compile time, Py_False otherwise.
        write(f'#ifdef {name}')
        write(f'    res = PyDict_SetItemString(result, "{name}", Py_True);')
        write('#else')
        write(f'    res = PyDict_SetItemString(result, "{name}", Py_False);')
        write('#endif')
        # Emitted error path: drop the dict and return NULL if the
        # insertion failed.
        write('if (res) {')
        write('    Py_DECREF(result); return NULL;')
        write('}')
        write()
def do_unixy_check(manifest, args):
    """Check headers & library using "Unixy" tools (GCC/clang, binutils)"""
    # The docstring above doubles as the --help text for --unixy-check.
    okay = True

    # Get all macros first: we'll need feature macros like HAVE_FORK and
    # MS_WINDOWS for everything else
    present_macros = gcc_get_limited_api_macros(['Include/Python.h'])
    feature_macros = {m.name for m in manifest.select({'feature_macro'})}
    feature_macros &= present_macros

    # Check that we have all needed macros
    expected_macros = {item.name for item in manifest.select({'macro'})}
    missing_macros = expected_macros - present_macros
    okay &= _report_unexpected_items(
        missing_macros,
        'Some macros are not defined from "Include/Python.h" '
        + 'with Py_LIMITED_API:')

    expected_symbols = {
        item.name
        for item in manifest.select(
            {'function', 'data'}, include_abi_only=True, ifdef=feature_macros,
        )
    }

    # Check the static library (*.a)
    LIBRARY = sysconfig.get_config_var("LIBRARY")
    if not LIBRARY:
        raise Exception("failed to get LIBRARY variable from sysconfig")
    if os.path.exists(LIBRARY):
        okay &= binutils_check_library(
            manifest, LIBRARY, expected_symbols, dynamic=False)

    # Check the dynamic library (*.so)
    # NOTE(review): this also passes dynamic=False, and unlike the static
    # library there is no existence check -- presumably deliberate; confirm
    # before changing either.
    LDLIBRARY = sysconfig.get_config_var("LDLIBRARY")
    if not LDLIBRARY:
        raise Exception("failed to get LDLIBRARY variable from sysconfig")
    okay &= binutils_check_library(
        manifest, LDLIBRARY, expected_symbols, dynamic=False)

    # Check definitions in the header files
    expected_defs = {
        item.name
        for item in manifest.select(
            {'function', 'data'}, include_abi_only=False, ifdef=feature_macros,
        )
    }
    found_defs = gcc_get_limited_api_definitions(['Include/Python.h'])
    missing_defs = expected_defs - found_defs
    okay &= _report_unexpected_items(
        missing_defs,
        'Some expected declarations were not declared in '
        + '"Include/Python.h" with Py_LIMITED_API:')

    # Some Limited API macros are defined in terms of private symbols.
    # These are not part of Limited API (even though they're defined with
    # Py_LIMITED_API). They must be part of the Stable ABI, though.
    private_symbols = {n for n in expected_symbols if n.startswith('_')}
    extra_defs = found_defs - expected_defs - private_symbols
    okay &= _report_unexpected_items(
        extra_defs,
        'Some extra declarations were found in "Include/Python.h" '
        + 'with Py_LIMITED_API:')

    return okay
def gcc_get_limited_api_macros(headers):
    """Return the names of all macros defined by `headers` under the limited API.

    The C compiler named by sysconfig's "CC" is run in preprocess-only mode
    (-E) with -dM, so that it prints the macro definitions, and with
    Py_LIMITED_API set to the version of the running interpreter.  Every
    name following a "#define" in the output is collected into a set.

    Requires Python built with a GCC-compatible compiler. (clang might work)
    """
    major, minor = sys.version_info[:2]
    api_hexversion = (major << 24) | (minor << 16)

    command = sysconfig.get_config_var("CC").split()
    command += [
        "-DSIZEOF_WCHAR_T=4",  # The actual value is not important
        f"-DPy_LIMITED_API={api_hexversion}",
        "-I.",
        "-I./Include",
        "-dM",
        "-E",
    ]
    command.extend(str(header) for header in headers)

    macro_dump = subprocess.check_output(command, text=True)
    return set(re.findall(r"#define (\w+)", macro_dump))
def check_private_names(manifest):
    """Ensure limited API doesn't contain private names

    Names prefixed by an underscore are private by definition; they are
    only accepted when the item is marked `abi_only`.

    Raises ValueError for the first offending name found.
    """
    for name, item in manifest.contents.items():
        if name.startswith('_') and not item.abi_only:
            raise ValueError(
                f'`{name}` is private (underscore-prefixed) and should be '
                + 'removed from the stable ABI list or marked `abi_only`')
+ """ + dumped = tomllib.loads('\n'.join(manifest.dump())) + with filename.open('rb') as file: + from_file = tomllib.load(file) + if dumped != from_file: + print(f'Dump differs from loaded data!', file=sys.stderr) + diff = difflib.unified_diff( + pprint.pformat(dumped).splitlines(), + pprint.pformat(from_file).splitlines(), + '<dumped>', str(filename), + lineterm='', + ) + for line in diff: + print(line, file=sys.stderr) + return False + else: + return True + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "file", type=Path, metavar='FILE', + help="file with the stable abi manifest", + ) + parser.add_argument( + "--generate", action='store_true', + help="generate file(s), rather than just checking them", + ) + parser.add_argument( + "--generate-all", action='store_true', + help="as --generate, but generate all file(s) using default filenames." + + " (unlike --all, does not run any extra checks)", + ) + parser.add_argument( + "-a", "--all", action='store_true', + help="run all available checks using default filenames", + ) + parser.add_argument( + "-l", "--list", action='store_true', + help="list available generators and their default filenames; then exit", + ) + parser.add_argument( + "--dump", action='store_true', + help="dump the manifest contents (used for debugging the parser)", + ) + + actions_group = parser.add_argument_group('actions') + for gen in generators: + actions_group.add_argument( + gen.arg_name, dest=gen.var_name, + type=str, nargs="?", default=MISSING, + metavar='FILENAME', + help=gen.__doc__, + ) + actions_group.add_argument( + '--unixy-check', action='store_true', + help=do_unixy_check.__doc__, + ) + args = parser.parse_args() + + base_path = args.file.parent.parent + + if args.list: + for gen in generators: + print(f'{gen.arg_name}: {base_path / gen.default_path}') + sys.exit(0) + + run_all_generators = args.generate_all + + if 
args.generate_all: + args.generate = True + + if args.all: + run_all_generators = True + args.unixy_check = True + + try: + file = args.file.open('rb') + except FileNotFoundError as err: + if args.file.suffix == '.txt': + # Provide a better error message + suggestion = args.file.with_suffix('.toml') + raise FileNotFoundError( + f'{args.file} not found. Did you mean {suggestion} ?') from err + raise + with file: + manifest = parse_manifest(file) + + check_private_names(manifest) + + # Remember results of all actions (as booleans). + # At the end we'll check that at least one action was run, + # and also fail if any are false. + results = {} + + if args.dump: + for line in manifest.dump(): + print(line) + results['dump'] = check_dump(manifest, args.file) + + for gen in generators: + filename = getattr(args, gen.var_name) + if filename is None or (run_all_generators and filename is MISSING): + filename = base_path / gen.default_path + elif filename is MISSING: + continue + + results[gen.var_name] = generate_or_check(manifest, args, filename, gen) + + if args.unixy_check: + results['unixy_check'] = do_unixy_check(manifest, args) + + if not results: + if args.generate: + parser.error('No file specified. Use --help for usage.') + parser.error('No check specified. Use --help for usage.') + + failed_results = [name for name, result in results.items() if not result] + + if failed_results: + raise Exception(f""" + These checks related to the stable ABI did not succeed: + {', '.join(failed_results)} + + If you see diffs in the output, files derived from the stable + ABI manifest the were not regenerated. + Run `make regen-limited-abi` to fix this. + + Otherwise, see the error(s) above. + + The stable ABI manifest is at: {args.file} + Note that there is a process to follow when modifying it. 
# Implementation of marshal.loads() in pure Python

import ast

from typing import Any, Tuple


class Type:
    # Adapted from marshal.c
    NULL = ord('0')
    NONE = ord('N')
    FALSE = ord('F')
    TRUE = ord('T')
    STOPITER = ord('S')
    ELLIPSIS = ord('.')
    INT = ord('i')
    INT64 = ord('I')
    FLOAT = ord('f')
    BINARY_FLOAT = ord('g')
    COMPLEX = ord('x')
    BINARY_COMPLEX = ord('y')
    LONG = ord('l')
    STRING = ord('s')
    INTERNED = ord('t')
    REF = ord('r')
    TUPLE = ord('(')
    LIST = ord('[')
    DICT = ord('{')
    CODE = ord('c')
    UNICODE = ord('u')
    UNKNOWN = ord('?')
    SET = ord('<')
    FROZENSET = ord('>')
    ASCII = ord('a')
    ASCII_INTERNED = ord('A')
    SMALL_TUPLE = ord(')')
    SHORT_ASCII = ord('z')
    SHORT_ASCII_INTERNED = ord('Z')


FLAG_REF = 0x80  # with a type, add obj to index

NULL = object()  # marker

# Cell kinds
CO_FAST_LOCAL = 0x20
CO_FAST_CELL = 0x40
CO_FAST_FREE = 0x80


class Code:
    """Plain attribute bag standing in for a code object.

    The reader fills in the `co_*` attributes one by one; the properties
    below derive the classic name tuples from `co_localsplusnames` and
    `co_localspluskinds`.
    """

    def __init__(self, **kwds: Any):
        self.__dict__.update(kwds)

    def __repr__(self) -> str:
        return f"Code(**{self.__dict__})"

    co_localsplusnames: Tuple[str, ...]
    # NOTE(review): only iterated as a sequence of ints here; presumably a
    # bytes object in practice -- confirm.
    co_localspluskinds: Tuple[int, ...]

    def get_localsplus_names(self, select_kind: int) -> Tuple[str, ...]:
        """Return the names whose kind has any bit of `select_kind` set."""
        varnames: list[str] = []
        for name, kind in zip(self.co_localsplusnames,
                              self.co_localspluskinds):
            if kind & select_kind:
                varnames.append(name)
        return tuple(varnames)

    @property
    def co_varnames(self) -> Tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_LOCAL)

    @property
    def co_cellvars(self) -> Tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_CELL)

    @property
    def co_freevars(self) -> Tuple[str, ...]:
        return self.get_localsplus_names(CO_FAST_FREE)

    @property
    def co_nlocals(self) -> int:
        return len(self.co_varnames)


class Reader:
    # A fairly literal translation of the marshal reader.

    def __init__(self, data: bytes):
        self.data: bytes = data
        self.end: int = len(self.data)
        self.pos: int = 0
        self.refs: list[Any] = []
        self.level: int = 0

    def r_string(self, n: int) -> bytes:
        """Consume and return the next `n` bytes."""
        assert 0 <= n <= self.end - self.pos
        buf = self.data[self.pos : self.pos + n]
        self.pos += n
        return buf

    def r_byte(self) -> int:
        buf = self.r_string(1)
        return buf[0]

    def r_short(self) -> int:
        """Read a signed 16-bit little-endian integer."""
        return int.from_bytes(self.r_string(2), "little", signed=True)

    def r_long(self) -> int:
        """Read a signed 32-bit little-endian integer."""
        return int.from_bytes(self.r_string(4), "little", signed=True)

    def r_long64(self) -> int:
        """Read a signed 64-bit little-endian integer.

        All eight bytes contribute to the value (the previous hand-rolled
        version re-used byte 1 for the upper four bytes).
        """
        return int.from_bytes(self.r_string(8), "little", signed=True)

    def r_PyLong(self) -> int:
        """Read an arbitrary-size int stored as a count plus 15-bit digits.

        The sign of the count is the sign of the value.
        """
        n = self.r_long()
        size = abs(n)
        x = 0
        for i in range(size):
            x |= self.r_short() << i*15
        if n < 0:
            x = -x
        return x

    def r_float_bin(self) -> float:
        buf = self.r_string(8)
        import struct  # Lazy import to avoid breaking UNIX build
        # Explicitly little-endian, like the integer readers above; a bare
        # "d" would use the byte order of the machine running this script.
        return struct.unpack("<d", buf)[0]

    def r_float_str(self) -> float:
        n = self.r_byte()
        buf = self.r_string(n)
        return ast.literal_eval(buf.decode("ascii"))

    def r_ref_reserve(self, flag: int) -> int:
        """Reserve a slot in the reference table if `flag` is set.

        Used for objects that must be registered before their contents are
        read; the slot is filled in later by r_ref_insert().
        """
        if flag:
            idx = len(self.refs)
            self.refs.append(None)
            return idx
        else:
            return 0

    def r_ref_insert(self, obj: Any, idx: int, flag: int) -> Any:
        if flag:
            self.refs[idx] = obj
        return obj

    def r_ref(self, obj: Any, flag: int) -> Any:
        assert flag & FLAG_REF
        self.refs.append(obj)
        return obj

    def r_object(self) -> Any:
        """Read one object, restoring the nesting level afterwards."""
        old_level = self.level
        try:
            return self._r_object()
        finally:
            self.level = old_level

    def _r_object(self) -> Any:
        code = self.r_byte()
        flag = code & FLAG_REF
        type = code & ~FLAG_REF
        # print("  "*self.level + f"{code} {flag} {type} {chr(type)!r}")
        self.level += 1

        def R_REF(obj: Any) -> Any:
            if flag:
                obj = self.r_ref(obj, flag)
            return obj

        if type == Type.NULL:
            return NULL
        elif type == Type.NONE:
            return None
        elif type == Type.ELLIPSIS:
            return Ellipsis
        elif type == Type.FALSE:
            return False
        elif type == Type.TRUE:
            return True
        elif type == Type.STOPITER:
            # Declared in Type but previously not decoded.
            return StopIteration
        elif type == Type.INT:
            return R_REF(self.r_long())
        elif type == Type.INT64:
            return R_REF(self.r_long64())
        elif type == Type.LONG:
            return R_REF(self.r_PyLong())
        elif type == Type.FLOAT:
            return R_REF(self.r_float_str())
        elif type == Type.BINARY_FLOAT:
            return R_REF(self.r_float_bin())
        elif type == Type.COMPLEX:
            return R_REF(complex(self.r_float_str(),
                                 self.r_float_str()))
        elif type == Type.BINARY_COMPLEX:
            return R_REF(complex(self.r_float_bin(),
                                 self.r_float_bin()))
        elif type == Type.STRING:
            n = self.r_long()
            return R_REF(self.r_string(n))
        elif type == Type.ASCII_INTERNED or type == Type.ASCII:
            n = self.r_long()
            return R_REF(self.r_string(n).decode("ascii"))
        elif type == Type.SHORT_ASCII_INTERNED or type == Type.SHORT_ASCII:
            n = self.r_byte()
            return R_REF(self.r_string(n).decode("ascii"))
        elif type == Type.INTERNED or type == Type.UNICODE:
            n = self.r_long()
            return R_REF(self.r_string(n).decode("utf8", "surrogatepass"))
        elif type == Type.SMALL_TUPLE:
            n = self.r_byte()
            # Tuples are immutable: reserve the ref slot first, fill it in
            # once the elements have been read.
            idx = self.r_ref_reserve(flag)
            retval: Any = tuple(self.r_object() for _ in range(n))
            self.r_ref_insert(retval, idx, flag)
            return retval
        elif type == Type.TUPLE:
            n = self.r_long()
            idx = self.r_ref_reserve(flag)
            retval = tuple(self.r_object() for _ in range(n))
            self.r_ref_insert(retval, idx, flag)
            return retval
        elif type == Type.LIST:
            n = self.r_long()
            retval = R_REF([])
            for _ in range(n):
                retval.append(self.r_object())
            return retval
        elif type == Type.DICT:
            retval = R_REF({})
            while True:
                key = self.r_object()
                # The NULL marker terminates the dict; compare by identity
                # so no key's __eq__ is involved.
                if key is NULL:
                    break
                val = self.r_object()
                retval[key] = val
            return retval
        elif type == Type.SET:
            n = self.r_long()
            retval = R_REF(set())
            for _ in range(n):
                v = self.r_object()
                retval.add(v)
            return retval
        elif type == Type.FROZENSET:
            n = self.r_long()
            s: set[Any] = set()
            idx = self.r_ref_reserve(flag)
            for _ in range(n):
                v = self.r_object()
                s.add(v)
            retval = frozenset(s)
            self.r_ref_insert(retval, idx, flag)
            return retval
        elif type == Type.CODE:
            retval = R_REF(Code())
            retval.co_argcount = self.r_long()
            retval.co_posonlyargcount = self.r_long()
            retval.co_kwonlyargcount = self.r_long()
            retval.co_stacksize = self.r_long()
            retval.co_flags = self.r_long()
            retval.co_code = self.r_object()
            retval.co_consts = self.r_object()
            retval.co_names = self.r_object()
            retval.co_localsplusnames = self.r_object()
            retval.co_localspluskinds = self.r_object()
            retval.co_filename = self.r_object()
            retval.co_name = self.r_object()
            retval.co_qualname = self.r_object()
            retval.co_firstlineno = self.r_long()
            retval.co_linetable = self.r_object()
            retval.co_exceptiontable = self.r_object()
            return retval
        elif type == Type.REF:
            n = self.r_long()
            retval = self.refs[n]
            assert retval is not None
            return retval
        else:
            # Fail loudly instead of dropping into the debugger: this code
            # runs from non-interactive builds.
            raise AssertionError(f"Unknown type {type} {chr(type)!r}")


def loads(data: bytes) -> Any:
    """Decode one marshalled object from `data` and return it."""
    assert isinstance(data, bytes)
    r = Reader(data)
    return r.r_object()
@contextlib.contextmanager
def updating_file_with_tmpfile(filename, tmpfile=None):
    """Context manager that rewrites `filename` by way of a temp file.

    Yields a pair of open text files: `filename` for reading and the temp
    file for writing.  The temp file is written with the same line ending
    as the first line of `filename`.

    On normal exit both files are closed and update_file_with_tmpfile()
    moves the temp file into place, or discards it if nothing changed.

    `tmpfile` may be omitted (then `filename` + '.tmp' is used) or may name
    a directory in which to create the temp file.
    """
    # XXX Optionally use tempfile.TemporaryFile?
    if not tmpfile:
        tmpfile = filename + '.tmp'
    elif os.path.isdir(tmpfile):
        tmpfile = os.path.join(tmpfile, filename + '.tmp')

    # Sniff the line ending from the first line of the existing file.
    with open(filename, 'rb') as probe:
        first_line = probe.readline()

    for ending in (b'\r\n', b'\r', b'\n'):
        if first_line.endswith(ending):
            newline = ending.decode('ascii')
            break
    else:
        raise ValueError(f"unknown end of line: {filename}: {first_line!a}")

    with open(tmpfile, 'w', newline=newline) as outfile, \
            open(filename) as infile:
        yield infile, outfile
    update_file_with_tmpfile(filename, tmpfile)


def update_file_with_tmpfile(filename, tmpfile, *, create=False):
    """Replace `filename` with `tmpfile` only if their contents differ.

    Returns 'created' when `filename` did not exist and `create` is true,
    'updated' when the contents differed and the file was replaced, and
    'same' when they were identical (the temp file is then removed).

    If `filename` does not exist and `create` is false, the
    FileNotFoundError propagates.
    """
    try:
        target = open(filename, 'rb')
    except FileNotFoundError:
        if create:
            os.replace(tmpfile, filename)
            return 'created'
        raise

    with target:
        current = target.read()
    with open(tmpfile, 'rb') as candidate:
        proposed = candidate.read()

    if current == proposed:
        os.unlink(tmpfile)
        return 'same'

    os.replace(tmpfile, filename)
    return 'updated'
def verify_wheel(package_name: str) -> bool:
    """Check the bundled wheel for *package_name* against PyPI's SHA-256.

    The wheel is looked up in WHEEL_DIR, its version is read from the
    ensurepip ``__init__.py`` text, and the digest PyPI publishes for the
    file of the same name is compared with the digest of the file on disk.

    Returns True if the digests match.  Every failure (no wheel, no
    version, metadata unavailable or malformed, release or file unknown to
    PyPI, digest mismatch) is reported with print_error() and yields False.
    """
    # Find the package on disk
    package_path = next(WHEEL_DIR.glob(f"{package_name}*.whl"), None)
    if not package_path:
        print_error("", f"Could not find a {package_name} wheel on disk.")
        return False

    print(f"Verifying checksum for {package_path}.")

    # Find the version of the package used by ensurepip.
    # The name is escaped so that it can only ever match literally.
    package_version_match = re.search(
        f'_{re.escape(package_name.upper())}_VERSION = "([^"]+)',
        ENSURE_PIP_INIT_PY_TEXT,
    )
    if not package_version_match:
        print_error(
            package_path,
            f"No {package_name} version found in Lib/ensurepip/__init__.py.",
        )
        return False
    package_version = package_version_match[1]

    # Get the SHA 256 digest from the Cheeseshop
    try:
        # Close the HTTP response as soon as the body has been read.
        with urlopen(f"https://pypi.org/pypi/{package_name}/json") as response:
            raw_text = response.read()
    except (OSError, ValueError):
        print_error(package_path, f"Could not fetch JSON metadata for {package_name}.")
        return False

    # Malformed JSON (ValueError) or a version PyPI does not list
    # (KeyError/TypeError) is a verification failure, not a crash.
    try:
        release_files = json.loads(raw_text)["releases"][package_version]
    except (ValueError, KeyError, TypeError):
        print_error(
            package_path,
            f"No {package_name} {package_version} release found in the "
            "JSON metadata from PyPI.",
        )
        return False

    for release_info in release_files:
        if package_path.name != release_info["filename"]:
            continue
        expected_digest = release_info["digests"].get("sha256", "")
        break
    else:
        print_error(package_path, f"No digest for {package_name} found from PyPI.")
        return False

    # Compute the SHA 256 digest of the wheel on disk
    actual_digest = hashlib.sha256(package_path.read_bytes()).hexdigest()

    print(f"Expected digest: {expected_digest}")
    print(f"Actual digest: {actual_digest}")

    if actual_digest != expected_digest:
        print_error(
            package_path, f"Failed to verify the checksum of the {package_name} wheel."
        )
        return False

    print_notice(
        package_path,
        f"Successfully verified the checksum of the {package_name} wheel.",
    )
    return True