diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2022-02-08 20:39:07 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-08 20:39:07 (GMT) |
commit | 81c72044a181dbbfbf689d7a977d0d99090f26a8 (patch) | |
tree | 14329746bd6f179cf2ae7c9818e1ae881eb46360 /Tools | |
parent | c018d3037b5b62e6d48d5985d1a37b91762fbffb (diff) | |
download | cpython-81c72044a181dbbfbf689d7a977d0d99090f26a8.zip cpython-81c72044a181dbbfbf689d7a977d0d99090f26a8.tar.gz cpython-81c72044a181dbbfbf689d7a977d0d99090f26a8.tar.bz2 |
bpo-46541: Replace core use of _Py_IDENTIFIER() with statically initialized global objects. (gh-30928)
We're no longer using _Py_IDENTIFIER() (or _Py_static_string()) in any core CPython code. It is still used in a number of non-builtin stdlib modules.
The replacement is: PyUnicodeObject (not pointer) fields under _PyRuntimeState, statically initialized as part of _PyRuntime. A new _Py_GET_GLOBAL_IDENTIFIER() macro facilitates lookup of the fields (along with _Py_GET_GLOBAL_STRING() for non-identifier strings).
https://bugs.python.org/issue46541#msg411799 explains the rationale for this change.
The core of the change is in:
* (new) Include/internal/pycore_global_strings.h - the declarations for the global strings, along with the macros
* Include/internal/pycore_runtime_init.h - added the static initializers for the global strings
* Include/internal/pycore_global_objects.h - where the struct in pycore_global_strings.h is hooked into _PyRuntimeState
* Tools/scripts/generate_global_objects.py - added generation of the global string declarations and static initializers
I've also added a --check flag to generate_global_objects.py (along with make check-global-objects) to check for unused global strings. That check is added to the PR CI config.
The remainder of this change updates the core code to use _Py_GET_GLOBAL_IDENTIFIER() instead of _Py_IDENTIFIER() and the related _Py*Id functions (likewise for _Py_GET_GLOBAL_STRING() instead of _Py_static_string()). This includes adding a few functions where there wasn't already an alternative to _Py*Id(), replacing the _Py_Identifier * parameter with PyObject *.
The following are not done (yet):
* stop using _Py_IDENTIFIER() in the stdlib modules
* (maybe) get rid of _Py_IDENTIFIER(), etc. entirely -- this may not be doable as at least one package on PyPI is using this (private) API
* (maybe) intern the strings during runtime init
https://bugs.python.org/issue46541
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/c-analyzer/TODO | 2 | ||||
-rw-r--r-- | Tools/scripts/generate_global_objects.py | 495 |
2 files changed, 486 insertions, 11 deletions
diff --git a/Tools/c-analyzer/TODO b/Tools/c-analyzer/TODO index 4b9b285..55338eb 100644 --- a/Tools/c-analyzer/TODO +++ b/Tools/c-analyzer/TODO @@ -156,8 +156,6 @@ Objects/codeobject.c:PyCode_NewEmpty():emptystring static PyObject Objects/exceptions.c:_check_for_legacy_statements():exec_prefix static PyObject *exec_prefix Objects/exceptions.c:_check_for_legacy_statements():print_prefix static PyObject *print_prefix Objects/funcobject.c:PyFunction_NewWithQualName():__name__ static PyObject *__name__ -Objects/typeobject.c:object___reduce_ex___impl():objreduce static PyObject *objreduce -Objects/typeobject.c:resolve_slotdups():pname static PyObject *pname Objects/unicodeobject.c:unicode_empty static PyObject *unicode_empty Objects/unicodeobject.c:unicode_latin1 static PyObject *unicode_latin1[256] Python/_warnings.c:is_internal_frame():bootstrap_string static PyObject *bootstrap_string diff --git a/Tools/scripts/generate_global_objects.py b/Tools/scripts/generate_global_objects.py index a06d201..b184e74 100644 --- a/Tools/scripts/generate_global_objects.py +++ b/Tools/scripts/generate_global_objects.py @@ -1,9 +1,7 @@ -import argparse -import ast -import builtins -import collections import contextlib +import glob import os.path +import re import sys @@ -12,6 +10,298 @@ ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) INTERNAL = os.path.join(ROOT, 'Include', 'internal') +STRING_LITERALS = { + 'empty': '', + 'dot': '.', + 'comma_sep': ', ', + 'percent': '%', + 'dbl_percent': '%%', + + '"anonymous" labels': None, + 'anon_dictcomp': '<dictcomp>', + 'anon_genexpr': '<genexpr>', + 'anon_lambda': '<lambda>', + 'anon_listcomp': '<listcomp>', + 'anon_module': '<module>', + 'anon_setcomp': '<setcomp>', + 'anon_string': '<string>', + 'dot_locals': '.<locals>', +} +IDENTIFIERS = [ + 'Py_Repr', + 'TextIOWrapper', + 'WarningMessage', + '_', + '__IOBase_closed', + '__abc_tpflags__', + '__abs__', + '__abstractmethods__', + '__add__', + '__aenter__', + '__aexit__', 
+ '__aiter__', + '__all__', + '__and__', + '__anext__', + '__annotations__', + '__args__', + '__await__', + '__bases__', + '__bool__', + '__build_class__', + '__builtins__', + '__bytes__', + '__call__', + '__cantrace__', + '__class__', + '__class_getitem__', + '__classcell__', + '__complex__', + '__contains__', + '__copy__', + '__del__', + '__delattr__', + '__delete__', + '__delitem__', + '__dict__', + '__dir__', + '__divmod__', + '__doc__', + '__enter__', + '__eq__', + '__exit__', + '__file__', + '__float__', + '__floordiv__', + '__format__', + '__fspath__', + '__ge__', + '__get__', + '__getattr__', + '__getattribute__', + '__getinitargs__', + '__getitem__', + '__getnewargs__', + '__getnewargs_ex__', + '__getstate__', + '__gt__', + '__hash__', + '__iadd__', + '__iand__', + '__ifloordiv__', + '__ilshift__', + '__imatmul__', + '__imod__', + '__import__', + '__imul__', + '__index__', + '__init__', + '__init_subclass__', + '__instancecheck__', + '__int__', + '__invert__', + '__ior__', + '__ipow__', + '__irshift__', + '__isabstractmethod__', + '__isub__', + '__iter__', + '__itruediv__', + '__ixor__', + '__le__', + '__len__', + '__length_hint__', + '__loader__', + '__lshift__', + '__lt__', + '__ltrace__', + '__main__', + '__matmul__', + '__missing__', + '__mod__', + '__module__', + '__mro_entries__', + '__mul__', + '__name__', + '__ne__', + '__neg__', + '__new__', + '__newobj__', + '__newobj_ex__', + '__next__', + '__note__', + '__or__', + '__origin__', + '__package__', + '__parameters__', + '__path__', + '__pos__', + '__pow__', + '__prepare__', + '__qualname__', + '__radd__', + '__rand__', + '__rdivmod__', + '__reduce__', + '__reduce_ex__', + '__repr__', + '__reversed__', + '__rfloordiv__', + '__rlshift__', + '__rmatmul__', + '__rmod__', + '__rmul__', + '__ror__', + '__round__', + '__rpow__', + '__rrshift__', + '__rshift__', + '__rsub__', + '__rtruediv__', + '__rxor__', + '__set__', + '__set_name__', + '__setattr__', + '__setitem__', + '__setstate__', + '__sizeof__', + 
'__slotnames__', + '__slots__', + '__spec__', + '__str__', + '__sub__', + '__subclasscheck__', + '__subclasshook__', + '__truediv__', + '__trunc__', + '__warningregistry__', + '__weakref__', + '__xor__', + '_abc_impl', + '_blksize', + '_dealloc_warn', + '_finalizing', + '_find_and_load', + '_fix_up_module', + '_get_sourcefile', + '_handle_fromlist', + '_initializing', + '_is_text_encoding', + '_lock_unlock_module', + '_showwarnmsg', + '_shutdown', + '_slotnames', + '_strptime_time', + '_uninitialized_submodules', + '_warn_unawaited_coroutine', + '_xoptions', + 'add', + 'append', + 'big', + 'buffer', + 'builtins', + 'clear', + 'close', + 'code', + 'copy', + 'copyreg', + 'decode', + 'default', + 'defaultaction', + 'difference_update', + 'dispatch_table', + 'displayhook', + 'enable', + 'encoding', + 'end_lineno', + 'end_offset', + 'errors', + 'excepthook', + 'extend', + 'filename', + 'fileno', + 'fillvalue', + 'filters', + 'find_class', + 'flush', + 'get', + 'get_source', + 'getattr', + 'ignore', + 'importlib', + 'intersection', + 'isatty', + 'items', + 'iter', + 'keys', + 'last_traceback', + 'last_type', + 'last_value', + 'latin1', + 'lineno', + 'little', + 'match', + 'metaclass', + 'mode', + 'modules', + 'mro', + 'msg', + 'n_fields', + 'n_sequence_fields', + 'n_unnamed_fields', + 'name', + 'obj', + 'offset', + 'onceregistry', + 'open', + 'parent', + 'partial', + 'path', + 'peek', + 'persistent_id', + 'persistent_load', + 'print_file_and_line', + 'ps1', + 'ps2', + 'raw', + 'read', + 'read1', + 'readable', + 'readall', + 'readinto', + 'readinto1', + 'readline', + 'reducer_override', + 'reload', + 'replace', + 'reset', + 'return', + 'reversed', + 'seek', + 'seekable', + 'send', + 'setstate', + 'sort', + 'stderr', + 'stdin', + 'stdout', + 'strict', + 'symmetric_difference_update', + 'tell', + 'text', + 'threading', + 'throw', + 'unraisablehook', + 'values', + 'version', + 'warnings', + 'warnoptions', + 'writable', + 'write', + 'zipimporter', +] + + 
####################################### # helpers @@ -64,6 +354,38 @@ START = '/* The following is auto-generated by Tools/scripts/generate_global_obj END = '/* End auto-generated code */' +def generate_global_strings(): + filename = os.path.join(INTERNAL, 'pycore_global_strings.h') + + # Read the non-generated part of the file. + with open(filename) as infile: + before = ''.join(iter_to_marker(infile, START))[:-1] + for _ in iter_to_marker(infile, END): + pass + after = infile.read()[:-1] + + # Generate the file. + with open(filename, 'w', encoding='utf-8') as outfile: + printer = Printer(outfile) + printer.write(before) + printer.write(START) + with printer.block('struct _Py_global_strings', ';'): + with printer.block('struct', ' literals;'): + for name, literal in STRING_LITERALS.items(): + if literal is None: + outfile.write('\n') + printer.write(f'// {name}') + else: + printer.write(f'STRUCT_FOR_STR({name}, "{literal}")') + outfile.write('\n') + with printer.block('struct', ' identifiers;'): + for name in sorted(IDENTIFIERS): + assert name.isidentifier(), name + printer.write(f'STRUCT_FOR_ID({name})') + printer.write(END) + printer.write(after) + + def generate_runtime_init(): # First get some info from the declarations. nsmallposints = None @@ -106,19 +428,174 @@ def generate_runtime_init(): with printer.block('.bytes_characters =', ','): for i in range(256): printer.write(f'_PyBytes_CHAR_INIT({i}),') + printer.write('') + # Global strings. 
+ with printer.block('.strings =', ','): + with printer.block('.literals =', ','): + for name, literal in STRING_LITERALS.items(): + if literal is None: + printer.write('') + else: + printer.write(f'INIT_STR({name}, "{literal}"),') + with printer.block('.identifiers =', ','): + for name in sorted(IDENTIFIERS): + assert name.isidentifier(), name + printer.write(f'INIT_ID({name}),') printer.write(END) printer.write(after) ####################################### +# checks + +def err(msg): + print(msg, file=sys.stderr) + + +GETTER_RE = re.compile(r''' + ^ + .*? + (?: + (?: + _Py_ID + [(] + ( \w+ ) # <identifier> + [)] + ) + | + (?: + _Py_STR + [(] + ( \w+ ) # <literal> + [)] + ) + ) +''', re.VERBOSE) +TYPESLOTS_RE = re.compile(r''' + ^ + .*? + (?: + (?: + SLOT0 [(] .*?, \s* + ( \w+ ) # <slot0> + [)] + ) + | + (?: + SLOT1 [(] .*?, \s* + ( \w+ ) # <slot1> + , .* [)] + ) + | + (?: + SLOT1BIN [(] .*?, .*?, \s* + ( \w+ ) # <slot1bin> + , \s* + ( \w+ ) # <reverse> + [)] + ) + | + (?: + SLOT1BINFULL [(] .*?, .*?, .*?, \s* + ( \w+ ) # <slot1binfull> + , \s* + ( \w+ ) # <fullreverse> + [)] + ) + | + ( SLOT \d .* [^)] $ ) # <wrapped> + ) +''', re.VERBOSE) + +def check_orphan_strings(): + literals = set(n for n, s in STRING_LITERALS.items() if s) + identifiers = set(IDENTIFIERS) + files = glob.iglob(os.path.join(ROOT, '**', '*.[ch]'), recursive=True) + for i, filename in enumerate(files, start=1): + print('.', end='') + if i % 5 == 0: + print(' ', end='') + if i % 20 == 0: + print() + if i % 100 == 0: + print() + with open(filename) as infile: + wrapped = None + for line in infile: + identifier = literal = reverse = None + + line = line.splitlines()[0] + if wrapped: + line = f'{wrapped.rstrip()} {line}' + wrapped = None + + if os.path.basename(filename) == '_warnings.c': + m = re.match(r'^.* = GET_WARNINGS_ATTR[(][^,]*, (\w+),', line) + if m: + identifier, = m.groups() + elif os.path.basename(filename) == 'typeobject.c': + m = TYPESLOTS_RE.match(line) + if m: + (slot0, + slot1, + 
slot1bin, reverse, + slot1binfull, fullreverse, + wrapped, + ) = m.groups() + identifier = slot0 or slot1 or slot1bin or slot1binfull + reverse = reverse or fullreverse + + if not identifier and not literal: + m = GETTER_RE.match(line) + if not m: + continue + identifier, literal = m.groups() + + if literal: + if literals and literal in literals: + literals.remove(literal) + if identifier: + if identifiers and identifier in identifiers: + identifiers.remove(identifier) + if reverse: + if identifiers and reverse in identifiers: + identifiers.remove(reverse) + if not literals and not identifiers: + break + else: + continue + break + if i % 20: + print() + if not literals and not identifiers: + return + print('ERROR:', file=sys.stderr) + if literals: + err(' unused global string literals:') + for name in sorted(literals): + err(f' {name}') + if identifiers: + if literals: + print() + err(' unused global identifiers:') + for name in sorted(identifiers): + err(f' {name}') + + +####################################### # the script -def main() -> None: +def main(*, check=False) -> None: + generate_global_strings() generate_runtime_init() + if check: + check_orphan_strings() + if __name__ == '__main__': - argv = sys.argv[1:] - if argv: - sys.exit(f'ERROR: got unexpected args {argv}') - main() + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--check', action='store_true') + args = parser.parse_args() + main(**vars(args)) |