diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2019-10-19 02:00:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-19 02:00:04 (GMT) |
commit | e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1 (patch) | |
tree | 071224bbded262901b9742eb82c5d82d2f744fe1 /Tools/c-analyzer/cpython | |
parent | ea55c51bd937f6019c35b39b87029644e469c059 (diff) | |
download | cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.zip cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.gz cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.bz2 |
bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It also is preparation for getting the variables from the source (cross-platform) rather than from the symbols.
The change only touches the tool (and its tests).
Diffstat (limited to 'Tools/c-analyzer/cpython')
-rw-r--r-- | Tools/c-analyzer/cpython/README | 72 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/__init__.py | 29 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/__main__.py | 212 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/_generate.py | 329 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/files.py | 29 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/find.py | 101 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/known.py | 66 | ||||
-rw-r--r-- | Tools/c-analyzer/cpython/supported.py | 398 |
8 files changed, 1236 insertions, 0 deletions
diff --git a/Tools/c-analyzer/cpython/README b/Tools/c-analyzer/cpython/README new file mode 100644 index 0000000..772b8be --- /dev/null +++ b/Tools/c-analyzer/cpython/README @@ -0,0 +1,72 @@ +####################################### +# C Globals and CPython Runtime State. + +CPython's C code makes extensive use of global variables (whether static +globals or static locals). Each such variable falls into one of several +categories: + +* strictly const data +* used exclusively in main or in the REPL +* process-global state (e.g. managing process-level resources + like signals and file descriptors) +* Python "global" runtime state +* per-interpreter runtime state + +The last one can be a problem as soon as anyone creates a second +interpreter (AKA "subinterpreter") in a process. It is definitely a +problem under subinterpreters if they are no longer sharing the GIL, +since the GIL protects us from a lot of race conditions. Keep in mind +that ultimately *all* objects (PyObject) should be treated as +per-interpreter state. This includes "static types", freelists, +_PyIdentifier, and singletons. Take that in for a second. It has +significant implications on where we use static variables! + +Be aware that module-global state (stored in C statics) is a kind of +per-interpreter state. There have been efforts across many years, and +still going, to provide extension module authors mechanisms to store +that state safely (see PEPs 3121, 489, etc.). + +(Note that there has been discussion around support for running multiple +Python runtimes in the same process. That would end up with the same +problems, relative to static variables, that subinterpreters have.) + +Historically we have been bad at keeping per-interpreter state out of +static variables, mostly because until recently subinterpreters were +not widely used nor even factored into solutions. However, the +feature is growing in popularity and use in the community. 
+ +Mandate: "Eliminate use of static variables for per-interpreter state." + +The "c-statics.py" script in this directory, along with its accompanying +data files, is part of the effort to resolve existing problems with +our use of static variables and to prevent future problems. + +#------------------------- +## statics for actually-global state (and runtime state consolidation) + +In general, holding any kind of state in static variables +increases maintenance burden and increases the complexity of code (e.g. +we use TSS to identify the active thread state). So it is a good idea +to avoid using statics for state even for the "global" runtime or +for process-global state. + +Relative to maintenance burden, one problem is where the runtime +state is spread throughout the codebase in dozens of individual +globals. Unlike the other globals, the runtime state represents a set +of values that are constantly shifting in a complex way. When they are +spread out it's harder to get a clear picture of what the runtime +involves. Furthermore, when they are spread out it complicates efforts +that change the runtime. + +Consequently, the globals for Python's runtime state have been +consolidated under a single top-level _PyRuntime global. No new globals +should be added for runtime state. Instead, they should be added to +_PyRuntimeState or one of its sub-structs. The tools in this directory +are run as part of the test suite to ensure that no new globals have +been added. The script can be run manually as well: + + ./python Lib/test/test_c_statics/c-statics.py check + +If it reports any globals then they should be resolved. If the globals +are runtime state then they should be folded into _PyRuntimeState. +Otherwise they should be marked as ignored. 
diff --git a/Tools/c-analyzer/cpython/__init__.py b/Tools/c-analyzer/cpython/__init__.py new file mode 100644 index 0000000..ae45b42 --- /dev/null +++ b/Tools/c-analyzer/cpython/__init__.py @@ -0,0 +1,29 @@ +import os.path +import sys + + +TOOL_ROOT = os.path.abspath( + os.path.dirname( # c-analyzer/ + os.path.dirname(__file__))) # cpython/ +DATA_DIR = TOOL_ROOT +REPO_ROOT = ( + os.path.dirname( # .. + os.path.dirname(TOOL_ROOT))) # Tools/ + +INCLUDE_DIRS = [os.path.join(REPO_ROOT, name) for name in [ + 'Include', + ]] +SOURCE_DIRS = [os.path.join(REPO_ROOT, name) for name in [ + 'Python', + 'Parser', + 'Objects', + 'Modules', + ]] + +#PYTHON = os.path.join(REPO_ROOT, 'python') +PYTHON = sys.executable + + +# Clean up the namespace. +del sys +del os diff --git a/Tools/c-analyzer/cpython/__main__.py b/Tools/c-analyzer/cpython/__main__.py new file mode 100644 index 0000000..6b0f9bc --- /dev/null +++ b/Tools/c-analyzer/cpython/__main__.py @@ -0,0 +1,212 @@ +import argparse +import re +import sys + +from c_analyzer.common import show +from c_analyzer.common.info import UNKNOWN + +from . 
import SOURCE_DIRS +from .find import supported_vars +from .known import ( + from_file as known_from_file, + DATA_FILE as KNOWN_FILE, + ) +from .supported import IGNORED_FILE + + +def _check_results(unknown, knownvars, used): + def _match_unused_global(variable): + found = [] + for varid in knownvars: + if varid in used: + continue + if varid.funcname is not None: + continue + if varid.name != variable.name: + continue + if variable.filename and variable.filename != UNKNOWN: + if variable.filename == varid.filename: + found.append(varid) + else: + found.append(varid) + return found + + badknown = set() + for variable in sorted(unknown): + msg = None + if variable.funcname != UNKNOWN: + msg = f'could not find global symbol {variable.id}' + elif m := _match_unused_global(variable): + assert isinstance(m, list) + badknown.update(m) + elif variable.name in ('completed', 'id'): # XXX Figure out where these variables are. + unknown.remove(variable) + else: + msg = f'could not find local symbol {variable.id}' + if msg: + #raise Exception(msg) + print(msg) + if badknown: + print('---') + print(f'{len(badknown)} globals in known.tsv, but may actually be local:') + for varid in sorted(badknown): + print(f'{varid.filename:30} {varid.name}') + unused = sorted(varid + for varid in set(knownvars) - used + if varid.name != 'id') # XXX Figure out where these variables are. + if unused: + print('---') + print(f'did not use {len(unused)} known vars:') + for varid in unused: + print(f'{varid.filename:30} {varid.funcname or "-":20} {varid.name}') + raise Exception('not all known symbols used') + if unknown: + print('---') + raise Exception('could not find all symbols') + + +# XXX Move this check to its own command. 
+def cmd_check_cache(cmd, *, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + _known_from_file=known_from_file, + _find=supported_vars, + ): + known = _known_from_file(known) + + used = set() + unknown = set() + for var, supported in _find(known=known, ignored=ignored): + if supported is None: + unknown.add(var) + continue + used.add(var.id) + _check_results(unknown, known['variables'], used) + + +def cmd_check(cmd, *, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + _find=supported_vars, + _show=show.basic, + _print=print, + ): + """ + Fail if there are unsupported globals variables. + + In the failure case, the list of unsupported variables + will be printed out. + """ + unsupported = [] + for var, supported in _find(known=known, ignored=ignored): + if not supported: + unsupported.append(var) + + if not unsupported: + #_print('okay') + return + + _print('ERROR: found unsupported global variables') + _print() + _show(sorted(unsupported)) + _print(f' ({len(unsupported)} total)') + sys.exit(1) + + +def cmd_show(cmd, *, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + skip_objects=False, + _find=supported_vars, + _show=show.basic, + _print=print, + ): + """ + Print out the list of found global variables. + + The variables will be distinguished as "supported" or "unsupported". 
+ """ + allsupported = [] + allunsupported = [] + for found, supported in _find(known=known, + ignored=ignored, + skip_objects=skip_objects, + ): + if supported is None: + continue + (allsupported if supported else allunsupported + ).append(found) + + _print('supported:') + _print('----------') + _show(sorted(allsupported)) + _print(f' ({len(allsupported)} total)') + _print() + _print('unsupported:') + _print('------------') + _show(sorted(allunsupported)) + _print(f' ({len(allunsupported)} total)') + + +############################# +# the script + +COMMANDS = { + 'check': cmd_check, + 'show': cmd_show, + } + +PROG = sys.argv[0] +PROG = 'c-globals.py' + + +def parse_args(prog=PROG, argv=sys.argv[1:], *, _fail=None): + common = argparse.ArgumentParser(add_help=False) + common.add_argument('--ignored', metavar='FILE', + default=IGNORED_FILE, + help='path to file that lists ignored vars') + common.add_argument('--known', metavar='FILE', + default=KNOWN_FILE, + help='path to file that lists known types') + #common.add_argument('dirs', metavar='DIR', nargs='*', + # default=SOURCE_DIRS, + # help='a directory to check') + + parser = argparse.ArgumentParser( + prog=prog, + ) + subs = parser.add_subparsers(dest='cmd') + + check = subs.add_parser('check', parents=[common]) + + show = subs.add_parser('show', parents=[common]) + show.add_argument('--skip-objects', action='store_true') + + if _fail is None: + def _fail(msg): + parser.error(msg) + + # Now parse the args. 
+ args = parser.parse_args(argv) + ns = vars(args) + + cmd = ns.pop('cmd') + if not cmd: + _fail('missing command') + + return cmd, ns + + +def main(cmd, cmdkwargs=None, *, _COMMANDS=COMMANDS): + try: + cmdfunc = _COMMANDS[cmd] + except KeyError: + raise ValueError( + f'unsupported cmd {cmd!r}' if cmd else 'missing cmd') + + cmdfunc(cmd, **cmdkwargs or {}) + + +if __name__ == '__main__': + cmd, cmdkwargs = parse_args() + main(cmd, cmdkwargs) diff --git a/Tools/c-analyzer/cpython/_generate.py b/Tools/c-analyzer/cpython/_generate.py new file mode 100644 index 0000000..4c340ac --- /dev/null +++ b/Tools/c-analyzer/cpython/_generate.py @@ -0,0 +1,329 @@ +# The code here consists of hacks for pre-populating the known.tsv file. + +from c_analyzer.parser.preprocessor import _iter_clean_lines +from c_analyzer.parser.naive import ( + iter_variables, parse_variable_declaration, find_variables, + ) +from c_analyzer.common.known import HEADER as KNOWN_HEADER +from c_analyzer.common.info import UNKNOWN, ID +from c_analyzer.variables import Variable +from c_analyzer.util import write_tsv + +from . 
import SOURCE_DIRS, REPO_ROOT +from .known import DATA_FILE as KNOWN_FILE +from .files import iter_cpython_files + + +POTS = ('char ', 'wchar_t ', 'int ', 'Py_ssize_t ') +POTS += tuple('const ' + v for v in POTS) +STRUCTS = ('PyTypeObject', 'PyObject', 'PyMethodDef', 'PyModuleDef', 'grammar') + + +def _parse_global(line, funcname=None): + line = line.strip() + if line.startswith('static '): + if '(' in line and '[' not in line and ' = ' not in line: + return None, None + name, decl = parse_variable_declaration(line) + elif line.startswith(('Py_LOCAL(', 'Py_LOCAL_INLINE(')): + name, decl = parse_variable_declaration(line) + elif line.startswith('_Py_static_string('): + decl = line.strip(';').strip() + name = line.split('(')[1].split(',')[0].strip() + elif line.startswith('_Py_IDENTIFIER('): + decl = line.strip(';').strip() + name = 'PyId_' + line.split('(')[1].split(')')[0].strip() + elif funcname: + return None, None + + # global-only + elif line.startswith('PyAPI_DATA('): # only in .h files + name, decl = parse_variable_declaration(line) + elif line.startswith('extern '): # only in .h files + name, decl = parse_variable_declaration(line) + elif line.startswith('PyDoc_VAR('): + decl = line.strip(';').strip() + name = line.split('(')[1].split(')')[0].strip() + elif line.startswith(POTS): # implied static + if '(' in line and '[' not in line and ' = ' not in line: + return None, None + name, decl = parse_variable_declaration(line) + elif line.startswith(STRUCTS) and line.endswith(' = {'): # implied static + name, decl = parse_variable_declaration(line) + elif line.startswith(STRUCTS) and line.endswith(' = NULL;'): # implied static + name, decl = parse_variable_declaration(line) + elif line.startswith('struct '): + if not line.endswith(' = {'): + return None, None + if not line.partition(' ')[2].startswith(STRUCTS): + return None, None + # implied static + name, decl = parse_variable_declaration(line) + + # file-specific + elif line.startswith(('SLOT1BINFULL(', 
'SLOT1BIN(')): + # Objects/typeobject.c + funcname = line.split('(')[1].split(',')[0] + return [ + ('op_id', funcname, '_Py_static_string(op_id, OPSTR)'), + ('rop_id', funcname, '_Py_static_string(op_id, OPSTR)'), + ] + elif line.startswith('WRAP_METHOD('): + # Objects/weakrefobject.c + funcname, name = (v.strip() for v in line.split('(')[1].split(')')[0].split(',')) + return [ + ('PyId_' + name, funcname, f'_Py_IDENTIFIER({name})'), + ] + + else: + return None, None + return name, decl + + +def _pop_cached(varcache, filename, funcname, name, *, + _iter_variables=iter_variables, + ): + # Look for the file. + try: + cached = varcache[filename] + except KeyError: + cached = varcache[filename] = {} + for variable in _iter_variables(filename, + parse_variable=_parse_global, + ): + variable._isglobal = True + cached[variable.id] = variable + for var in cached: + print(' ', var) + + # Look for the variable. + if funcname == UNKNOWN: + for varid in cached: + if varid.name == name: + break + else: + return None + return cached.pop(varid) + else: + return cached.pop((filename, funcname, name), None) + + +def find_matching_variable(varid, varcache, allfilenames, *, + _pop_cached=_pop_cached, + ): + if varid.filename and varid.filename != UNKNOWN: + filenames = [varid.filename] + else: + filenames = allfilenames + for filename in filenames: + variable = _pop_cached(varcache, filename, varid.funcname, varid.name) + if variable is not None: + return variable + else: + if varid.filename and varid.filename != UNKNOWN and varid.funcname is None: + for filename in allfilenames: + if not filename.endswith('.h'): + continue + variable = _pop_cached(varcache, filename, None, varid.name) + if variable is not None: + return variable + return None + + +MULTILINE = { + # Python/Python-ast.c + 'Load_singleton': 'PyObject *', + 'Store_singleton': 'PyObject *', + 'Del_singleton': 'PyObject *', + 'AugLoad_singleton': 'PyObject *', + 'AugStore_singleton': 'PyObject *', + 'Param_singleton': 
'PyObject *', + 'And_singleton': 'PyObject *', + 'Or_singleton': 'PyObject *', + 'Add_singleton': 'static PyObject *', + 'Sub_singleton': 'static PyObject *', + 'Mult_singleton': 'static PyObject *', + 'MatMult_singleton': 'static PyObject *', + 'Div_singleton': 'static PyObject *', + 'Mod_singleton': 'static PyObject *', + 'Pow_singleton': 'static PyObject *', + 'LShift_singleton': 'static PyObject *', + 'RShift_singleton': 'static PyObject *', + 'BitOr_singleton': 'static PyObject *', + 'BitXor_singleton': 'static PyObject *', + 'BitAnd_singleton': 'static PyObject *', + 'FloorDiv_singleton': 'static PyObject *', + 'Invert_singleton': 'static PyObject *', + 'Not_singleton': 'static PyObject *', + 'UAdd_singleton': 'static PyObject *', + 'USub_singleton': 'static PyObject *', + 'Eq_singleton': 'static PyObject *', + 'NotEq_singleton': 'static PyObject *', + 'Lt_singleton': 'static PyObject *', + 'LtE_singleton': 'static PyObject *', + 'Gt_singleton': 'static PyObject *', + 'GtE_singleton': 'static PyObject *', + 'Is_singleton': 'static PyObject *', + 'IsNot_singleton': 'static PyObject *', + 'In_singleton': 'static PyObject *', + 'NotIn_singleton': 'static PyObject *', + # Python/symtable.c + 'top': 'static identifier ', + 'lambda': 'static identifier ', + 'genexpr': 'static identifier ', + 'listcomp': 'static identifier ', + 'setcomp': 'static identifier ', + 'dictcomp': 'static identifier ', + '__class__': 'static identifier ', + # Python/compile.c + '__doc__': 'static PyObject *', + '__annotations__': 'static PyObject *', + # Objects/floatobject.c + 'double_format': 'static float_format_type ', + 'float_format': 'static float_format_type ', + 'detected_double_format': 'static float_format_type ', + 'detected_float_format': 'static float_format_type ', + # Parser/listnode.c + 'level': 'static int ', + 'atbol': 'static int ', + # Python/dtoa.c + 'private_mem': 'static double private_mem[PRIVATE_mem]', + 'pmem_next': 'static double *', + # Modules/_weakref.c + 
'weakref_functions': 'static PyMethodDef ', +} +INLINE = { + # Modules/_tracemalloc.c + 'allocators': 'static struct { PyMemAllocatorEx mem; PyMemAllocatorEx raw; PyMemAllocatorEx obj; } ', + # Modules/faulthandler.c + 'fatal_error': 'static struct { int enabled; PyObject *file; int fd; int all_threads; PyInterpreterState *interp; void *exc_handler; } ', + 'thread': 'static struct { PyObject *file; int fd; PY_TIMEOUT_T timeout_us; int repeat; PyInterpreterState *interp; int exit; char *header; size_t header_len; PyThread_type_lock cancel_event; PyThread_type_lock running; } ', + # Modules/signalmodule.c + 'Handlers': 'static volatile struct { _Py_atomic_int tripped; PyObject *func; } Handlers[NSIG]', + 'wakeup': 'static volatile struct { SOCKET_T fd; int warn_on_full_buffer; int use_send; } ', + # Python/dynload_shlib.c + 'handles': 'static struct { dev_t dev; ino_t ino; void *handle; } handles[128]', + # Objects/obmalloc.c + '_PyMem_Debug': 'static struct { debug_alloc_api_t raw; debug_alloc_api_t mem; debug_alloc_api_t obj; } ', + # Python/bootstrap_hash.c + 'urandom_cache': 'static struct { int fd; dev_t st_dev; ino_t st_ino; } ', + } +FUNC = { + # Objects/object.c + '_Py_abstract_hack': 'Py_ssize_t (*_Py_abstract_hack)(PyObject *)', + # Parser/myreadline.c + 'PyOS_InputHook': 'int (*PyOS_InputHook)(void)', + # Python/pylifecycle.c + '_PyOS_mystrnicmp_hack': 'int (*_PyOS_mystrnicmp_hack)(const char *, const char *, Py_ssize_t)', + # Parser/myreadline.c + 'PyOS_ReadlineFunctionPointer': 'char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *)', + } +IMPLIED = { + # Objects/boolobject.c + '_Py_FalseStruct': 'static struct _longobject ', + '_Py_TrueStruct': 'static struct _longobject ', + # Modules/config.c + '_PyImport_Inittab': 'struct _inittab _PyImport_Inittab[]', + } +GLOBALS = {} +GLOBALS.update(MULTILINE) +GLOBALS.update(INLINE) +GLOBALS.update(FUNC) +GLOBALS.update(IMPLIED) + +LOCALS = { + 'buildinfo': ('Modules/getbuildinfo.c', + 
'Py_GetBuildInfo', + 'static char buildinfo[50 + sizeof(GITVERSION) + ((sizeof(GITTAG) > sizeof(GITBRANCH)) ? sizeof(GITTAG) : sizeof(GITBRANCH))]'), + 'methods': ('Python/codecs.c', + '_PyCodecRegistry_Init', + 'static struct { char *name; PyMethodDef def; } methods[]'), + } + + +def _known(symbol): + if symbol.funcname: + if symbol.funcname != UNKNOWN or symbol.filename != UNKNOWN: + raise KeyError(symbol.name) + filename, funcname, decl = LOCALS[symbol.name] + varid = ID(filename, funcname, symbol.name) + elif not symbol.filename or symbol.filename == UNKNOWN: + raise KeyError(symbol.name) + else: + varid = symbol.id + try: + decl = GLOBALS[symbol.name] + except KeyError: + + if symbol.name.endswith('_methods'): + decl = 'static PyMethodDef ' + elif symbol.filename == 'Objects/exceptions.c' and symbol.name.startswith(('PyExc_', '_PyExc_')): + decl = 'static PyTypeObject ' + else: + raise + if symbol.name not in decl: + decl = decl + symbol.name + return Variable(varid, 'static', decl) + + +def known_row(varid, decl): + return ( + varid.filename, + varid.funcname or '-', + varid.name, + 'variable', + decl, + ) + + +def known_rows(symbols, *, + cached=True, + _get_filenames=iter_cpython_files, + _find_match=find_matching_variable, + _find_symbols=find_variables, + _as_known=known_row, + ): + filenames = list(_get_filenames()) + cache = {} + if cached: + for symbol in symbols: + try: + found = _known(symbol) + except KeyError: + found = _find_match(symbol, cache, filenames) + if found is None: + found = Variable(symbol.id, UNKNOWN, UNKNOWN) + yield _as_known(found.id, found.vartype) + else: + raise NotImplementedError # XXX incorporate KNOWN + for variable in _find_symbols(symbols, filenames, + srccache=cache, + parse_variable=_parse_global, + ): + #variable = variable._replace( + # filename=os.path.relpath(variable.filename, REPO_ROOT)) + if variable.funcname == UNKNOWN: + print(variable) + if variable.vartype== UNKNOWN: + print(variable) + yield 
_as_known(variable.id, variable.vartype) + + +def generate(symbols, filename=None, *, + _generate_rows=known_rows, + _write_tsv=write_tsv, + ): + if not filename: + filename = KNOWN_FILE + '.new' + + rows = _generate_rows(symbols) + _write_tsv(filename, KNOWN_HEADER, rows) + + +if __name__ == '__main__': + from c_symbols import binary + symbols = binary.iter_symbols( + binary.PYTHON, + find_local_symbol=None, + ) + generate(symbols) diff --git a/Tools/c-analyzer/cpython/files.py b/Tools/c-analyzer/cpython/files.py new file mode 100644 index 0000000..543097a --- /dev/null +++ b/Tools/c-analyzer/cpython/files.py @@ -0,0 +1,29 @@ +from c_analyzer.common.files import ( + C_SOURCE_SUFFIXES, walk_tree, iter_files_by_suffix, + ) + +from . import SOURCE_DIRS, REPO_ROOT + +# XXX need tests: +# * iter_files() + + +def iter_files(*, + walk=walk_tree, + _files=iter_files_by_suffix, + ): + """Yield each file in the tree for each of the given directory names.""" + excludedtrees = [ + os.path.join('Include', 'cpython', ''), + ] + def is_excluded(filename): + for root in excludedtrees: + if filename.startswith(root): + return True + return False + for filename in _files(SOURCE_DIRS, C_SOURCE_SUFFIXES, REPO_ROOT, + walk=walk, + ): + if is_excluded(filename): + continue + yield filename diff --git a/Tools/c-analyzer/cpython/find.py b/Tools/c-analyzer/cpython/find.py new file mode 100644 index 0000000..a7bc0b4 --- /dev/null +++ b/Tools/c-analyzer/cpython/find.py @@ -0,0 +1,101 @@ +import os.path + +from c_analyzer.common import files +from c_analyzer.common.info import UNKNOWN, ID +from c_analyzer.variables import find as _common + +from . 
import SOURCE_DIRS, PYTHON, REPO_ROOT +from .known import ( + from_file as known_from_file, + DATA_FILE as KNOWN_FILE, + ) +from .supported import ( + ignored_from_file, IGNORED_FILE, is_supported, _is_object, + ) + +# XXX need tests: +# * vars_from_binary() +# * vars_from_source() +# * supported_vars() + + +def _handle_id(filename, funcname, name, *, + _relpath=os.path.relpath, + ): + filename = _relpath(filename, REPO_ROOT) + return ID(filename, funcname, name) + + +def vars_from_binary(*, + known=KNOWN_FILE, + _known_from_file=known_from_file, + _iter_files=files.iter_files_by_suffix, + _iter_vars=_common.vars_from_binary, + ): + """Yield a Variable for each found Symbol. + + Details are filled in from the given "known" variables and types. + """ + if isinstance(known, str): + known = _known_from_file(known) + dirnames = SOURCE_DIRS + suffixes = ('.c',) + filenames = _iter_files(dirnames, suffixes) + # XXX For now we only use known variables (no source lookup). + filenames = None + yield from _iter_vars(PYTHON, + known=known, + filenames=filenames, + handle_id=_handle_id, + check_filename=(lambda n: True), + ) + + +def vars_from_source(*, + preprocessed=None, + known=KNOWN_FILE, + _known_from_file=known_from_file, + _iter_files=files.iter_files_by_suffix, + _iter_vars=_common.vars_from_source, + ): + """Yield a Variable for each declaration in the raw source code. + + Details are filled in from the given "known" variables and types. 
+ """ + if isinstance(known, str): + known = _known_from_file(known) + dirnames = SOURCE_DIRS + suffixes = ('.c',) + filenames = _iter_files(dirnames, suffixes) + yield from _iter_vars(filenames, + preprocessed=preprocessed, + known=known, + handle_id=_handle_id, + ) + + +def supported_vars(*, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + skip_objects=False, + _known_from_file=known_from_file, + _ignored_from_file=ignored_from_file, + _iter_vars=vars_from_binary, + _is_supported=is_supported, + ): + """Yield (var, is supported) for each found variable.""" + if isinstance(known, str): + known = _known_from_file(known) + if isinstance(ignored, str): + ignored = _ignored_from_file(ignored) + + for var in _iter_vars(known=known): + if not var.isglobal: + continue + elif var.vartype == UNKNOWN: + yield var, None + # XXX Support proper filters instead. + elif skip_objects and _is_object(found.vartype): + continue + else: + yield var, _is_supported(var, ignored, known) diff --git a/Tools/c-analyzer/cpython/known.py b/Tools/c-analyzer/cpython/known.py new file mode 100644 index 0000000..c3cc2c0 --- /dev/null +++ b/Tools/c-analyzer/cpython/known.py @@ -0,0 +1,66 @@ +import csv +import os.path + +from c_analyzer.parser.declarations import extract_storage +from c_analyzer.variables import known as _common +from c_analyzer.variables.info import Variable + +from . 
import DATA_DIR + + +# XXX need tests: +# * from_file() +# * look_up_variable() + + +DATA_FILE = os.path.join(DATA_DIR, 'known.tsv') + + +def _get_storage(decl, infunc): + # statics + if decl.startswith(('Py_LOCAL(', 'Py_LOCAL_INLINE(')): + return 'static' + if decl.startswith(('_Py_IDENTIFIER(', '_Py_static_string(')): + return 'static' + if decl.startswith('PyDoc_VAR('): + return 'static' + if decl.startswith(('SLOT1BINFULL(', 'SLOT1BIN(')): + return 'static' + if decl.startswith('WRAP_METHOD('): + return 'static' + # public extern + if decl.startswith('PyAPI_DATA('): + return 'extern' + # Fall back to the normal handler. + return extract_storage(decl, infunc=infunc) + + +def _handle_var(varid, decl): +# if varid.name == 'id' and decl == UNKNOWN: +# # None of these are variables. +# decl = 'int id'; + storage = _get_storage(decl, varid.funcname) + return Variable(varid, storage, decl) + + +def from_file(infile=DATA_FILE, *, + _from_file=_common.from_file, + _handle_var=_handle_var, + ): + """Return the info for known declarations in the given file.""" + return _from_file(infile, handle_var=_handle_var) + + +def look_up_variable(varid, knownvars, *, + _lookup=_common.look_up_variable, + ): + """Return the known variable matching the given ID. + + "knownvars" is a mapping of ID to Variable. + + "match_files" is used to verify if two filenames point to + the same file. + + If no match is found then None is returned. + """ + return _lookup(varid, knownvars) diff --git a/Tools/c-analyzer/cpython/supported.py b/Tools/c-analyzer/cpython/supported.py new file mode 100644 index 0000000..18786ee --- /dev/null +++ b/Tools/c-analyzer/cpython/supported.py @@ -0,0 +1,398 @@ +import os.path +import re + +from c_analyzer.common.info import ID +from c_analyzer.common.util import read_tsv, write_tsv + +from . 
from . import DATA_DIR

# XXX need tests:
# * generate / script


IGNORED_FILE = os.path.join(DATA_DIR, 'ignored.tsv')

IGNORED_COLUMNS = ('filename', 'funcname', 'name', 'kind', 'reason')
IGNORED_HEADER = '\t'.join(IGNORED_COLUMNS)

# Maps a variable name to the reason it is okay as a global.
# XXX Move these to ignored.tsv.
IGNORED = {
        # global
        'PyImport_FrozenModules': 'process-global',
        'M___hello__': 'process-global',
        'inittab_copy': 'process-global',
        'PyHash_Func': 'process-global',
        '_Py_HashSecret_Initialized': 'process-global',
        '_TARGET_LOCALES': 'process-global',

        # startup (only changed before/during)
        '_PyRuntime': 'runtime startup',
        'runtime_initialized': 'runtime startup',
        'static_arg_parsers': 'runtime startup',
        'orig_argv': 'runtime startup',
        'opt_ptr': 'runtime startup',
        '_preinit_warnoptions': 'runtime startup',
        '_Py_StandardStreamEncoding': 'runtime startup',
        'Py_FileSystemDefaultEncoding': 'runtime startup',
        '_Py_StandardStreamErrors': 'runtime startup',
        'Py_FileSystemDefaultEncodeErrors': 'runtime startup',
        'Py_BytesWarningFlag': 'runtime startup',
        'Py_DebugFlag': 'runtime startup',
        'Py_DontWriteBytecodeFlag': 'runtime startup',
        'Py_FrozenFlag': 'runtime startup',
        'Py_HashRandomizationFlag': 'runtime startup',
        'Py_IgnoreEnvironmentFlag': 'runtime startup',
        'Py_InspectFlag': 'runtime startup',
        'Py_InteractiveFlag': 'runtime startup',
        'Py_IsolatedFlag': 'runtime startup',
        'Py_NoSiteFlag': 'runtime startup',
        'Py_NoUserSiteDirectory': 'runtime startup',
        'Py_OptimizeFlag': 'runtime startup',
        'Py_QuietFlag': 'runtime startup',
        'Py_UTF8Mode': 'runtime startup',
        'Py_UnbufferedStdioFlag': 'runtime startup',
        'Py_VerboseFlag': 'runtime startup',
        '_Py_path_config': 'runtime startup',
        '_PyOS_optarg': 'runtime startup',
        '_PyOS_opterr': 'runtime startup',
        '_PyOS_optind': 'runtime startup',
        '_Py_HashSecret': 'runtime startup',

        # REPL
        '_PyOS_ReadlineLock': 'repl',
        '_PyOS_ReadlineTState': 'repl',

        # effectively const
        'tracemalloc_empty_traceback': 'const',
        '_empty_bitmap_node': 'const',
        'posix_constants_pathconf': 'const',
        'posix_constants_confstr': 'const',
        'posix_constants_sysconf': 'const',
        '_PySys_ImplCacheTag': 'const',
        '_PySys_ImplName': 'const',
        'PyImport_Inittab': 'const',
        '_PyImport_DynLoadFiletab': 'const',
        '_PyParser_Grammar': 'const',
        'Py_hexdigits': 'const',
        '_PyImport_Inittab': 'const',
        '_PyByteArray_empty_string': 'const',
        '_PyLong_DigitValue': 'const',
        '_Py_SwappedOp': 'const',
        'PyStructSequence_UnnamedField': 'const',

        # signals are main-thread only
        'faulthandler_handlers': 'signals are main-thread only',
        'user_signals': 'signals are main-thread only',
        'wakeup': 'signals are main-thread only',

        # hacks
        '_PySet_Dummy': 'only used as a placeholder',
        }

BENIGN = 'races here are benign and unlikely'


def is_supported(variable, ignored=None, known=None, *,
                 _ignored=(lambda *a, **k: _is_ignored(*a, **k)),
                 _vartype_okay=(lambda *a, **k: _is_vartype_okay(*a, **k)),
                 ):
    """Return True if the given global variable is okay in CPython.

    "ignored", if given, is a mapping from which the "variables" and
    "types" entries are read (e.g. the result of ignored_from_file()).
    "known" is accepted but not used here.

    The variable is okay if either "_ignored" (by default
    _is_ignored()) or "_vartype_okay" (by default _is_vartype_okay())
    gives a reason for it.
    """
    if _ignored(variable,
                ignored and ignored.get('variables')):
        return True
    # "ignored" defaults to None, so guard the lookup the same way as
    # above (and as _get_rows() does) instead of calling None.get().
    return bool(_vartype_okay(variable.vartype,
                              ignored and ignored.get('types')))


def _is_ignored(variable, ignoredvars=None, *,
                _IGNORED=IGNORED,
                ):
    """Return the reason if the variable is a supported global.

    Return None if the variable is not a supported global.

    The checks run in this order: an entry for variable.id in
    "ignoredvars", then an entry for variable.name in "_IGNORED" (only
    consulted when variable.funcname is None), then the hard-coded
    per-file special cases below.
    """
    if ignoredvars and (reason := ignoredvars.get(variable.id)):
        return reason

    if variable.funcname is None:
        if reason := _IGNORED.get(variable.name):
            return reason

    # compiler
    if variable.filename == 'Python/graminit.c':
        if variable.vartype.startswith('static state '):
            return 'compiler'
    if variable.filename == 'Python/symtable.c':
        if variable.vartype.startswith('static identifier '):
            return 'compiler'
    if variable.filename == 'Python/Python-ast.c':
        # These should be const.
        if variable.name.endswith(('_field', '_attribute')):
            return 'compiler'

    # other
    if variable.filename == 'Python/dtoa.c':
        # guarded by lock?
        if variable.name in ('p5s', 'freelist',
                             'private_mem', 'pmem_next'):
            return 'dtoa is thread-safe?'
    if variable.filename == 'Python/thread.c':
        # Threads do not become an issue until after these have been set
        # and these never get changed after that.
        if variable.name in ('initialized', 'thread_debug'):
            return 'thread-safe'
    if variable.filename == 'Python/getversion.c':
        if variable.name == 'version':
            # Races are benign here, as well as unlikely.
            return BENIGN
    if variable.filename == 'Python/fileutils.c':
        if variable.name in ('force_ascii', 'ioctl_works',
                             '_Py_open_cloexec_works'):
            return BENIGN
    if variable.filename == 'Python/codecs.c':
        if variable.name == 'ucnhash_CAPI':
            return BENIGN
    if variable.filename == 'Python/bootstrap_hash.c':
        if variable.name == 'getrandom_works':
            return BENIGN
    if variable.filename == 'Objects/unicodeobject.c':
        # bloom_linebreak is only *mostly* benign.
        if variable.name in ('ucnhash_CAPI', 'bloom_linebreak'):
            return BENIGN
    if variable.filename == 'Modules/getbuildinfo.c':
        if variable.name == 'buildinfo':
            # The static is used for pre-allocation.
            return BENIGN
    if variable.filename == 'Modules/posixmodule.c':
        if variable.name in ('ticks_per_second', 'dup3_works'):
            return BENIGN
    if variable.filename == 'Modules/timemodule.c':
        if variable.name == 'ticks_per_second':
            return BENIGN
    if variable.filename == 'Objects/longobject.c':
        if variable.name in ('log_base_BASE', 'convwidth_base',
                             'convmultmax_base'):
            return BENIGN

    return None


def _is_vartype_okay(vartype, ignoredtypes=None):
    """Return the reason if the type alone makes a variable okay.

    Return None otherwise.  Object types (according to _is_object())
    are never okay here.  "ignoredtypes" is currently unused.
    """
    if _is_object(vartype):
        return None

    if vartype.startswith(('static const ', 'const ')):
        return 'const'

    # components for TypeObject definitions
    for name in ('PyMethodDef', 'PyGetSetDef', 'PyMemberDef'):
        if name in vartype:
            return 'const'
    for name in ('PyNumberMethods', 'PySequenceMethods', 'PyMappingMethods',
                 'PyBufferProcs', 'PyAsyncMethods'):
        if name in vartype:
            return 'const'
    for name in ('slotdef', 'newfunc'):
        if name in vartype:
            return 'const'

    # structseq
    for name in ('PyStructSequence_Desc', 'PyStructSequence_Field'):
        if name in vartype:
            return 'const'

    # other definitions
    if 'PyModuleDef' in vartype:
        return 'const'

    # thread-safe
    if '_Py_atomic_int' in vartype:
        return 'thread-safe'
    if 'pthread_condattr_t' in vartype:
        return 'thread-safe'

    # startup
    if '_Py_PreInitEntry' in vartype:
        return 'startup'

    # global
#    if 'PyMemAllocatorEx' in vartype:
#        return True

    # others
#    if 'PyThread_type_lock' in vartype:
#        return True

    # XXX ???
    # _Py_tss_t
    # _Py_hashtable_t
    # stack_t
    # _PyUnicode_Name_CAPI

    # functions
    if '(' in vartype and '[' not in vartype:
        return 'function pointer'

    # XXX finish!
    # * allow const values?
    #raise NotImplementedError
    return None
# Matches a vartype that names an object type.  Compiled with re.VERBOSE,
# so the whitespace and "#" comments inside the pattern are not significant.
PYOBJECT_RE = re.compile(r'''
        ^
        (
            # must start with "static "
            static \s+
            (
                identifier
            )
            \b
        ) |
        (
            # may start with "static "
            ( static \s+ )?
            (
                .*
                (
                    PyObject |
                    PyTypeObject |
                    _? Py \w+ Object |
                    _PyArg_Parser |
                    _Py_Identifier |
                    traceback_t |
                    PyAsyncGenASend |
                    _PyAsyncGenWrappedValue |
                    PyContext |
                    method_cache_entry
                )
                \b
            ) |
            (
                (
                    _Py_IDENTIFIER |
                    _Py_static_string
                )
                [(]
            )
        )
        ''', re.VERBOSE)


def _is_object(vartype):
    """Return True if the given vartype names an object type.

    "PyDictKeysObject" is explicitly excluded.  Otherwise the type
    counts as an object if it matches PYOBJECT_RE or ends with one of
    the bool singleton struct names.
    """
    if 'PyDictKeysObject' in vartype:
        return False
    if PYOBJECT_RE.match(vartype):
        return True
    if vartype.endswith((' _Py_FalseStruct', ' _Py_TrueStruct')):
        return True

    # XXX Add more?

    #for part in vartype.split():
    #    # XXX const is automatic True?
    #    if part == 'PyObject' or part.startswith('PyObject['):
    #        return True
    return False


def ignored_from_file(infile, *,
                      _read_tsv=read_tsv,
                      ):
    """Return the ignored variables found in the given TSV file.

    The result is a dict with a single "variables" entry, which maps
    each row's ID(filename, funcname, name) to the row's reason.  An
    empty or "-" funcname is treated as None.

    ValueError is raised for a row whose kind is not "variable".
    """
    ignored = {
        'variables': {},
        #'types': {},
        #'constants': {},
        #'macros': {},
        }
    for row in _read_tsv(infile, IGNORED_HEADER):
        filename, funcname, name, kind, reason = row
        if not funcname or funcname == '-':
            funcname = None
        # Named "varid" rather than "id" so the builtin is not shadowed.
        varid = ID(filename, funcname, name)
        if kind == 'variable':
            values = ignored['variables']
        else:
            raise ValueError(f'unsupported kind in row {row}')
        values[varid] = reason
    return ignored


##################################
# generate

def _get_row(varid, reason):
    """Return the ignored.tsv row (a 5-tuple) for the given ID and reason.

    A missing funcname is written as "-" and the kind is always
    "variable".
    """
    return (
            varid.filename,
            varid.funcname or '-',
            varid.name,
            'variable',
            str(reason),
            )


def _get_rows(variables, ignored=None, *,
              _as_row=_get_row,
              _is_ignored=_is_ignored,
              _vartype_okay=_is_vartype_okay,
              ):
    """Yield a row for each of the variables that has a reason to be ignored.

    The reason comes from "_is_ignored" or, failing that, from
    "_vartype_okay".  Variables with no reason are skipped.  Each
    reported variable is also printed, followed by a total at the end.
    """
    count = 0
    for variable in variables:
        reason = _is_ignored(variable,
                             ignored and ignored.get('variables'),
                             )
        if not reason:
            reason = _vartype_okay(variable.vartype,
                                   ignored and ignored.get('types'))
        if not reason:
            continue

        print(' ', variable, repr(reason))
        yield _as_row(variable.id, reason)
        count += 1
    print(f'total: {count}')


def _generate_ignored_file(variables, filename=None, *,
                           _generate_rows=_get_rows,
                           _write_tsv=write_tsv,
                           ):
    """Write the rows generated for the variables to a TSV file.

    If no filename is given then IGNORED_FILE + ".new" is used.
    """
    if not filename:
        filename = IGNORED_FILE + '.new'
    rows = _generate_rows(variables)
    _write_tsv(filename, IGNORED_HEADER, rows)


if __name__ == '__main__':
    from cpython import SOURCE_DIRS
    from cpython.known import (
        from_file as known_from_file,
        DATA_FILE as KNOWN_FILE,
        )
    # XXX This is wrong!
    # NOTE(review): globals_from_binary() is not visible in the part of
    # find.py shown in this commit excerpt -- confirm it exists.
    from . import find
    known = known_from_file(KNOWN_FILE)
    knownvars = (known or {}).get('variables')
    variables = find.globals_from_binary(knownvars=knownvars,
                                         dirnames=SOURCE_DIRS)

    _generate_ignored_file(variables)