diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2019-10-19 02:00:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-19 02:00:04 (GMT) |
commit | e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1 (patch) | |
tree | 071224bbded262901b9742eb82c5d82d2f744fe1 /Tools | |
parent | ea55c51bd937f6019c35b39b87029644e469c059 (diff) | |
download | cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.zip cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.gz cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.bz2 |
bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It also is preparation for getting the variables from the source (cross-platform) rather than from the symbols.
The change only touches the tool (and its tests).
Diffstat (limited to 'Tools')
36 files changed, 1030 insertions, 706 deletions
diff --git a/Tools/c-analyzer/c-globals.py b/Tools/c-analyzer/c-globals.py index 9afe059..b36b791 100644 --- a/Tools/c-analyzer/c-globals.py +++ b/Tools/c-analyzer/c-globals.py @@ -1,6 +1,6 @@ # This is a script equivalent of running "python -m test.test_c_globals.cg". -from c_globals.__main__ import parse_args, main +from cpython.__main__ import parse_args, main # This is effectively copied from cg/__main__.py: diff --git a/Tools/c-analyzer/c_symbols/__init__.py b/Tools/c-analyzer/c_analyzer/__init__.py index e69de29..e69de29 100644 --- a/Tools/c-analyzer/c_symbols/__init__.py +++ b/Tools/c-analyzer/c_analyzer/__init__.py diff --git a/Tools/c-analyzer/c_parser/__init__.py b/Tools/c-analyzer/c_analyzer/common/__init__.py index e69de29..e69de29 100644 --- a/Tools/c-analyzer/c_parser/__init__.py +++ b/Tools/c-analyzer/c_analyzer/common/__init__.py diff --git a/Tools/c-analyzer/c_analyzer_common/files.py b/Tools/c-analyzer/c_analyzer/common/files.py index b3cd16c..ab551a8 100644 --- a/Tools/c-analyzer/c_analyzer_common/files.py +++ b/Tools/c-analyzer/c_analyzer/common/files.py @@ -2,7 +2,10 @@ import glob import os import os.path -from . import SOURCE_DIRS, REPO_ROOT +# XXX need tests: +# * walk_tree() +# * glob_tree() +# * iter_files_by_suffix() C_SOURCE_SUFFIXES = ('.c', '.h') @@ -115,24 +118,3 @@ def iter_files_by_suffix(root, suffixes, relparent=None, *, # XXX Ignore repeated suffixes? for suffix in suffixes: yield from _iter_files(root, suffix, relparent) - - -def iter_cpython_files(*, - walk=walk_tree, - _files=iter_files_by_suffix, - ): - """Yield each file in the tree for each of the given directory names.""" - excludedtrees = [ - os.path.join('Include', 'cpython', ''), - ] - def is_excluded(filename): - for root in excludedtrees: - if filename.startswith(root): - return True - return False - for filename in _files(SOURCE_DIRS, C_SOURCE_SUFFIXES, REPO_ROOT, - walk=walk, - ): - if is_excluded(filename): - continue - yield filename diff --git a/Tools/c-analyzer/c_analyzer/common/info.py b/Tools/c-analyzer/c_analyzer/common/info.py new file mode 100644 index 0000000..3f3f8c5 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/info.py @@ -0,0 +1,138 @@ +from collections import namedtuple +import re + +from .util import classonly, _NTBase + +# XXX need tests: +# * ID.match() + + +UNKNOWN = '???' + +NAME_RE = re.compile(r'^([a-zA-Z]|_\w*[a-zA-Z]\w*|[a-zA-Z]\w*)$') + + +class ID(_NTBase, namedtuple('ID', 'filename funcname name')): + """A unique ID for a single symbol or declaration.""" + + __slots__ = () + # XXX Add optional conditions (tuple of strings) field. + #conditions = Slot() + + @classonly + def from_raw(cls, raw): + if not raw: + return None + if isinstance(raw, str): + return cls(None, None, raw) + try: + name, = raw + filename = None + except ValueError: + try: + filename, name = raw + except ValueError: + return super().from_raw(raw) + return cls(filename, None, name) + + def __new__(cls, filename, funcname, name): + self = super().__new__( + cls, + filename=str(filename) if filename else None, + funcname=str(funcname) if funcname else None, + name=str(name) if name else None, + ) + #cls.conditions.set(self, tuple(str(s) if s else None + # for s in conditions or ())) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + if not self.name: + raise TypeError('missing name') + else: + if not NAME_RE.match(self.name): + raise ValueError( + f'name must be an identifier, got {self.name!r}') + + # Symbols from a binary might not have filename/funcname info. + + if self.funcname: + if not self.filename: + raise TypeError('missing filename') + if not NAME_RE.match(self.funcname) and self.funcname != UNKNOWN: + raise ValueError( + f'name must be an identifier, got {self.funcname!r}') + + # XXX Require the filename (at least UNKONWN)? + # XXX Check the filename? + + @property + def islocal(self): + return self.funcname is not None + + def match(self, other, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return True if the two match. + + At least one of the two must be completely valid (no UNKNOWN + anywhere). Otherwise False is returned. The remaining one + *may* have UNKNOWN for both funcname and filename. It must + have a valid name though. + + The caller is responsible for knowing which of the two is valid + (and which to use if both are valid). + """ + # First check the name. + if self.name is None: + return False + if other.name != self.name: + return False + + # Then check the filename. + if self.filename is None: + return False + if other.filename is None: + return False + if self.filename == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + elif self.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif other.filename == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + elif other.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif not match_files(self.filename, other.filename): + return False + + # Finally, check the funcname. + if self.funcname == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + else: + return other.funcname is not None + elif other.funcname == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + else: + return self.funcname is not None + elif self.funcname == other.funcname: + # Both are valid. + return True + + return False diff --git a/Tools/c-analyzer/c_analyzer/common/show.py b/Tools/c-analyzer/c_analyzer/common/show.py new file mode 100644 index 0000000..5f3cb1c --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/show.py @@ -0,0 +1,11 @@ + +def basic(variables, *, + _print=print): + """Print each row simply.""" + for var in variables: + if var.funcname: + line = f'{var.filename}:{var.funcname}():{var.name}' + else: + line = f'{var.filename}:{var.name}' + line = f'{line:<64} {var.vartype}' + _print(line) diff --git a/Tools/c-analyzer/c_analyzer_common/util.py b/Tools/c-analyzer/c_analyzer/common/util.py index 43d0bb6..43d0bb6 100644 --- a/Tools/c-analyzer/c_analyzer_common/util.py +++ b/Tools/c-analyzer/c_analyzer/common/util.py diff --git a/Tools/c-analyzer/c_globals/__init__.py b/Tools/c-analyzer/c_analyzer/parser/__init__.py index e69de29..e69de29 100644 --- a/Tools/c-analyzer/c_globals/__init__.py +++ b/Tools/c-analyzer/c_analyzer/parser/__init__.py diff --git a/Tools/c-analyzer/c_parser/declarations.py b/Tools/c-analyzer/c_analyzer/parser/declarations.py index 19fa3ff..f37072c 100644 --- a/Tools/c-analyzer/c_parser/declarations.py +++ b/Tools/c-analyzer/c_analyzer/parser/declarations.py @@ -2,6 +2,8 @@ import re import shlex import subprocess +from ..common.info import UNKNOWN + from . import source @@ -194,7 +196,28 @@ def parse_func(stmt, body): return name, signature -def parse_var(stmt): +#TYPE_SPEC = rf'''(?: +# )''' +#VAR_DECLARATOR = rf'''(?: +# )''' +#VAR_DECL = rf'''(?: +# {TYPE_SPEC}+ +# {VAR_DECLARATOR} +# \s* +# )''' +#VAR_DECLARATION = rf'''(?: +# {VAR_DECL} +# (?: = [^=] [^;]* )? +# ; +# )''' +# +# +#def parse_variable(decl, *, inFunc=False): +# """Return [(name, storage, vartype)] for the given variable declaration.""" +# ... + + +def _parse_var(stmt): """Return (name, vartype) for the given variable declaration.""" stmt = stmt.rstrip(';') m = LOCAL_STMT_START_RE.match(stmt) @@ -220,6 +243,27 @@ def parse_var(stmt): return name, vartype +def extract_storage(decl, *, infunc=None): + """Return (storage, vartype) based on the given declaration. + + The default storage is "implicit" (or "local" if infunc is True). + """ + if decl == UNKNOWN: + return decl + if decl.startswith('static '): + return 'static' + #return 'static', decl.partition(' ')[2].strip() + elif decl.startswith('extern '): + return 'extern' + #return 'extern', decl.partition(' ')[2].strip() + elif re.match('.*\b(static|extern)\b', decl): + raise NotImplementedError + elif infunc: + return 'local' + else: + return 'implicit' + + def parse_compound(stmt, blocks): """Return (headers, bodies) for the given compound statement.""" # XXX Identify declarations inside compound statements @@ -228,14 +272,17 @@ def parse_compound(stmt, blocks): def iter_variables(filename, *, + preprocessed=False, _iter_source_lines=source.iter_lines, _iter_global=iter_global_declarations, _iter_local=iter_local_statements, _parse_func=parse_func, - _parse_var=parse_var, + _parse_var=_parse_var, _parse_compound=parse_compound, ): """Yield (funcname, name, vartype) for every variable in the given file.""" + if preprocessed: + raise NotImplementedError lines = _iter_source_lines(filename) for stmt, body in _iter_global(lines): # At the file top-level we only have to worry about vars & funcs. @@ -256,7 +303,7 @@ def iter_variables(filename, *, def _iter_locals(lines, *, _iter_statements=iter_local_statements, - _parse_var=parse_var, + _parse_var=_parse_var, _parse_compound=parse_compound, ): compound = [lines] @@ -278,18 +325,15 @@ def _iter_locals(lines, *, compound.extend(bodies) -def iter_all(dirnames): +def iter_all(filename, *, + preprocessed=False, + ): """Yield a Declaration for each one found. If there are duplicates, due to preprocessor conditionals, then they are checked to make sure they are the same. """ - raise NotImplementedError - - -def iter_preprocessed(dirnames): - """Yield a Declaration for each one found. - - All source files are run through the preprocessor first. - """ - raise NotImplementedError + # XXX For the moment we cheat. + for funcname, name, decl in iter_variables(filename, + preprocessed=preprocessed): + yield 'variable', funcname, name, decl diff --git a/Tools/c-analyzer/c_analyzer/parser/find.py b/Tools/c-analyzer/c_analyzer/parser/find.py new file mode 100644 index 0000000..3860d3d --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/find.py @@ -0,0 +1,107 @@ +from ..common.info import UNKNOWN, ID + +from . import declarations + +# XXX need tests: +# * variables +# * variable +# * variable_from_id + + +def _iter_vars(filenames, preprocessed, *, + handle_id=None, + _iter_decls=declarations.iter_all, + ): + if handle_id is None: + handle_id = ID + + for filename in filenames or (): + for kind, funcname, name, decl in _iter_decls(filename, + preprocessed=preprocessed, + ): + if kind != 'variable': + continue + varid = handle_id(filename, funcname, name) + yield varid, decl + + +# XXX Add a "handle_var" arg like we did for get_resolver()? + +def variables(*filenames, + perfilecache=None, + preprocessed=False, + known=None, # for types + handle_id=None, + _iter_vars=_iter_vars, + ): + """Yield (varid, decl) for each variable found in the given files. + + If "preprocessed" is provided (and not False/None) then it is used + to decide which tool to use to parse the source code after it runs + through the C preprocessor. Otherwise the raw + """ + if len(filenames) == 1 and not (filenames[0], str): + filenames, = filenames + + if perfilecache is None: + yield from _iter_vars(filenames, preprocessed) + else: + # XXX Cache per-file variables (e.g. `{filename: [(varid, decl)]}`). + raise NotImplementedError + + +def variable(name, filenames, *, + local=False, + perfilecache=None, + preprocessed=False, + handle_id=None, + _iter_vars=variables, + ): + """Return (varid, decl) for the first found variable that matches. + + If "local" is True then the first matching local variable in the + file will always be returned. To avoid that, pass perfilecache and + pop each variable from the cache after using it. + """ + for varid, decl in _iter_vars(filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ): + if varid.name != name: + continue + if local: + if varid.funcname: + if varid.funcname == UNKNOWN: + raise NotImplementedError + return varid, decl + elif not varid.funcname: + return varid, decl + else: + return None, None # No matching variable was found. + + +def variable_from_id(id, filenames, *, + perfilecache=None, + preprocessed=False, + handle_id=None, + _get_var=variable, + ): + """Return (varid, decl) for the first found variable that matches.""" + local = False + if isinstance(id, str): + name = id + else: + if id.funcname == UNKNOWN: + local = True + elif id.funcname: + raise NotImplementedError + + name = id.name + if id.filename and id.filename != UNKNOWN: + filenames = [id.filename] + return _get_var(name, filenames, + local=local, + perfilecache=perfilecache, + preprocessed=preprocessed, + handle_id=handle_id, + ) diff --git a/Tools/c-analyzer/c_parser/naive.py b/Tools/c-analyzer/c_analyzer/parser/naive.py index 160f96c..4a4822d 100644 --- a/Tools/c-analyzer/c_parser/naive.py +++ b/Tools/c-analyzer/c_analyzer/parser/naive.py @@ -1,8 +1,7 @@ import re -from c_analyzer_common.info import UNKNOWN +from ..common.info import UNKNOWN, ID -from .info import Variable from .preprocessor import _iter_clean_lines @@ -55,7 +54,7 @@ def parse_variable_declaration(srcline): def parse_variable(srcline, funcname=None): - """Return a Variable for the variable declared on the line (or None).""" + """Return (varid, decl) for the variable declared on the line (or None).""" line = srcline.strip() # XXX Handle more than just static variables. @@ -74,7 +73,7 @@ def iter_variables(filename, *, _get_srclines=get_srclines, _default_parse_variable=parse_variable, ): - """Yield a Variable for each in the given source file.""" + """Yield (varid, decl) for each variable in the given source file.""" if parse_variable is None: parse_variable = _default_parse_variable @@ -99,13 +98,13 @@ def iter_variables(filename, *, info = parse_variable(line, funcname) if isinstance(info, list): for name, _funcname, decl in info: - yield Variable.from_parts(filename, _funcname, name, decl) + yield ID(filename, _funcname, name), decl continue name, decl = info if name is None: continue - yield Variable.from_parts(filename, funcname, name, decl) + yield ID(filename, funcname, name), decl def _match_varid(variable, name, funcname, ignored=None): @@ -134,12 +133,12 @@ def find_variable(filename, funcname, name, *, Return None if the variable is not found. """ - for variable in _iter_variables(filename, + for varid, decl in _iter_variables(filename, srccache=srccache, parse_variable=parse_variable, ): - if _match_varid(variable, name, funcname, ignored): - return variable + if _match_varid(varid, name, funcname, ignored): + return varid, decl else: return None @@ -149,10 +148,10 @@ def find_variables(varids, filenames=None, *, parse_variable=None, _find_symbol=find_variable, ): - """Yield a Variable for each ID. + """Yield (varid, decl) for each ID. If the variable is not found then its decl will be UNKNOWN. That - way there will be one resulting Variable per given ID. + way there will be one resulting variable per given ID. """ if srccache is _NOT_SET: srccache = {} @@ -163,18 +162,18 @@ def find_variables(varids, filenames=None, *, srcfiles = [varid.filename] else: if not filenames: - yield Variable(varid, UNKNOWN, UNKNOWN) + yield varid, UNKNOWN continue srcfiles = filenames for filename in srcfiles: - found = _find_varid(filename, varid.funcname, varid.name, - ignored=used, - srccache=srccache, - parse_variable=parse_variable, - ) - if found: - yield found - used.add(found) + varid, decl = _find_varid(filename, varid.funcname, varid.name, + ignored=used, + srccache=srccache, + parse_variable=parse_variable, + ) + if varid: + yield varid, decl + used.add(varid) break else: - yield Variable(varid, UNKNOWN, UNKNOWN) + yield varid, UNKNOWN diff --git a/Tools/c-analyzer/c_parser/preprocessor.py b/Tools/c-analyzer/c_analyzer/parser/preprocessor.py index 0e2866e..41f306e 100644 --- a/Tools/c-analyzer/c_parser/preprocessor.py +++ b/Tools/c-analyzer/c_analyzer/parser/preprocessor.py @@ -3,8 +3,7 @@ import shlex import os import re -from c_analyzer_common import util -from . import info +from ..common import util, info CONTINUATION = '\\' + os.linesep diff --git a/Tools/c-analyzer/c_parser/source.py b/Tools/c-analyzer/c_analyzer/parser/source.py index f8998c8..f8998c8 100644 --- a/Tools/c-analyzer/c_parser/source.py +++ b/Tools/c-analyzer/c_analyzer/parser/source.py diff --git a/Tools/c-analyzer/c_analyzer/symbols/__init__.py b/Tools/c-analyzer/c_analyzer/symbols/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/__init__.py diff --git a/Tools/c-analyzer/c_symbols/binary.py b/Tools/c-analyzer/c_analyzer/symbols/_nm.py index e125dbd5..f3a75a6 100644 --- a/Tools/c-analyzer/c_symbols/binary.py +++ b/Tools/c-analyzer/c_analyzer/symbols/_nm.py @@ -1,46 +1,24 @@ -import os import os.path import shutil -import sys - -from c_analyzer_common import util, info -from . import source -from .info import Symbol - - -#PYTHON = os.path.join(REPO_ROOT, 'python') -PYTHON = sys.executable +from c_analyzer.common import util, info -def iter_symbols(binary=PYTHON, dirnames=None, *, - # Alternately, use look_up_known_symbol() - # from c_globals.supported. - find_local_symbol=source.find_symbol, - _file_exists=os.path.exists, - _iter_symbols_nm=(lambda b, *a: _iter_symbols_nm(b, *a)), - ): - """Yield a Symbol for each symbol found in the binary.""" - if not _file_exists(binary): - raise Exception('executable missing (need to build it first?)') - - if find_local_symbol: - cache = {} - def find_local_symbol(name, *, _find=find_local_symbol): - return _find(name, dirnames, _perfilecache=cache) - else: - find_local_symbol = None +from .info import Symbol - if os.name == 'nt': - # XXX Support this. - raise NotImplementedError - else: - yield from _iter_symbols_nm(binary, find_local_symbol) +# XXX need tests: +# * iter_symbols -############################# -# binary format (e.g. ELF) +NM_KINDS = { + 'b': Symbol.KIND.VARIABLE, # uninitialized + 'd': Symbol.KIND.VARIABLE, # initialized + #'g': Symbol.KIND.VARIABLE, # uninitialized + #'s': Symbol.KIND.VARIABLE, # initialized + 't': Symbol.KIND.FUNCTION, + } SPECIAL_SYMBOLS = { + # binary format (e.g. ELF) '__bss_start', '__data_start', '__dso_handle', @@ -63,29 +41,23 @@ def _is_special_symbol(name): return False -############################# -# "nm" - -NM_KINDS = { - 'b': Symbol.KIND.VARIABLE, # uninitialized - 'd': Symbol.KIND.VARIABLE, # initialized - #'g': Symbol.KIND.VARIABLE, # uninitialized - #'s': Symbol.KIND.VARIABLE, # initialized - 't': Symbol.KIND.FUNCTION, - } - +def iter_symbols(binfile, *, + nm=None, + handle_id=None, + _which=shutil.which, + _run=util.run_cmd, + ): + """Yield a Symbol for each relevant entry reported by the "nm" command.""" + if nm is None: + nm = _which('nm') + if not nm: + raise NotImplementedError + if handle_id is None: + handle_id = info.ID -def _iter_symbols_nm(binary, find_local_symbol=None, - *, - _which=shutil.which, - _run=util.run_cmd, - ): - nm = _which('nm') - if not nm: - raise NotImplementedError argv = [nm, '--line-numbers', - binary, + binfile, ] try: output = _run(argv) @@ -95,23 +67,20 @@ def _iter_symbols_nm(binary, find_local_symbol=None, raise NotImplementedError raise for line in output.splitlines(): - (name, kind, external, filename, funcname, vartype, - ) = _parse_nm_line(line, - _find_local_symbol=find_local_symbol, - ) + (name, kind, external, filename, funcname, + ) = _parse_nm_line(line) if kind != Symbol.KIND.VARIABLE: continue elif _is_special_symbol(name): continue - assert vartype is None yield Symbol( - id=(filename, funcname, name), + id=handle_id(filename, funcname, name), kind=kind, external=external, ) -def _parse_nm_line(line, *, _find_local_symbol=None): +def _parse_nm_line(line): _origline = line _, _, line = line.partition(' ') # strip off the address line = line.strip() @@ -128,18 +97,9 @@ def _parse_nm_line(line, *, _find_local_symbol=None): else: filename = info.UNKNOWN - vartype = None name, islocal = _parse_nm_name(name, kind) - if islocal: - funcname = info.UNKNOWN - if _find_local_symbol is not None: - filename, funcname, vartype = _find_local_symbol(name) - filename = filename or info.UNKNOWN - funcname = funcname or info.UNKNOWN - else: - funcname = None - # XXX fine filename and vartype? - return name, kind, external, filename, funcname, vartype + funcname = info.UNKNOWN if islocal else None + return name, kind, external, filename, funcname def _parse_nm_name(name, kind): diff --git a/Tools/c-analyzer/c_analyzer/symbols/find.py b/Tools/c-analyzer/c_analyzer/symbols/find.py new file mode 100644 index 0000000..8564652 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/find.py @@ -0,0 +1,175 @@ +import os +import os.path +import shutil + +from ..common import files +from ..common.info import UNKNOWN, ID +from ..parser import find as p_find + +from . import _nm +from .info import Symbol + +# XXX need tests: +# * get_resolver() +# * get_resolver_from_dirs() +# * symbol() +# * symbols() +# * variables() + + +def _resolve_known(symbol, knownvars): + for varid in knownvars: + if symbol.match(varid): + break + else: + return None + return knownvars.pop(varid) + + +def get_resolver(filenames=None, known=None, *, + handle_var, + check_filename=None, + perfilecache=None, + preprocessed=False, + _from_source=p_find.variable_from_id, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "handle_var" is a callable that takes (ID, decl) and returns a + Variable. Variable.from_id is a suitable callable. + + The returned func takes a single Symbol and returns a corresponding + Variable. If the symbol was located then the variable will be + valid, populated with the corresponding information. Otherwise None + is returned. + """ + knownvars = (known or {}).get('variables') + if knownvars: + knownvars = dict(knownvars) # a copy + if filenames: + if check_filename is None: + filenames = list(filenames) + def check_filename(filename): + return filename in filenames + def resolve(symbol): + # XXX Check "found" instead? + if not check_filename(symbol.filename): + return None + found = _resolve_known(symbol, knownvars) + if found is None: + #return None + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + found = handle_var(varid, decl) + return found + else: + def resolve(symbol): + return _resolve_known(symbol, knownvars) + elif filenames: + def resolve(symbol): + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + return handle_var(varid, decl) + else: + def resolve(symbol): + return None + return resolve + + +def get_resolver_from_dirs(dirnames, known=None, *, + handle_var, + suffixes=('.c',), + perfilecache=None, + preprocessed=False, + _iter_files=files.iter_files_by_suffix, + _get_resolver=get_resolver, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "dirnames" should be absolute paths. If not then they will be + resolved relative to CWD. + + See get_resolver(). + """ + dirnames = [d if d.endswith(os.path.sep) else d + os.path.sep + for d in dirnames] + filenames = _iter_files(dirnames, suffixes) + def check_filename(filename): + for dirname in dirnames: + if filename.startswith(dirname): + return True + else: + return False + return _get_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + + +def symbol(symbol, filenames, known=None, *, + perfilecache=None, + preprocessed=False, + handle_id=None, + _get_resolver=get_resolver, + ): + """Return a Variable for the one matching the given symbol. + + "symbol" can be one of several objects: + + * Symbol - use the contained info + * name (str) - look for a global variable with that name + * (filename, name) - look for named global in file + * (filename, funcname, name) - look for named local in file + + A name is always required. If the filename is None, "", or + "UNKNOWN" then all files will be searched. If the funcname is + "" or "UNKNOWN" then only local variables will be searched for. + """ + resolve = _get_resolver(known, filenames, + handle_id=handle_id, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + return resolve(symbol) + + +def _get_platform_tool(): + if os.name == 'nt': + # XXX Support this. + raise NotImplementedError + elif nm := shutil.which('nm'): + return lambda b, hi: _nm.iter_symbols(b, nm=nm, handle_id=hi) + else: + raise NotImplementedError + + +def symbols(binfile, *, + handle_id=None, + _file_exists=os.path.exists, + _get_platform_tool=_get_platform_tool, + ): + """Yield a Symbol for each one found in the binary.""" + if not _file_exists(binfile): + raise Exception('executable missing (need to build it first?)') + + _iter_symbols = _get_platform_tool() + yield from _iter_symbols(binfile, handle_id) + + +def variables(binfile, *, + resolve, + handle_id=None, + _iter_symbols=symbols, + ): + """Yield (Variable, Symbol) for each found symbol.""" + for symbol in _iter_symbols(binfile, handle_id=handle_id): + if symbol.kind != Symbol.KIND.VARIABLE: + continue + var = resolve(symbol) or None + yield var, symbol diff --git a/Tools/c-analyzer/c_symbols/info.py b/Tools/c-analyzer/c_analyzer/symbols/info.py index f6ed52c..96a251a 100644 --- a/Tools/c-analyzer/c_symbols/info.py +++ b/Tools/c-analyzer/c_analyzer/symbols/info.py @@ -1,7 +1,7 @@ from collections import namedtuple -from c_analyzer_common.info import ID -from c_analyzer_common.util import classonly, _NTBase +from c_analyzer.common.info import ID +from c_analyzer.common.util import classonly, _NTBase class Symbol(_NTBase, namedtuple('Symbol', 'id kind external')): diff --git a/Tools/c-analyzer/c_analyzer/variables/__init__.py b/Tools/c-analyzer/c_analyzer/variables/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/variables/find.py b/Tools/c-analyzer/c_analyzer/variables/find.py new file mode 100644 index 0000000..3fe7284 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/find.py @@ -0,0 +1,75 @@ +from ..common import files +from ..common.info import UNKNOWN +from ..parser import ( + find as p_find, + ) +from ..symbols import ( + info as s_info, + find as s_find, + ) +from .info import Variable + +# XXX need tests: +# * vars_from_source + + +def _remove_cached(cache, var): + if not cache: + return + try: + cached = cache[var.filename] + cached.remove(var) + except (KeyError, IndexError): + pass + + +def vars_from_binary(binfile, *, + known=None, + filenames=None, + handle_id=None, + check_filename=None, + handle_var=Variable.from_id, + _iter_vars=s_find.variables, + _get_symbol_resolver=s_find.get_resolver, + ): + """Yield a Variable for each found Symbol. + + Details are filled in from the given "known" variables and types. + """ + cache = {} + resolve = _get_symbol_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=cache, + ) + for var, symbol in _iter_vars(binfile, + resolve=resolve, + handle_id=handle_id, + ): + if var is None: + var = Variable(symbol.id, UNKNOWN, UNKNOWN) + yield var + _remove_cached(cache, var) + + +def vars_from_source(filenames, *, + preprocessed=None, + known=None, + handle_id=None, + handle_var=Variable.from_id, + iter_vars=p_find.variables, + ): + """Yield a Variable for each declaration in the raw source code. + + Details are filled in from the given "known" variables and types. + """ + cache = {} + for varid, decl in iter_vars(filenames or (), + perfilecache=cache, + preprocessed=preprocessed, + known=known, + handle_id=handle_id, + ): + var = handle_var(varid, decl) + yield var + _remove_cached(cache, var) diff --git a/Tools/c-analyzer/c_parser/info.py b/Tools/c-analyzer/c_analyzer/variables/info.py index a4e32d7..336a523 100644 --- a/Tools/c-analyzer/c_parser/info.py +++ b/Tools/c-analyzer/c_analyzer/variables/info.py @@ -1,8 +1,7 @@ from collections import namedtuple -import re -from c_analyzer_common import info, util -from c_analyzer_common.util import classonly, _NTBase +from ..common.info import ID, UNKNOWN +from ..common.util import classonly, _NTBase def normalize_vartype(vartype): @@ -16,26 +15,7 @@ def normalize_vartype(vartype): return str(vartype) -def extract_storage(decl, *, isfunc=False): - """Return (storage, vartype) based on the given declaration. - - The default storage is "implicit" or "local". - """ - if decl == info.UNKNOWN: - return decl, decl - if decl.startswith('static '): - return 'static', decl - #return 'static', decl.partition(' ')[2].strip() - elif decl.startswith('extern '): - return 'extern', decl - #return 'extern', decl.partition(' ')[2].strip() - elif re.match('.*\b(static|extern)\b', decl): - raise NotImplementedError - elif isfunc: - return 'local', decl - else: - return 'implicit', decl - +# XXX Variable.vartype -> decl (Declaration). class Variable(_NTBase, namedtuple('Variable', 'id storage vartype')): @@ -52,16 +32,23 @@ class Variable(_NTBase, @classonly def from_parts(cls, filename, funcname, name, decl, storage=None): + varid = ID(filename, funcname, name) if storage is None: - storage, decl = extract_storage(decl, isfunc=funcname) - id = info.ID(filename, funcname, name) - self = cls(id, storage, decl) + self = cls.from_id(varid, decl) + else: + self = cls(varid, storage, decl) return self + @classonly + def from_id(cls, varid, decl): + from ..parser.declarations import extract_storage + storage = extract_storage(decl, infunc=varid.funcname) + return cls(varid, storage, decl) + def __new__(cls, id, storage, vartype): self = super().__new__( cls, - id=info.ID.from_raw(id), + id=ID.from_raw(id), storage=str(storage) if storage else None, vartype=normalize_vartype(vartype) if vartype else None, ) @@ -77,10 +64,10 @@ class Variable(_NTBase, if not self.id: raise TypeError('missing id') - if not self.filename or self.filename == info.UNKNOWN: + if not self.filename or self.filename == UNKNOWN: raise TypeError(f'id missing filename ({self.id})') - if self.funcname and self.funcname == info.UNKNOWN: + if self.funcname and self.funcname == UNKNOWN: raise TypeError(f'id missing funcname ({self.id})') self.id.validate() @@ -89,12 +76,12 @@ class Variable(_NTBase, """Fail if the object is invalid (i.e. init with bad data).""" self._validate_id() - if self.storage is None or self.storage == info.UNKNOWN: + if self.storage is None or self.storage == UNKNOWN: raise TypeError('missing storage') elif self.storage not in self.STORAGE: raise ValueError(f'unsupported storage {self.storage:r}') - if self.vartype is None or self.vartype == info.UNKNOWN: + if self.vartype is None or self.vartype == UNKNOWN: raise TypeError('missing vartype') @property diff --git a/Tools/c-analyzer/c_analyzer/variables/known.py b/Tools/c-analyzer/c_analyzer/variables/known.py new file mode 100644 index 0000000..aa2934a --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/known.py @@ -0,0 +1,91 @@ +import csv + +from ..common.info import ID, UNKNOWN +from ..common.util import read_tsv +from .info import Variable + + +# XXX need tests: +# * read_file() +# * look_up_variable() + + +COLUMNS = ('filename', 'funcname', 'name', 'kind', 'declaration') +HEADER = '\t'.join(COLUMNS) + + +def read_file(infile, *, + _read_tsv=read_tsv, + ): + """Yield (kind, id, decl) for each row in the data file. + + The caller is responsible for validating each row. + """ + for row in _read_tsv(infile, HEADER): + filename, funcname, name, kind, declaration = row + if not funcname or funcname == '-': + funcname = None + id = ID(filename, funcname, name) + yield kind, id, declaration + + +def from_file(infile, *, + handle_var=Variable.from_id, + _read_file=read_file, + ): + """Return the info for known declarations in the given file.""" + known = { + 'variables': {}, + #'types': {}, + #'constants': {}, + #'macros': {}, + } + for kind, id, decl in _read_file(infile): + if kind == 'variable': + values = known['variables'] + value = handle_var(id, decl) + else: + raise ValueError(f'unsupported kind in row {row}') + value.validate() + values[id] = value + return known + + +def look_up_variable(varid, knownvars, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return the known Variable matching the given ID. + + "knownvars" is a mapping of ID to Variable. + + "match_files" is used to verify if two filenames point to + the same file. + + If no match is found then None is returned. + """ + if not knownvars: + return None + + if varid.funcname == UNKNOWN: + if not varid.filename or varid.filename == UNKNOWN: + for varid in knownvars: + if not varid.funcname: + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + else: + for varid in knownvars: + if not varid.funcname: + continue + if not match_files(varid.filename, varid.filename): + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + elif not varid.filename or varid.filename == UNKNOWN: + raise NotImplementedError + else: + return knownvars.get(varid.id) diff --git a/Tools/c-analyzer/c_analyzer_common/__init__.py b/Tools/c-analyzer/c_analyzer_common/__init__.py deleted file mode 100644 index 888b16f..0000000 --- a/Tools/c-analyzer/c_analyzer_common/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -import os.path - - -PKG_ROOT = os.path.dirname(__file__) -DATA_DIR = os.path.dirname(PKG_ROOT) -REPO_ROOT = os.path.dirname( - os.path.dirname(DATA_DIR)) - -SOURCE_DIRS = [os.path.join(REPO_ROOT, name) for name in [ - 'Include', - 'Python', - 'Parser', - 'Objects', - 'Modules', - ]] - - -# Clean up the namespace. -del os diff --git a/Tools/c-analyzer/c_analyzer_common/info.py b/Tools/c-analyzer/c_analyzer_common/info.py deleted file mode 100644 index e217380..0000000 --- a/Tools/c-analyzer/c_analyzer_common/info.py +++ /dev/null @@ -1,69 +0,0 @@ -from collections import namedtuple -import re - -from .util import classonly, _NTBase - - -UNKNOWN = '???' - -NAME_RE = re.compile(r'^([a-zA-Z]|_\w*[a-zA-Z]\w*|[a-zA-Z]\w*)$') - - -class ID(_NTBase, namedtuple('ID', 'filename funcname name')): - """A unique ID for a single symbol or declaration.""" - - __slots__ = () - # XXX Add optional conditions (tuple of strings) field. - #conditions = Slot() - - @classonly - def from_raw(cls, raw): - if not raw: - return None - if isinstance(raw, str): - return cls(None, None, raw) - try: - name, = raw - filename = None - except ValueError: - try: - filename, name = raw - except ValueError: - return super().from_raw(raw) - return cls(filename, None, name) - - def __new__(cls, filename, funcname, name): - self = super().__new__( - cls, - filename=str(filename) if filename else None, - funcname=str(funcname) if funcname else None, - name=str(name) if name else None, - ) - #cls.conditions.set(self, tuple(str(s) if s else None - # for s in conditions or ())) - return self - - def validate(self): - """Fail if the object is invalid (i.e. init with bad data).""" - if not self.name: - raise TypeError('missing name') - else: - if not NAME_RE.match(self.name): - raise ValueError( - f'name must be an identifier, got {self.name!r}') - - # Symbols from a binary might not have filename/funcname info. - - if self.funcname: - if not self.filename: - raise TypeError('missing filename') - if not NAME_RE.match(self.funcname) and self.funcname != UNKNOWN: - raise ValueError( - f'name must be an identifier, got {self.funcname!r}') - - # XXX Require the filename (at least UNKONWN)? - # XXX Check the filename? - - @property - def islocal(self): - return self.funcname is not None diff --git a/Tools/c-analyzer/c_analyzer_common/known.py b/Tools/c-analyzer/c_analyzer_common/known.py deleted file mode 100644 index dec1e1d..0000000 --- a/Tools/c-analyzer/c_analyzer_common/known.py +++ /dev/null @@ -1,74 +0,0 @@ -import csv -import os.path - -from c_parser.info import Variable - -from . import DATA_DIR -from .info import ID, UNKNOWN -from .util import read_tsv - - -DATA_FILE = os.path.join(DATA_DIR, 'known.tsv') - -COLUMNS = ('filename', 'funcname', 'name', 'kind', 'declaration') -HEADER = '\t'.join(COLUMNS) - - -# XXX need tests: -# * from_file() - -def from_file(infile, *, - _read_tsv=read_tsv, - ): - """Return the info for known declarations in the given file.""" - known = { - 'variables': {}, - #'types': {}, - #'constants': {}, - #'macros': {}, - } - for row in _read_tsv(infile, HEADER): - filename, funcname, name, kind, declaration = row - if not funcname or funcname == '-': - funcname = None - id = ID(filename, funcname, name) - if kind == 'variable': - values = known['variables'] - if funcname: - storage = _get_storage(declaration) or 'local' - else: - storage = _get_storage(declaration) or 'implicit' - value = Variable(id, storage, declaration) - else: - raise ValueError(f'unsupported kind in row {row}') - value.validate() -# if value.name == 'id' and declaration == UNKNOWN: -# # None of these are variables. -# declaration = 'int id'; -# else: -# value.validate() - values[id] = value - return known - - -def _get_storage(decl): - # statics - if decl.startswith('static '): - return 'static' - if decl.startswith(('Py_LOCAL(', 'Py_LOCAL_INLINE(')): - return 'static' - if decl.startswith(('_Py_IDENTIFIER(', '_Py_static_string(')): - return 'static' - if decl.startswith('PyDoc_VAR('): - return 'static' - if decl.startswith(('SLOT1BINFULL(', 'SLOT1BIN(')): - return 'static' - if decl.startswith('WRAP_METHOD('): - return 'static' - # public extern - if decl.startswith('extern '): - return 'extern' - if decl.startswith('PyAPI_DATA('): - return 'extern' - # implicit or local - return None diff --git a/Tools/c-analyzer/c_globals/find.py b/Tools/c-analyzer/c_globals/find.py deleted file mode 100644 index a51b947..0000000 --- a/Tools/c-analyzer/c_globals/find.py +++ /dev/null @@ -1,95 +0,0 @@ -from c_analyzer_common import SOURCE_DIRS -from c_analyzer_common.info import UNKNOWN -from c_symbols import ( - info as s_info, - binary as b_symbols, - source as s_symbols, - resolve, - ) -from c_parser import info, declarations - - -# XXX needs tests: -# * iter_variables - -def globals_from_binary(binfile=b_symbols.PYTHON, *, - knownvars=None, - dirnames=None, - _iter_symbols=b_symbols.iter_symbols, - _resolve=resolve.symbols_to_variables, - _get_symbol_resolver=resolve.get_resolver, - ): - """Yield a Variable for each found Symbol. - - Details are filled in from the given "known" variables and types. - """ - symbols = _iter_symbols(binfile, find_local_symbol=None) - #symbols = list(symbols) - for variable in _resolve(symbols, - resolve=_get_symbol_resolver(knownvars, dirnames), - ): - # Skip each non-global variable (unless we couldn't find it). - # XXX Drop the "UNKNOWN" condition? - if not variable.isglobal and variable.vartype != UNKNOWN: - continue - yield variable - - -def globals_from_declarations(dirnames=SOURCE_DIRS, *, - known=None, - ): - """Yield a Variable for each found declaration. - - Details are filled in from the given "known" variables and types. - """ - raise NotImplementedError - - -def iter_variables(kind='platform', *, - known=None, - dirnames=None, - _resolve_symbols=resolve.symbols_to_variables, - _get_symbol_resolver=resolve.get_resolver, - _symbols_from_binary=b_symbols.iter_symbols, - _symbols_from_source=s_symbols.iter_symbols, - _iter_raw=declarations.iter_all, - _iter_preprocessed=declarations.iter_preprocessed, - ): - """Yield a Variable for each one found (e.g. in files).""" - kind = kind or 'platform' - - if kind == 'symbols': - knownvars = (known or {}).get('variables') - yield from _resolve_symbols( - _symbols_from_source(dirnames, known), - resolve=_get_symbol_resolver(knownvars, dirnames), - ) - elif kind == 'platform': - knownvars = (known or {}).get('variables') - yield from _resolve_symbols( - _symbols_from_binary(find_local_symbol=None), - resolve=_get_symbol_resolver(knownvars, dirnames), - ) - elif kind == 'declarations': - for decl in _iter_raw(dirnames): - if not isinstance(decl, info.Variable): - continue - yield decl - elif kind == 'preprocessed': - for decl in _iter_preprocessed(dirnames): - if not isinstance(decl, info.Variable): - continue - yield decl - else: - raise ValueError(f'unsupported kind {kind!r}') - - -def globals(dirnames, known, *, - kind=None, # Use the default. - _iter_variables=iter_variables, - ): - """Return a list of (StaticVar, <supported>) for each found global var.""" - for found in _iter_variables(kind, known=known, dirnames=dirnames): - if not found.isglobal: - continue - yield found diff --git a/Tools/c-analyzer/c_globals/show.py b/Tools/c-analyzer/c_globals/show.py deleted file mode 100644 index f4298b1..0000000 --- a/Tools/c-analyzer/c_globals/show.py +++ /dev/null @@ -1,16 +0,0 @@ - -def basic(globals, *, - _print=print): - """Print each row simply.""" - for variable in globals: - if variable.funcname: - line = f'{variable.filename}:{variable.funcname}():{variable.name}' - else: - line = f'{variable.filename}:{variable.name}' - vartype = variable.vartype - #if vartype.startswith('static '): - # vartype = vartype.partition(' ')[2] - #else: - # vartype = '=' + vartype - line = f'{line:<64} {vartype}' - _print(line) diff --git a/Tools/c-analyzer/c_symbols/resolve.py b/Tools/c-analyzer/c_symbols/resolve.py deleted file mode 100644 index 56210ce..0000000 --- a/Tools/c-analyzer/c_symbols/resolve.py +++ /dev/null @@ -1,147 +0,0 @@ -import os.path - -from c_analyzer_common import files -from c_analyzer_common.info import UNKNOWN -from c_parser import declarations, info -from .info import Symbol -from .source import _find_symbol - - -# XXX need tests: -# * look_up_known_symbol() -# * symbol_from_source() -# * get_resolver() -# * symbols_to_variables() - -def look_up_known_symbol(symbol, knownvars, *, - match_files=(lambda f1, f2: f1 == f2), - ): - """Return the known variable matching the given symbol. - - "knownvars" is a mapping of common.ID to parser.Variable. - - "match_files" is used to verify if two filenames point to - the same file. - """ - if not knownvars: - return None - - if symbol.funcname == UNKNOWN: - if not symbol.filename or symbol.filename == UNKNOWN: - for varid in knownvars: - if not varid.funcname: - continue - if varid.name == symbol.name: - return knownvars[varid] - else: - return None - else: - for varid in knownvars: - if not varid.funcname: - continue - if not match_files(varid.filename, symbol.filename): - continue - if varid.name == symbol.name: - return knownvars[varid] - else: - return None - elif not symbol.filename or symbol.filename == UNKNOWN: - raise NotImplementedError - else: - return knownvars.get(symbol.id) - - -def find_in_source(symbol, dirnames, *, - _perfilecache={}, - _find_symbol=_find_symbol, - _iter_files=files.iter_files_by_suffix, - ): - """Return the Variable matching the given Symbol. - - If there is no match then return None. - """ - if symbol.filename and symbol.filename != UNKNOWN: - filenames = [symbol.filename] - else: - filenames = _iter_files(dirnames, ('.c', '.h')) - - if symbol.funcname and symbol.funcname != UNKNOWN: - raise NotImplementedError - - (filename, funcname, decl - ) = _find_symbol(symbol.name, filenames, _perfilecache) - if filename == UNKNOWN: - return None - return info.Variable.from_parts(filename, funcname, symbol.name, decl) - - -def get_resolver(knownvars=None, dirnames=None, *, - _look_up_known=look_up_known_symbol, - _from_source=find_in_source, - ): - """Return a "resolver" func for the given known vars and dirnames. - - The func takes a single Symbol and returns a corresponding Variable. - If the symbol was located then the variable will be valid, populated - with the corresponding information. Otherwise None is returned. - """ - if knownvars: - knownvars = dict(knownvars) # a copy - def resolve_known(symbol): - found = _look_up_known(symbol, knownvars) - if found is None: - return None - elif symbol.funcname == UNKNOWN: - knownvars.pop(found.id) - elif not symbol.filename or symbol.filename == UNKNOWN: - knownvars.pop(found.id) - return found - if dirnames: - def resolve(symbol): - found = resolve_known(symbol) - if found is None: - return None - #return _from_source(symbol, dirnames) - else: - for dirname in dirnames: - if not dirname.endswith(os.path.sep): - dirname += os.path.sep - if found.filename.startswith(dirname): - break - else: - return None - return found - else: - resolve = resolve_known - elif dirnames: - def resolve(symbol): - return _from_source(symbol, dirnames) - else: - def resolve(symbol): - return None - return resolve - - -def symbols_to_variables(symbols, *, - resolve=(lambda s: look_up_known_symbol(s, None)), - ): - """Yield the variable the matches each given symbol. - - Use get_resolver() for a "resolve" func to use. - """ - for symbol in symbols: - if isinstance(symbol, info.Variable): - # XXX validate? - yield symbol - continue - if symbol.kind != Symbol.KIND.VARIABLE: - continue - resolved = resolve(symbol) - if resolved is None: - #raise NotImplementedError(symbol) - resolved = info.Variable( - id=symbol.id, - storage=UNKNOWN, - vartype=UNKNOWN, - ) - yield resolved diff --git a/Tools/c-analyzer/c_symbols/source.py b/Tools/c-analyzer/c_symbols/source.py deleted file mode 100644 index a724810..0000000 --- a/Tools/c-analyzer/c_symbols/source.py +++ /dev/null @@ -1,58 +0,0 @@ -from c_analyzer_common import files -from c_analyzer_common.info import UNKNOWN -from c_parser import declarations - - -# XXX need tests: -# * find_symbol() - -def find_symbol(name, dirnames, *, - _perfilecache, - _iter_files=files.iter_files_by_suffix, - **kwargs - ): - """Return (filename, funcname, vartype) for the matching Symbol.""" - filenames = _iter_files(dirnames, ('.c', '.h')) - return _find_symbol(name, filenames, _perfilecache, **kwargs) - - -def _get_symbols(filename, *, - _iter_variables=declarations.iter_variables, - ): - """Return the list of Symbols found in the given file.""" - symbols = {} - for funcname, name, vartype in _iter_variables(filename): - if not funcname: - continue - try: - instances = symbols[name] - except KeyError: - instances = symbols[name] = [] - instances.append((funcname, vartype)) - return symbols - - -def _find_symbol(name, filenames, _perfilecache, *, - _get_local_symbols=_get_symbols, - ): - for filename in filenames: - try: - symbols = _perfilecache[filename] - except KeyError: - symbols = _perfilecache[filename] = _get_local_symbols(filename) - - try: - instances = symbols[name] - except KeyError: - continue - - funcname, vartype = instances.pop(0) - if not instances: - symbols.pop(name) - return filename, funcname, vartype - else: - return UNKNOWN, UNKNOWN, UNKNOWN - - -def iter_symbols(): - raise NotImplementedError diff --git a/Tools/c-analyzer/c_globals/README b/Tools/c-analyzer/cpython/README index 772b8be..772b8be 100644 --- a/Tools/c-analyzer/c_globals/README +++ b/Tools/c-analyzer/cpython/README diff --git a/Tools/c-analyzer/cpython/__init__.py b/Tools/c-analyzer/cpython/__init__.py new file mode 100644 index 0000000..ae45b42 --- /dev/null +++ b/Tools/c-analyzer/cpython/__init__.py @@ -0,0 +1,29 @@ +import os.path +import sys + + +TOOL_ROOT = os.path.abspath( + os.path.dirname( # c-analyzer/ + os.path.dirname(__file__))) # cpython/ +DATA_DIR = TOOL_ROOT +REPO_ROOT = ( + os.path.dirname( # .. + os.path.dirname(TOOL_ROOT))) # Tools/ + +INCLUDE_DIRS = [os.path.join(REPO_ROOT, name) for name in [ + 'Include', + ]] +SOURCE_DIRS = [os.path.join(REPO_ROOT, name) for name in [ + 'Python', + 'Parser', + 'Objects', + 'Modules', + ]] + +#PYTHON = os.path.join(REPO_ROOT, 'python') +PYTHON = sys.executable + + +# Clean up the namespace. +del sys +del os diff --git a/Tools/c-analyzer/c_globals/__main__.py b/Tools/c-analyzer/cpython/__main__.py index 9570fb6..6b0f9bc 100644 --- a/Tools/c-analyzer/c_globals/__main__.py +++ b/Tools/c-analyzer/cpython/__main__.py @@ -1,42 +1,42 @@ import argparse -import os.path import re import sys -from c_analyzer_common import SOURCE_DIRS, REPO_ROOT -from c_analyzer_common.info import UNKNOWN -from c_analyzer_common.known import ( +from c_analyzer.common import show +from c_analyzer.common.info import UNKNOWN + +from . import SOURCE_DIRS +from .find import supported_vars +from .known import ( from_file as known_from_file, DATA_FILE as KNOWN_FILE, ) -from . import find, show -from .supported import is_supported, ignored_from_file, IGNORED_FILE, _is_object +from .supported import IGNORED_FILE -def _match_unused_global(variable, knownvars, used): - found = [] - for varid in knownvars: - if varid in used: - continue - if varid.funcname is not None: - continue - if varid.name != variable.name: - continue - if variable.filename and variable.filename != UNKNOWN: - if variable.filename == varid.filename: +def _check_results(unknown, knownvars, used): + def _match_unused_global(variable): + found = [] + for varid in knownvars: + if varid in used: + continue + if varid.funcname is not None: + continue + if varid.name != variable.name: + continue + if variable.filename and variable.filename != UNKNOWN: + if variable.filename == varid.filename: + found.append(varid) + else: found.append(varid) - else: - found.append(varid) - return found - + return found -def _check_results(unknown, knownvars, used): badknown = set() for variable in sorted(unknown): msg = None if variable.funcname != UNKNOWN: msg = f'could not find global symbol {variable.id}' - elif m := _match_unused_global(variable, knownvars, used): + elif m := _match_unused_global(variable): assert isinstance(m, list) badknown.update(m) elif variable.name in ('completed', 'id'): # XXX Figure out where these variables are. @@ -65,32 +65,29 @@ def _check_results(unknown, knownvars, used): raise Exception('could not find all symbols') -def _find_globals(dirnames, known, ignored): - if dirnames == SOURCE_DIRS: - dirnames = [os.path.relpath(d, REPO_ROOT) for d in dirnames] - - ignored = ignored_from_file(ignored) - known = known_from_file(known) +# XXX Move this check to its own command. +def cmd_check_cache(cmd, *, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + _known_from_file=known_from_file, + _find=supported_vars, + ): + known = _known_from_file(known) used = set() unknown = set() - knownvars = (known or {}).get('variables') - for variable in find.globals_from_binary(knownvars=knownvars, - dirnames=dirnames): - #for variable in find.globals(dirnames, known, kind='platform'): - if variable.vartype == UNKNOWN: - unknown.add(variable) + for var, supported in _find(known=known, ignored=ignored): + if supported is None: + unknown.add(var) continue - yield variable, is_supported(variable, ignored, known) - used.add(variable.id) - - #_check_results(unknown, knownvars, used) + used.add(var.id) + _check_results(unknown, known['variables'], used) -def cmd_check(cmd, dirs=SOURCE_DIRS, *, - ignored=IGNORED_FILE, +def cmd_check(cmd, *, known=KNOWN_FILE, - _find=_find_globals, + ignored=IGNORED_FILE, + _find=supported_vars, _show=show.basic, _print=print, ): @@ -100,7 +97,11 @@ def cmd_check(cmd, dirs=SOURCE_DIRS, *, In the failure case, the list of unsupported variables will be printed out. """ - unsupported = [v for v, s in _find(dirs, known, ignored) if not s] + unsupported = [] + for var, supported in _find(known=known, ignored=ignored): + if not supported: + unsupported.append(var) + if not unsupported: #_print('okay') return @@ -112,11 +113,11 @@ def cmd_check(cmd, dirs=SOURCE_DIRS, *, sys.exit(1) -def cmd_show(cmd, dirs=SOURCE_DIRS, *, - ignored=IGNORED_FILE, +def cmd_show(cmd, *, known=KNOWN_FILE, + ignored=IGNORED_FILE, skip_objects=False, - _find=_find_globals, + _find=supported_vars, _show=show.basic, _print=print, ): @@ -127,10 +128,12 @@ def cmd_show(cmd, dirs=SOURCE_DIRS, *, """ allsupported = [] allunsupported = [] - for found, supported in _find(dirs, known, ignored): - if skip_objects: # XXX Support proper filters instead. - if _is_object(found.vartype): - continue + for found, supported in _find(known=known, + ignored=ignored, + skip_objects=skip_objects, + ): + if supported is None: + continue (allsupported if supported else allunsupported ).append(found) @@ -165,9 +168,9 @@ def parse_args(prog=PROG, argv=sys.argv[1:], *, _fail=None): common.add_argument('--known', metavar='FILE', default=KNOWN_FILE, help='path to file that lists known types') - common.add_argument('dirs', metavar='DIR', nargs='*', - default=SOURCE_DIRS, - help='a directory to check') + #common.add_argument('dirs', metavar='DIR', nargs='*', + # default=SOURCE_DIRS, + # help='a directory to check') parser = argparse.ArgumentParser( prog=prog, diff --git a/Tools/c-analyzer/c_analyzer_common/_generate.py b/Tools/c-analyzer/cpython/_generate.py index 9b2fc9e..4c340ac 100644 --- a/Tools/c-analyzer/c_analyzer_common/_generate.py +++ b/Tools/c-analyzer/cpython/_generate.py @@ -1,15 +1,16 @@ # The code here consists of hacks for pre-populating the known.tsv file. -from c_parser.preprocessor import _iter_clean_lines -from c_parser.naive import ( +from c_analyzer.parser.preprocessor import _iter_clean_lines +from c_analyzer.parser.naive import ( iter_variables, parse_variable_declaration, find_variables, ) -from c_parser.info import Variable +from c_analyzer.common.known import HEADER as KNOWN_HEADER +from c_analyzer.common.info import UNKNOWN, ID +from c_analyzer.variables import Variable +from c_analyzer.util import write_tsv from . import SOURCE_DIRS, REPO_ROOT -from .known import DATA_FILE as KNOWN_FILE, HEADER as KNOWN_HEADER -from .info import UNKNOWN, ID -from .util import write_tsv +from .known import DATA_FILE as KNOWN_FILE from .files import iter_cpython_files diff --git a/Tools/c-analyzer/cpython/files.py b/Tools/c-analyzer/cpython/files.py new file mode 100644 index 0000000..543097a --- /dev/null +++ b/Tools/c-analyzer/cpython/files.py @@ -0,0 +1,29 @@ +from c_analyzer.common.files import ( + C_SOURCE_SUFFIXES, walk_tree, iter_files_by_suffix, + ) + +from . import SOURCE_DIRS, REPO_ROOT + +# XXX need tests: +# * iter_files() + + +def iter_files(*, + walk=walk_tree, + _files=iter_files_by_suffix, + ): + """Yield each file in the tree for each of the given directory names.""" + excludedtrees = [ + os.path.join('Include', 'cpython', ''), + ] + def is_excluded(filename): + for root in excludedtrees: + if filename.startswith(root): + return True + return False + for filename in _files(SOURCE_DIRS, C_SOURCE_SUFFIXES, REPO_ROOT, + walk=walk, + ): + if is_excluded(filename): + continue + yield filename diff --git a/Tools/c-analyzer/cpython/find.py b/Tools/c-analyzer/cpython/find.py new file mode 100644 index 0000000..a7bc0b4 --- /dev/null +++ b/Tools/c-analyzer/cpython/find.py @@ -0,0 +1,101 @@ +import os.path + +from c_analyzer.common import files +from c_analyzer.common.info import UNKNOWN, ID +from c_analyzer.variables import find as _common + +from . import SOURCE_DIRS, PYTHON, REPO_ROOT +from .known import ( + from_file as known_from_file, + DATA_FILE as KNOWN_FILE, + ) +from .supported import ( + ignored_from_file, IGNORED_FILE, is_supported, _is_object, + ) + +# XXX need tests: +# * vars_from_binary() +# * vars_from_source() +# * supported_vars() + + +def _handle_id(filename, funcname, name, *, + _relpath=os.path.relpath, + ): + filename = _relpath(filename, REPO_ROOT) + return ID(filename, funcname, name) + + +def vars_from_binary(*, + known=KNOWN_FILE, + _known_from_file=known_from_file, + _iter_files=files.iter_files_by_suffix, + _iter_vars=_common.vars_from_binary, + ): + """Yield a Variable for each found Symbol. + + Details are filled in from the given "known" variables and types. + """ + if isinstance(known, str): + known = _known_from_file(known) + dirnames = SOURCE_DIRS + suffixes = ('.c',) + filenames = _iter_files(dirnames, suffixes) + # XXX For now we only use known variables (no source lookup). + filenames = None + yield from _iter_vars(PYTHON, + known=known, + filenames=filenames, + handle_id=_handle_id, + check_filename=(lambda n: True), + ) + + +def vars_from_source(*, + preprocessed=None, + known=KNOWN_FILE, + _known_from_file=known_from_file, + _iter_files=files.iter_files_by_suffix, + _iter_vars=_common.vars_from_source, + ): + """Yield a Variable for each declaration in the raw source code. + + Details are filled in from the given "known" variables and types. + """ + if isinstance(known, str): + known = _known_from_file(known) + dirnames = SOURCE_DIRS + suffixes = ('.c',) + filenames = _iter_files(dirnames, suffixes) + yield from _iter_vars(filenames, + preprocessed=preprocessed, + known=known, + handle_id=_handle_id, + ) + + +def supported_vars(*, + known=KNOWN_FILE, + ignored=IGNORED_FILE, + skip_objects=False, + _known_from_file=known_from_file, + _ignored_from_file=ignored_from_file, + _iter_vars=vars_from_binary, + _is_supported=is_supported, + ): + """Yield (var, is supported) for each found variable.""" + if isinstance(known, str): + known = _known_from_file(known) + if isinstance(ignored, str): + ignored = _ignored_from_file(ignored) + + for var in _iter_vars(known=known): + if not var.isglobal: + continue + elif var.vartype == UNKNOWN: + yield var, None + # XXX Support proper filters instead. + elif skip_objects and _is_object(found.vartype): + continue + else: + yield var, _is_supported(var, ignored, known) diff --git a/Tools/c-analyzer/cpython/known.py b/Tools/c-analyzer/cpython/known.py new file mode 100644 index 0000000..c3cc2c0 --- /dev/null +++ b/Tools/c-analyzer/cpython/known.py @@ -0,0 +1,66 @@ +import csv +import os.path + +from c_analyzer.parser.declarations import extract_storage +from c_analyzer.variables import known as _common +from c_analyzer.variables.info import Variable + +from . import DATA_DIR + + +# XXX need tests: +# * from_file() +# * look_up_variable() + + +DATA_FILE = os.path.join(DATA_DIR, 'known.tsv') + + +def _get_storage(decl, infunc): + # statics + if decl.startswith(('Py_LOCAL(', 'Py_LOCAL_INLINE(')): + return 'static' + if decl.startswith(('_Py_IDENTIFIER(', '_Py_static_string(')): + return 'static' + if decl.startswith('PyDoc_VAR('): + return 'static' + if decl.startswith(('SLOT1BINFULL(', 'SLOT1BIN(')): + return 'static' + if decl.startswith('WRAP_METHOD('): + return 'static' + # public extern + if decl.startswith('PyAPI_DATA('): + return 'extern' + # Fall back to the normal handler. + return extract_storage(decl, infunc=infunc) + + +def _handle_var(varid, decl): +# if varid.name == 'id' and decl == UNKNOWN: +# # None of these are variables. +# decl = 'int id'; + storage = _get_storage(decl, varid.funcname) + return Variable(varid, storage, decl) + + +def from_file(infile=DATA_FILE, *, + _from_file=_common.from_file, + _handle_var=_handle_var, + ): + """Return the info for known declarations in the given file.""" + return _from_file(infile, handle_var=_handle_var) + + +def look_up_variable(varid, knownvars, *, + _lookup=_common.look_up_variable, + ): + """Return the known variable matching the given ID. + + "knownvars" is a mapping of ID to Variable. + + "match_files" is used to verify if two filenames point to + the same file. + + If no match is found then None is returned. + """ + return _lookup(varid, knownvars) diff --git a/Tools/c-analyzer/c_globals/supported.py b/Tools/c-analyzer/cpython/supported.py index d185daa..18786ee 100644 --- a/Tools/c-analyzer/c_globals/supported.py +++ b/Tools/c-analyzer/cpython/supported.py @@ -1,9 +1,13 @@ import os.path import re -from c_analyzer_common import DATA_DIR -from c_analyzer_common.info import ID -from c_analyzer_common.util import read_tsv, write_tsv +from c_analyzer.common.info import ID +from c_analyzer.common.util import read_tsv, write_tsv + +from . import DATA_DIR + +# XXX need tests: +# * generate / script IGNORED_FILE = os.path.join(DATA_DIR, 'ignored.tsv') @@ -379,11 +383,12 @@ def _generate_ignored_file(variables, filename=None, *, if __name__ == '__main__': - from c_analyzer_common import SOURCE_DIRS - from c_analyzer_common.known import ( + from cpython import SOURCE_DIRS + from cpython.known import ( from_file as known_from_file, DATA_FILE as KNOWN_FILE, ) + # XXX This is wrong! from . import find known = known_from_file(KNOWN_FILE) knownvars = (known or {}).get('variables') |