diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2019-10-19 02:00:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-19 02:00:04 (GMT) |
commit | e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1 (patch) | |
tree | 071224bbded262901b9742eb82c5d82d2f744fe1 /Tools/c-analyzer/c_analyzer | |
parent | ea55c51bd937f6019c35b39b87029644e469c059 (diff) | |
download | cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.zip cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.gz cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.bz2 |
bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It is also preparation for getting the variables from the source (cross-platform) rather than from the symbols.
The change only touches the tool (and its tests).
Diffstat (limited to 'Tools/c-analyzer/c_analyzer')
20 files changed, 2284 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_analyzer/__init__.py b/Tools/c-analyzer/c_analyzer/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/common/__init__.py b/Tools/c-analyzer/c_analyzer/common/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/common/files.py b/Tools/c-analyzer/c_analyzer/common/files.py new file mode 100644 index 0000000..ab551a8 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/files.py @@ -0,0 +1,120 @@ +import glob +import os +import os.path + +# XXX need tests: +# * walk_tree() +# * glob_tree() +# * iter_files_by_suffix() + + +C_SOURCE_SUFFIXES = ('.c', '.h') + + +def _walk_tree(root, *, + _walk=os.walk, + ): + # A wrapper around os.walk that resolves the filenames. + for parent, _, names in _walk(root): + for name in names: + yield os.path.join(parent, name) + + +def walk_tree(root, *, + suffix=None, + walk=_walk_tree, + ): + """Yield each file in the tree under the given directory name. + + If "suffix" is provided then only files with that suffix will + be included. + """ + if suffix and not isinstance(suffix, str): + raise ValueError('suffix must be a string') + + for filename in walk(root): + if suffix and not filename.endswith(suffix): + continue + yield filename + + +def glob_tree(root, *, + suffix=None, + _glob=glob.iglob, + ): + """Yield each file in the tree under the given directory name. + + If "suffix" is provided then only files with that suffix will + be included. 
+ """ + suffix = suffix or '' + if not isinstance(suffix, str): + raise ValueError('suffix must be a string') + + for filename in _glob(f'{root}/*{suffix}'): + yield filename + for filename in _glob(f'{root}/**/*{suffix}'): + yield filename + + +def iter_files(root, suffix=None, relparent=None, *, + get_files=os.walk, + _glob=glob_tree, + _walk=walk_tree, + ): + """Yield each file in the tree under the given directory name. + + If "root" is a non-string iterable then do the same for each of + those trees. + + If "suffix" is provided then only files with that suffix will + be included. + + if "relparent" is provided then it is used to resolve each + filename as a relative path. + """ + if not isinstance(root, str): + roots = root + for root in roots: + yield from iter_files(root, suffix, relparent, + get_files=get_files, + _glob=_glob, _walk=_walk) + return + + # Use the right "walk" function. + if get_files in (glob.glob, glob.iglob, glob_tree): + get_files = _glob + else: + _files = _walk_tree if get_files in (os.walk, walk_tree) else get_files + get_files = (lambda *a, **k: _walk(*a, walk=_files, **k)) + + # Handle a single suffix. + if suffix and not isinstance(suffix, str): + filenames = get_files(root) + suffix = tuple(suffix) + else: + filenames = get_files(root, suffix=suffix) + suffix = None + + for filename in filenames: + if suffix and not isinstance(suffix, str): # multiple suffixes + if not filename.endswith(suffix): + continue + if relparent: + filename = os.path.relpath(filename, relparent) + yield filename + + +def iter_files_by_suffix(root, suffixes, relparent=None, *, + walk=walk_tree, + _iter_files=iter_files, + ): + """Yield each file in the tree that has the given suffixes. + + Unlike iter_files(), the results are in the original suffix order. + """ + if isinstance(suffixes, str): + suffixes = [suffixes] + # XXX Ignore repeated suffixes? 
+ for suffix in suffixes: + yield from _iter_files(root, suffix, relparent) diff --git a/Tools/c-analyzer/c_analyzer/common/info.py b/Tools/c-analyzer/c_analyzer/common/info.py new file mode 100644 index 0000000..3f3f8c5 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/info.py @@ -0,0 +1,138 @@ +from collections import namedtuple +import re + +from .util import classonly, _NTBase + +# XXX need tests: +# * ID.match() + + +UNKNOWN = '???' + +NAME_RE = re.compile(r'^([a-zA-Z]|_\w*[a-zA-Z]\w*|[a-zA-Z]\w*)$') + + +class ID(_NTBase, namedtuple('ID', 'filename funcname name')): + """A unique ID for a single symbol or declaration.""" + + __slots__ = () + # XXX Add optional conditions (tuple of strings) field. + #conditions = Slot() + + @classonly + def from_raw(cls, raw): + if not raw: + return None + if isinstance(raw, str): + return cls(None, None, raw) + try: + name, = raw + filename = None + except ValueError: + try: + filename, name = raw + except ValueError: + return super().from_raw(raw) + return cls(filename, None, name) + + def __new__(cls, filename, funcname, name): + self = super().__new__( + cls, + filename=str(filename) if filename else None, + funcname=str(funcname) if funcname else None, + name=str(name) if name else None, + ) + #cls.conditions.set(self, tuple(str(s) if s else None + # for s in conditions or ())) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + if not self.name: + raise TypeError('missing name') + else: + if not NAME_RE.match(self.name): + raise ValueError( + f'name must be an identifier, got {self.name!r}') + + # Symbols from a binary might not have filename/funcname info. + + if self.funcname: + if not self.filename: + raise TypeError('missing filename') + if not NAME_RE.match(self.funcname) and self.funcname != UNKNOWN: + raise ValueError( + f'name must be an identifier, got {self.funcname!r}') + + # XXX Require the filename (at least UNKONWN)? + # XXX Check the filename? 
+ + @property + def islocal(self): + return self.funcname is not None + + def match(self, other, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return True if the two match. + + At least one of the two must be completely valid (no UNKNOWN + anywhere). Otherwise False is returned. The remaining one + *may* have UNKNOWN for both funcname and filename. It must + have a valid name though. + + The caller is responsible for knowing which of the two is valid + (and which to use if both are valid). + """ + # First check the name. + if self.name is None: + return False + if other.name != self.name: + return False + + # Then check the filename. + if self.filename is None: + return False + if other.filename is None: + return False + if self.filename == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + elif self.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif other.filename == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + elif other.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif not match_files(self.filename, other.filename): + return False + + # Finally, check the funcname. + if self.funcname == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + else: + return other.funcname is not None + elif other.funcname == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + else: + return self.funcname is not None + elif self.funcname == other.funcname: + # Both are valid. 
+ return True + + return False diff --git a/Tools/c-analyzer/c_analyzer/common/show.py b/Tools/c-analyzer/c_analyzer/common/show.py new file mode 100644 index 0000000..5f3cb1c --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/show.py @@ -0,0 +1,11 @@ + +def basic(variables, *, + _print=print): + """Print each row simply.""" + for var in variables: + if var.funcname: + line = f'{var.filename}:{var.funcname}():{var.name}' + else: + line = f'{var.filename}:{var.name}' + line = f'{line:<64} {var.vartype}' + _print(line) diff --git a/Tools/c-analyzer/c_analyzer/common/util.py b/Tools/c-analyzer/c_analyzer/common/util.py new file mode 100644 index 0000000..43d0bb6 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/util.py @@ -0,0 +1,243 @@ +import csv +import subprocess + + +_NOT_SET = object() + + +def run_cmd(argv, **kwargs): + proc = subprocess.run( + argv, + #capture_output=True, + #stderr=subprocess.STDOUT, + stdout=subprocess.PIPE, + text=True, + check=True, + **kwargs + ) + return proc.stdout + + +def read_tsv(infile, header, *, + _open=open, + _get_reader=csv.reader, + ): + """Yield each row of the given TSV (tab-separated) file.""" + if isinstance(infile, str): + with _open(infile, newline='') as infile: + yield from read_tsv(infile, header, + _open=_open, + _get_reader=_get_reader, + ) + return + lines = iter(infile) + + # Validate the header. 
+ try: + actualheader = next(lines).strip() + except StopIteration: + actualheader = '' + if actualheader != header: + raise ValueError(f'bad header {actualheader!r}') + + for row in _get_reader(lines, delimiter='\t'): + yield tuple(v.strip() for v in row) + + +def write_tsv(outfile, header, rows, *, + _open=open, + _get_writer=csv.writer, + ): + """Write each of the rows to the given TSV (tab-separated) file.""" + if isinstance(outfile, str): + with _open(outfile, 'w', newline='') as outfile: + return write_tsv(outfile, header, rows, + _open=_open, + _get_writer=_get_writer, + ) + + if isinstance(header, str): + header = header.split('\t') + writer = _get_writer(outfile, delimiter='\t') + writer.writerow(header) + for row in rows: + writer.writerow('' if v is None else str(v) + for v in row) + + +class Slot: + """A descriptor that provides a slot. + + This is useful for types that can't have slots via __slots__, + e.g. tuple subclasses. + """ + + __slots__ = ('initial', 'default', 'readonly', 'instances', 'name') + + def __init__(self, initial=_NOT_SET, *, + default=_NOT_SET, + readonly=False, + ): + self.initial = initial + self.default = default + self.readonly = readonly + + # The instance cache is not inherently tied to the normal + # lifetime of the instances. So must do something in order to + # avoid keeping the instances alive by holding a reference here. + # Ideally we would use weakref.WeakValueDictionary to do this. + # However, most builtin types do not support weakrefs. So + # instead we monkey-patch __del__ on the attached class to clear + # the instance. 
+ self.instances = {} + self.name = None + + def __set_name__(self, cls, name): + if self.name is not None: + raise TypeError('already used') + self.name = name + try: + slotnames = cls.__slot_names__ + except AttributeError: + slotnames = cls.__slot_names__ = [] + slotnames.append(name) + self._ensure___del__(cls, slotnames) + + def __get__(self, obj, cls): + if obj is None: # called on the class + return self + try: + value = self.instances[id(obj)] + except KeyError: + if self.initial is _NOT_SET: + value = self.default + else: + value = self.initial + self.instances[id(obj)] = value + if value is _NOT_SET: + raise AttributeError(self.name) + # XXX Optionally make a copy? + return value + + def __set__(self, obj, value): + if self.readonly: + raise AttributeError(f'{self.name} is readonly') + # XXX Optionally coerce? + self.instances[id(obj)] = value + + def __delete__(self, obj): + if self.readonly: + raise AttributeError(f'{self.name} is readonly') + self.instances[id(obj)] = self.default # XXX refleak? + + def _ensure___del__(self, cls, slotnames): # See the comment in __init__(). + try: + old___del__ = cls.__del__ + except AttributeError: + old___del__ = (lambda s: None) + else: + if getattr(old___del__, '_slotted', False): + return + + def __del__(_self): + for name in slotnames: + delattr(_self, name) + old___del__(_self) + __del__._slotted = True + cls.__del__ = __del__ + + def set(self, obj, value): + """Update the cached value for an object. + + This works even if the descriptor is read-only. This is + particularly useful when initializing the object (e.g. in + its __new__ or __init__). + """ + self.instances[id(obj)] = value + + +class classonly: + """A non-data descriptor that makes a value only visible on the class. + + This is like the "classmethod" builtin, but does not show up on + instances of the class. It may be used as a decorator. 
+ """ + + def __init__(self, value): + self.value = value + self.getter = classmethod(value).__get__ + self.name = None + + def __set_name__(self, cls, name): + if self.name is not None: + raise TypeError('already used') + self.name = name + + def __get__(self, obj, cls): + if obj is not None: + raise AttributeError(self.name) + # called on the class + return self.getter(None, cls) + + +class _NTBase: + + __slots__ = () + + @classonly + def from_raw(cls, raw): + if not raw: + return None + elif isinstance(raw, cls): + return raw + elif isinstance(raw, str): + return cls.from_string(raw) + else: + if hasattr(raw, 'items'): + return cls(**raw) + try: + args = tuple(raw) + except TypeError: + pass + else: + return cls(*args) + raise NotImplementedError + + @classonly + def from_string(cls, value): + """Return a new instance based on the given string.""" + raise NotImplementedError + + @classmethod + def _make(cls, iterable): # The default _make() is not subclass-friendly. + return cls.__new__(cls, *iterable) + + # XXX Always validate? + #def __init__(self, *args, **kwargs): + # self.validate() + + # XXX The default __repr__() is not subclass-friendly (where the name changes). + #def __repr__(self): + # _, _, sig = super().__repr__().partition('(') + # return f'{self.__class__.__name__}({sig}' + + # To make sorting work with None: + def __lt__(self, other): + try: + return super().__lt__(other) + except TypeError: + if None in self: + return True + elif None in other: + return False + else: + raise + + def validate(self): + return + + # XXX Always validate? 
+ #def _replace(self, **kwargs): + # obj = super()._replace(**kwargs) + # obj.validate() + # return obj diff --git a/Tools/c-analyzer/c_analyzer/parser/__init__.py b/Tools/c-analyzer/c_analyzer/parser/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/parser/declarations.py b/Tools/c-analyzer/c_analyzer/parser/declarations.py new file mode 100644 index 0000000..f37072c --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/declarations.py @@ -0,0 +1,339 @@ +import re +import shlex +import subprocess + +from ..common.info import UNKNOWN + +from . import source + + +IDENTIFIER = r'(?:[a-zA-z]|_+[a-zA-Z0-9]\w*)' + +TYPE_QUAL = r'(?:const|volatile)' + +VAR_TYPE_SPEC = r'''(?: + void | + (?: + (?:(?:un)?signed\s+)? + (?: + char | + short | + int | + long | + long\s+int | + long\s+long + ) | + ) | + float | + double | + {IDENTIFIER} | + (?:struct|union)\s+{IDENTIFIER} + )''' + +POINTER = rf'''(?: + (?:\s+const)?\s*[*] + )''' + +#STRUCT = r'''(?: +# (?:struct|(struct\s+%s))\s*[{] +# [^}]* +# [}] +# )''' % (IDENTIFIER) +#UNION = r'''(?: +# (?:union|(union\s+%s))\s*[{] +# [^}]* +# [}] +# )''' % (IDENTIFIER) +#DECL_SPEC = rf'''(?: +# ({VAR_TYPE_SPEC}) | +# ({STRUCT}) | +# ({UNION}) +# )''' + +FUNC_START = rf'''(?: + (?: + (?: + extern | + static | + static\s+inline + )\s+ + )? + #(?:const\s+)? + {VAR_TYPE_SPEC} + )''' +#GLOBAL_VAR_START = rf'''(?: +# (?: +# (?: +# extern | +# static +# )\s+ +# )? +# (?: +# {TYPE_QUAL} +# (?:\s+{TYPE_QUAL})? +# )?\s+ +# {VAR_TYPE_SPEC} +# )''' +GLOBAL_DECL_START_RE = re.compile(rf''' + ^ + (?: + ({FUNC_START}) + ) + ''', re.VERBOSE) + +LOCAL_VAR_START = rf'''(?: + (?: + (?: + register | + static + )\s+ + )? + (?: + (?: + {TYPE_QUAL} + (?:\s+{TYPE_QUAL})? + )\s+ + )? + {VAR_TYPE_SPEC} + {POINTER}? 
+ )''' +LOCAL_STMT_START_RE = re.compile(rf''' + ^ + (?: + ({LOCAL_VAR_START}) + ) + ''', re.VERBOSE) + + +def iter_global_declarations(lines): + """Yield (decl, body) for each global declaration in the given lines. + + For function definitions the header is reduced to one line and + the body is provided as-is. For other compound declarations (e.g. + struct) the entire declaration is reduced to one line and "body" + is None. Likewise for simple declarations (e.g. variables). + + Declarations inside function bodies are ignored, though their text + is provided in the function body. + """ + # XXX Bail out upon bogus syntax. + lines = source.iter_clean_lines(lines) + for line in lines: + if not GLOBAL_DECL_START_RE.match(line): + continue + # We only need functions here, since we only need locals for now. + if line.endswith(';'): + continue + if line.endswith('{') and '(' not in line: + continue + + # Capture the function. + # (assume no func is a one-liner) + decl = line + while '{' not in line: # assume no inline structs, etc. + try: + line = next(lines) + except StopIteration: + return + decl += ' ' + line + + body, end = _extract_block(lines) + if end is None: + return + assert end == '}' + yield (f'{decl}\n{body}\n{end}', body) + + +def iter_local_statements(lines): + """Yield (lines, blocks) for each statement in the given lines. + + For simple statements, "blocks" is None and the statement is reduced + to a single line. For compound statements, "blocks" is a pair of + (header, body) for each block in the statement. The headers are + reduced to a single line each, but the bpdies are provided as-is. + """ + # XXX Bail out upon bogus syntax. + lines = source.iter_clean_lines(lines) + for line in lines: + if not LOCAL_STMT_START_RE.match(line): + continue + + stmt = line + blocks = None + if not line.endswith(';'): + # XXX Support compound & multiline simple statements. 
+ #blocks = [] + continue + + yield (stmt, blocks) + + +def _extract_block(lines): + end = None + depth = 1 + body = [] + for line in lines: + depth += line.count('{') - line.count('}') + if depth == 0: + end = line + break + body.append(line) + return '\n'.join(body), end + + +def parse_func(stmt, body): + """Return (name, signature) for the given function definition.""" + header, _, end = stmt.partition(body) + assert end.strip() == '}' + assert header.strip().endswith('{') + header, _, _= header.rpartition('{') + + signature = ' '.join(header.strip().splitlines()) + + _, _, name = signature.split('(')[0].strip().rpartition(' ') + assert name + + return name, signature + + +#TYPE_SPEC = rf'''(?: +# )''' +#VAR_DECLARATOR = rf'''(?: +# )''' +#VAR_DECL = rf'''(?: +# {TYPE_SPEC}+ +# {VAR_DECLARATOR} +# \s* +# )''' +#VAR_DECLARATION = rf'''(?: +# {VAR_DECL} +# (?: = [^=] [^;]* )? +# ; +# )''' +# +# +#def parse_variable(decl, *, inFunc=False): +# """Return [(name, storage, vartype)] for the given variable declaration.""" +# ... + + +def _parse_var(stmt): + """Return (name, vartype) for the given variable declaration.""" + stmt = stmt.rstrip(';') + m = LOCAL_STMT_START_RE.match(stmt) + assert m + vartype = m.group(0) + name = stmt[len(vartype):].partition('=')[0].strip() + + if name.startswith('('): + name, _, after = name[1:].partition(')') + assert after + name = name.replace('*', '* ') + inside, _, name = name.strip().rpartition(' ') + vartype = f'{vartype} ({inside.strip()}){after}' + else: + name = name.replace('*', '* ') + before, _, name = name.rpartition(' ') + vartype = f'{vartype} {before}' + + vartype = vartype.strip() + while ' ' in vartype: + vartype = vartype.replace(' ', ' ') + + return name, vartype + + +def extract_storage(decl, *, infunc=None): + """Return (storage, vartype) based on the given declaration. + + The default storage is "implicit" (or "local" if infunc is True). 
+ """ + if decl == UNKNOWN: + return decl + if decl.startswith('static '): + return 'static' + #return 'static', decl.partition(' ')[2].strip() + elif decl.startswith('extern '): + return 'extern' + #return 'extern', decl.partition(' ')[2].strip() + elif re.match('.*\b(static|extern)\b', decl): + raise NotImplementedError + elif infunc: + return 'local' + else: + return 'implicit' + + +def parse_compound(stmt, blocks): + """Return (headers, bodies) for the given compound statement.""" + # XXX Identify declarations inside compound statements + # (if/switch/for/while). + raise NotImplementedError + + +def iter_variables(filename, *, + preprocessed=False, + _iter_source_lines=source.iter_lines, + _iter_global=iter_global_declarations, + _iter_local=iter_local_statements, + _parse_func=parse_func, + _parse_var=_parse_var, + _parse_compound=parse_compound, + ): + """Yield (funcname, name, vartype) for every variable in the given file.""" + if preprocessed: + raise NotImplementedError + lines = _iter_source_lines(filename) + for stmt, body in _iter_global(lines): + # At the file top-level we only have to worry about vars & funcs. 
+ if not body: + name, vartype = _parse_var(stmt) + if name: + yield (None, name, vartype) + else: + funcname, _ = _parse_func(stmt, body) + localvars = _iter_locals(body, + _iter_statements=_iter_local, + _parse_var=_parse_var, + _parse_compound=_parse_compound, + ) + for name, vartype in localvars: + yield (funcname, name, vartype) + + +def _iter_locals(lines, *, + _iter_statements=iter_local_statements, + _parse_var=_parse_var, + _parse_compound=parse_compound, + ): + compound = [lines] + while compound: + body = compound.pop(0) + bodylines = body.splitlines() + for stmt, blocks in _iter_statements(bodylines): + if not blocks: + name, vartype = _parse_var(stmt) + if name: + yield (name, vartype) + else: + headers, bodies = _parse_compound(stmt, blocks) + for header in headers: + for line in header: + name, vartype = _parse_var(line) + if name: + yield (name, vartype) + compound.extend(bodies) + + +def iter_all(filename, *, + preprocessed=False, + ): + """Yield a Declaration for each one found. + + If there are duplicates, due to preprocessor conditionals, then + they are checked to make sure they are the same. + """ + # XXX For the moment we cheat. + for funcname, name, decl in iter_variables(filename, + preprocessed=preprocessed): + yield 'variable', funcname, name, decl diff --git a/Tools/c-analyzer/c_analyzer/parser/find.py b/Tools/c-analyzer/c_analyzer/parser/find.py new file mode 100644 index 0000000..3860d3d --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/find.py @@ -0,0 +1,107 @@ +from ..common.info import UNKNOWN, ID + +from . 
# ---- Tools/c-analyzer/c_analyzer/parser/find.py (new file in this patch) ----

from ..common.info import UNKNOWN, ID

from . import declarations

# XXX need tests:
# * variables
# * variable
# * variable_from_id


def _iter_vars(filenames, preprocessed, *,
               handle_id=None,
               _iter_decls=declarations.iter_all,
               ):
    """Yield (varid, decl) for every variable declared in the files."""
    if handle_id is None:
        handle_id = ID

    for filename in filenames or ():
        for kind, funcname, name, decl in _iter_decls(filename,
                                                      preprocessed=preprocessed,
                                                      ):
            if kind != 'variable':
                continue
            varid = handle_id(filename, funcname, name)
            yield varid, decl


# XXX Add a "handle_var" arg like we did for get_resolver()?

def variables(*filenames,
              perfilecache=None,
              preprocessed=False,
              known=None,  # for types
              handle_id=None,
              _iter_vars=_iter_vars,
              ):
    """Yield (varid, decl) for each variable found in the given files.

    If "preprocessed" is provided (and not False/None) then it is used
    to decide which tool to use to parse the source code after it runs
    through the C preprocessor.  Otherwise the raw source is parsed.
    """
    # BUG FIX: the original tested "not (filenames[0], str)"; a 2-tuple
    # is always truthy, so the branch never ran and a single iterable
    # argument was never unpacked.  The intent is clearly isinstance().
    if len(filenames) == 1 and not isinstance(filenames[0], str):
        filenames, = filenames

    if perfilecache is None:
        yield from _iter_vars(filenames, preprocessed)
    else:
        # XXX Cache per-file variables (e.g. `{filename: [(varid, decl)]}`).
        raise NotImplementedError


def variable(name, filenames, *,
             local=False,
             perfilecache=None,
             preprocessed=False,
             handle_id=None,
             _iter_vars=variables,
             ):
    """Return (varid, decl) for the first found variable that matches.

    If "local" is True then the first matching local variable in the
    file will always be returned.  To avoid that, pass perfilecache and
    pop each variable from the cache after using it.
    """
    for varid, decl in _iter_vars(filenames,
                                  perfilecache=perfilecache,
                                  preprocessed=preprocessed,
                                  ):
        if varid.name != name:
            continue
        if local:
            if varid.funcname:
                if varid.funcname == UNKNOWN:
                    raise NotImplementedError
                return varid, decl
        elif not varid.funcname:
            return varid, decl
    else:
        return None, None  # No matching variable was found.
def variable_from_id(id, filenames, *,
                     perfilecache=None,
                     preprocessed=False,
                     handle_id=None,
                     _get_var=variable,
                     ):
    """Return (varid, decl) for the first found variable that matches."""
    if isinstance(id, str):
        # A bare name: search every given file for a global.
        name = id
        local = False
    else:
        # An ID-like object may narrow the search.
        if id.funcname == UNKNOWN:
            local = True
        elif id.funcname:
            # Searching for a specific named function isn't supported yet.
            raise NotImplementedError
        else:
            local = False
        name = id.name
        if id.filename and id.filename != UNKNOWN:
            filenames = [id.filename]
    return _get_var(name, filenames,
                    local=local,
                    perfilecache=perfilecache,
                    preprocessed=preprocessed,
                    handle_id=handle_id,
                    )


# ---- Tools/c-analyzer/c_analyzer/parser/naive.py (new file in this patch) ----

import re

from ..common.info import UNKNOWN, ID

from .preprocessor import _iter_clean_lines


_NOT_SET = object()


def get_srclines(filename, *,
                 cache=None,
                 _open=open,
                 _iter_lines=_iter_clean_lines,
                 ):
    """Return the file's lines as a list.

    Each line will have trailing whitespace removed (including newline).

    If a cache is given then it is used.
    """
    if cache is not None:
        try:
            return cache[filename]
        except KeyError:
            pass

    # Preprocessor directives are dropped; the rest is kept, stripped
    # of trailing whitespace.
    with _open(filename) as srcfile:
        srclines = [line.rstrip()
                    for _, line in _iter_lines(srcfile)
                    if not line.startswith('#')]

    if cache is not None:
        cache[filename] = srclines
    return srclines
def parse_variable_declaration(srcline):
    """Return (name, decl) for the given declaration line."""
    # XXX possible false negatives...
    before_eq, assign, _ = srcline.partition('=')
    if not assign:
        if not srcline.endswith(';'):
            return None, None
        before_eq = before_eq.strip(';')
    cleaned = before_eq.strip()
    # The declared name is the last identifier, allowing for a trailing
    # array suffix (e.g. "int spam[3]").
    matched = re.match(r'.*\b(\w+)\s*(?:\[[^\]]*\])?$', cleaned)
    if matched is None:
        return None, None
    return matched.group(1), cleaned


def parse_variable(srcline, funcname=None):
    """Return (varid, decl) for the variable declared on the line (or None)."""
    stripped = srcline.strip()

    # XXX Handle more than just static variables.
    if not stripped.startswith('static '):
        return None, None
    if '(' in stripped and '[' not in stripped:
        # Looks like a function, not a variable.
        return None, None
    return parse_variable_declaration(stripped)


def iter_variables(filename, *,
                   srccache=None,
                   parse_variable=None,
                   _get_srclines=get_srclines,
                   _default_parse_variable=parse_variable,
                   ):
    """Yield (varid, decl) for each variable in the given source file."""
    if parse_variable is None:
        parse_variable = _default_parse_variable

    indent = ''
    prev = ''
    funcname = None
    for line in _get_srclines(filename, cache=srccache):
        # Track whether we are currently inside a function body.
        # NOTE(review): the block structure below was reconstructed from
        # a flattened diff; confirm the placement of this "continue"
        # against the original file.
        if funcname:
            if line == indent + '}':
                funcname = None
            continue
        else:
            if '(' in prev and line == indent + '{':
                if not prev.startswith('__attribute__'):
                    funcname = prev.split('(')[0].split()[-1]
                prev = ''
                continue
            indent = line[:-len(line.lstrip())]
            prev = line

        info = parse_variable(line, funcname)
        if isinstance(info, list):
            # An injected parser may report several declarations for one
            # line, each with its own funcname.
            for name, _funcname, decl in info:
                yield ID(filename, _funcname, name), decl
            continue
        name, decl = info

        if name is None:
            continue
        yield ID(filename, funcname, name), decl


def _match_varid(variable, name, funcname, ignored=None):
    """Return True if the variable matches the search criteria."""
    if ignored and variable in ignored:
        return False

    if variable.name != name:
        return False

    if funcname == UNKNOWN:
        # Any local variable matches.
        if not variable.funcname:
            return False
    elif variable.funcname != funcname:
        return False

    return True
def find_variable(filename, funcname, name, *,
                  ignored=None,
                  srccache=None,  # {filename: lines}
                  parse_variable=None,
                  _iter_variables=iter_variables,
                  ):
    """Return the matching variable.

    Return None if the variable is not found.
    """
    for varid, decl in _iter_variables(filename,
                                       srccache=srccache,
                                       parse_variable=parse_variable,
                                       ):
        if _match_varid(varid, name, funcname, ignored):
            return varid, decl
    else:
        return None


def find_variables(varids, filenames=None, *,
                   srccache=_NOT_SET,
                   parse_variable=None,
                   _find_symbol=find_variable,
                   ):
    """Yield (varid, decl) for each ID.

    If the variable is not found then its decl will be UNKNOWN.  That
    way there will be one resulting variable per given ID.
    """
    if srccache is _NOT_SET:
        srccache = {}

    used = set()
    for varid in varids:
        if varid.filename and varid.filename != UNKNOWN:
            srcfiles = [varid.filename]
        else:
            if not filenames:
                yield varid, UNKNOWN
                continue
            srcfiles = filenames
        for filename in srcfiles:
            # BUG FIX: the original called _find_varid(), a name that is
            # not defined anywhere (the injected hook is "_find_symbol"),
            # so this loop always raised NameError.  It also unpacked the
            # result even though find_variable() returns a bare None on a
            # miss; guard against that as well.
            found = _find_symbol(filename, varid.funcname, varid.name,
                                 ignored=used,
                                 srccache=srccache,
                                 parse_variable=parse_variable,
                                 )
            if found:
                varid, decl = found
                yield varid, decl
                used.add(varid)
                break
        else:
            yield varid, UNKNOWN


# ---- Tools/c-analyzer/c_analyzer/parser/preprocessor.py (new file in this patch) ----

from collections import namedtuple
import shlex
import os
import re

from ..common import util, info


CONTINUATION = '\\' + os.linesep

IDENTIFIER = r'(?:\w*[a-zA-Z]\w*)'
IDENTIFIER_RE = re.compile('^' + IDENTIFIER + '$')


def _coerce_str(value):
    """Return value as a stripped string ('' for any falsy value)."""
    if not value:
        return ''
    return str(value).strip()


#############################
# directives

DIRECTIVE_START = r'''
    (?:
      ^ \s*
      [#] \s*
    )'''
DIRECTIVE_TEXT = r'''
    (?:
      (?: \s+ ( .*\S ) )?
      \s* $
    )'''
+ \s* $ + )''' +DIRECTIVE = rf''' + (?: + {DIRECTIVE_START} + ( + include | + error | warning | + pragma | + define | undef | + if | ifdef | ifndef | elseif | else | endif | + __FILE__ | __LINE__ | __DATE __ | __TIME__ | __TIMESTAMP__ + ) + {DIRECTIVE_TEXT} + )''' +# (?: +# [^\\\n] | +# \\ [^\n] | +# \\ \n +# )+ +# ) \n +# )''' +DIRECTIVE_RE = re.compile(DIRECTIVE, re.VERBOSE) + +DEFINE = rf''' + (?: + {DIRECTIVE_START} define \s+ + (?: + ( \w*[a-zA-Z]\w* ) + (?: \s* [(] ([^)]*) [)] )? + ) + {DIRECTIVE_TEXT} + )''' +DEFINE_RE = re.compile(DEFINE, re.VERBOSE) + + +def parse_directive(line): + """Return the appropriate directive for the given line.""" + line = line.strip() + if line.startswith('#'): + line = line[1:].lstrip() + line = '#' + line + directive = line + #directive = '#' + line + while ' ' in directive: + directive = directive.replace(' ', ' ') + return _parse_directive(directive) + + +def _parse_directive(line): + m = DEFINE_RE.match(line) + if m: + name, args, text = m.groups() + if args: + args = [a.strip() for a in args.split(',')] + return Macro(name, args, text) + else: + return Constant(name, text) + + m = DIRECTIVE_RE.match(line) + if not m: + raise ValueError(f'unsupported directive {line!r}') + kind, text = m.groups() + if not text: + if kind not in ('else', 'endif'): + raise ValueError(f'missing text in directive {line!r}') + elif kind in ('else', 'endif', 'define'): + raise ValueError(f'unexpected text in directive {line!r}') + if kind == 'include': + directive = Include(text) + elif kind in IfDirective.KINDS: + directive = IfDirective(kind, text) + else: + directive = OtherDirective(kind, text) + directive.validate() + return directive + + +class PreprocessorDirective(util._NTBase): + """The base class for directives.""" + + __slots__ = () + + KINDS = frozenset([ + 'include', + 'pragma', + 'error', 'warning', + 'define', 'undef', + 'if', 'ifdef', 'ifndef', 'elseif', 'else', 'endif', + '__FILE__', '__DATE__', '__LINE__', '__TIME__', 
'__TIMESTAMP__', + ]) + + @property + def text(self): + return ' '.join(v for v in self[1:] if v and v.strip()) or None + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.kind: + raise TypeError('missing kind') + elif self.kind not in self.KINDS: + raise ValueError + + # text can be anything, including None. + + +class Constant(PreprocessorDirective, + namedtuple('Constant', 'kind name value')): + """A single "constant" directive ("define").""" + + __slots__ = () + + def __new__(cls, name, value=None): + self = super().__new__( + cls, + 'define', + name=_coerce_str(name) or None, + value=_coerce_str(value) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.name: + raise TypeError('missing name') + elif not IDENTIFIER_RE.match(self.name): + raise ValueError(f'name must be identifier, got {self.name!r}') + + # value can be anything, including None + + +class Macro(PreprocessorDirective, + namedtuple('Macro', 'kind name args body')): + """A single "macro" directive ("define").""" + + __slots__ = () + + def __new__(cls, name, args, body=None): + # "args" must be a string or an iterable of strings (or "empty"). + if isinstance(args, str): + args = [v.strip() for v in args.split(',')] + if args: + args = tuple(_coerce_str(a) or None for a in args) + self = super().__new__( + cls, + kind='define', + name=_coerce_str(name) or None, + args=args if args else (), + body=_coerce_str(body) or None, + ) + return self + + @property + def text(self): + if self.body: + return f'{self.name}({", ".join(self.args)}) {self.body}' + else: + return f'{self.name}({", ".join(self.args)})' + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if not self.name: + raise TypeError('missing name') + elif not IDENTIFIER_RE.match(self.name): + raise ValueError(f'name must be identifier, got {self.name!r}') + + for arg in self.args: + if not arg: + raise ValueError(f'missing arg in {self.args}') + elif not IDENTIFIER_RE.match(arg): + raise ValueError(f'arg must be identifier, got {arg!r}') + + # body can be anything, including None + + +class IfDirective(PreprocessorDirective, + namedtuple('IfDirective', 'kind condition')): + """A single conditional directive (e.g. "if", "ifdef"). + + This only includes directives that actually provide conditions. The + related directives "else" and "endif" are covered by OtherDirective + instead. + """ + + __slots__ = () + + KINDS = frozenset([ + 'if', + 'ifdef', + 'ifndef', + 'elseif', + ]) + + @classmethod + def _condition_from_raw(cls, raw, kind): + #return Condition.from_raw(raw, _kind=kind) + condition = _coerce_str(raw) + if not condition: + return None + + if kind == 'ifdef': + condition = f'defined({condition})' + elif kind == 'ifndef': + condition = f'! defined({condition})' + + return condition + + def __new__(cls, kind, condition): + kind = _coerce_str(kind) + self = super().__new__( + cls, + kind=kind or None, + condition=cls._condition_from_raw(condition, kind), + ) + return self + + @property + def text(self): + if self.kind == 'ifdef': + return self.condition[8:-1] # strip "defined(" + elif self.kind == 'ifndef': + return self.condition[10:-1] # strip "! defined(" + else: + return self.condition + #return str(self.condition) + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if not self.condition: + raise TypeError('missing condition') + #else: + # for cond in self.condition: + # if not cond: + # raise ValueError(f'missing condition in {self.condition}') + # cond.validate() + # if self.kind in ('ifdef', 'ifndef'): + # if len(self.condition) != 1: + # raise ValueError('too many condition') + # if self.kind == 'ifdef': + # if not self.condition[0].startswith('defined '): + # raise ValueError('bad condition') + # else: + # if not self.condition[0].startswith('! defined '): + # raise ValueError('bad condition') + + +class Include(PreprocessorDirective, + namedtuple('Include', 'kind file')): + """A single "include" directive. + + Supported "file" values are either follow the bracket style + (<stdio>) or double quotes ("spam.h"). + """ + + __slots__ = () + + def __new__(cls, file): + self = super().__new__( + cls, + kind='include', + file=_coerce_str(file) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.file: + raise TypeError('missing file') + + +class OtherDirective(PreprocessorDirective, + namedtuple('OtherDirective', 'kind text')): + """A single directive not covered by another class. + + This includes the "else", "endif", and "undef" directives, which are + otherwise inherently related to the directives covered by the + Constant, Macro, and IfCondition classes. + + Note that all directives must have a text value, except for "else" + and "endif" (which must have no text). + """ + + __slots__ = () + + KINDS = PreprocessorDirective.KINDS - {'include', 'define'} - IfDirective.KINDS + + def __new__(cls, kind, text): + self = super().__new__( + cls, + kind=_coerce_str(kind) or None, + text=_coerce_str(text) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if self.text: + if self.kind in ('else', 'endif'): + raise ValueError('unexpected text in directive') + elif self.kind not in ('else', 'endif'): + raise TypeError('missing text') + + +############################# +# iterating lines + +def _recompute_conditions(directive, ifstack): + if directive.kind in ('if', 'ifdef', 'ifndef'): + ifstack.append( + ([], directive.condition)) + elif directive.kind == 'elseif': + if ifstack: + negated, active = ifstack.pop() + if active: + negated.append(active) + else: + negated = [] + ifstack.append( + (negated, directive.condition)) + elif directive.kind == 'else': + if ifstack: + negated, active = ifstack.pop() + if active: + negated.append(active) + ifstack.append( + (negated, None)) + elif directive.kind == 'endif': + if ifstack: + ifstack.pop() + + conditions = [] + for negated, active in ifstack: + for condition in negated: + conditions.append(f'! ({condition})') + if active: + conditions.append(active) + return tuple(conditions) + + +def _iter_clean_lines(lines): + lines = iter(enumerate(lines, 1)) + for lno, line in lines: + # Handle line continuations. + while line.endswith(CONTINUATION): + try: + lno, _line = next(lines) + except StopIteration: + break + line = line[:-len(CONTINUATION)] + ' ' + _line + + # Deal with comments. + after = line + line = '' + while True: + # Look for a comment. + before, begin, remainder = after.partition('/*') + if '//' in before: + before, _, _ = before.partition('//') + line += before + ' ' # per the C99 spec + break + line += before + if not begin: + break + line += ' ' # per the C99 spec + + # Go until we find the end of the comment. 
+ _, end, after = remainder.partition('*/') + while not end: + try: + lno, remainder = next(lines) + except StopIteration: + raise Exception('unterminated comment') + _, end, after = remainder.partition('*/') + + yield lno, line + + +def iter_lines(lines, *, + _iter_clean_lines=_iter_clean_lines, + _parse_directive=_parse_directive, + _recompute_conditions=_recompute_conditions, + ): + """Yield (lno, line, directive, active conditions) for each given line. + + This is effectively a subset of the operations taking place in + translation phases 2-4 from the C99 spec (ISO/IEC 9899:TC2); see + section 5.1.1.2. Line continuations are removed and comments + replaced with a single space. (In both cases "lno" will be the last + line involved.) Otherwise each line is returned as-is. + + "lno" is the (1-indexed) line number for the line. + + "directive" will be a PreprocessorDirective or None, depending on + whether or not there is a directive on the line. + + "active conditions" is the set of preprocessor conditions (e.g. + "defined()") under which the current line of code will be included + in compilation. That set is derived from every conditional + directive block (e.g. "if defined()", "ifdef", "else") containing + that line. That includes nested directives. Note that the + current line does not affect the active conditions for iteself. + It only impacts subsequent lines. That applies to directives + that close blocks (e.g. "endif") just as much as conditional + directvies. Also note that "else" and "elseif" directives + update the active conditions (for later lines), rather than + adding to them. 
+ """ + ifstack = [] + conditions = () + for lno, line in _iter_clean_lines(lines): + stripped = line.strip() + if not stripped.startswith('#'): + yield lno, line, None, conditions + continue + + directive = '#' + stripped[1:].lstrip() + while ' ' in directive: + directive = directive.replace(' ', ' ') + directive = _parse_directive(directive) + yield lno, line, directive, conditions + + if directive.kind in ('else', 'endif'): + conditions = _recompute_conditions(directive, ifstack) + elif isinstance(directive, IfDirective): + conditions = _recompute_conditions(directive, ifstack) + + +############################# +# running (platform-specific?) + +def _gcc(filename, *, + _get_argv=(lambda: _get_gcc_argv()), + _run=util.run_cmd, + ): + argv = _get_argv() + argv.extend([ + '-E', filename, + ]) + output = _run(argv) + return output + + +def _get_gcc_argv(*, + _open=open, + _run=util.run_cmd, + ): + with _open('/tmp/print.mk', 'w') as tmpfile: + tmpfile.write('print-%:\n') + #tmpfile.write('\t@echo $* = $($*)\n') + tmpfile.write('\t@echo $($*)\n') + argv = ['/usr/bin/make', + '-f', 'Makefile', + '-f', '/tmp/print.mk', + 'print-CC', + 'print-PY_CORE_CFLAGS', + ] + output = _run(argv) + gcc, cflags = output.strip().splitlines() + argv = shlex.split(gcc.strip()) + cflags = shlex.split(cflags.strip()) + return argv + cflags + + +def run(filename, *, + _gcc=_gcc, + ): + """Return the text of the given file after running the preprocessor.""" + return _gcc(filename) diff --git a/Tools/c-analyzer/c_analyzer/parser/source.py b/Tools/c-analyzer/c_analyzer/parser/source.py new file mode 100644 index 0000000..f8998c8 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/source.py @@ -0,0 +1,34 @@ +from . import preprocessor + + +def iter_clean_lines(lines): + incomment = False + for line in lines: + # Deal with comments. 
+ if incomment: + _, sep, line = line.partition('*/') + if sep: + incomment = False + continue + line, _, _ = line.partition('//') + line, sep, remainder = line.partition('/*') + if sep: + _, sep, after = remainder.partition('*/') + if not sep: + incomment = True + continue + line += ' ' + after + + # Ignore blank lines and leading/trailing whitespace. + line = line.strip() + if not line: + continue + + yield line + + +def iter_lines(filename, *, + preprocess=preprocessor.run, + ): + content = preprocess(filename) + return iter(content.splitlines()) diff --git a/Tools/c-analyzer/c_analyzer/symbols/__init__.py b/Tools/c-analyzer/c_analyzer/symbols/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/symbols/_nm.py b/Tools/c-analyzer/c_analyzer/symbols/_nm.py new file mode 100644 index 0000000..f3a75a6 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/_nm.py @@ -0,0 +1,117 @@ +import os.path +import shutil + +from c_analyzer.common import util, info + +from .info import Symbol + + +# XXX need tests: +# * iter_symbols + +NM_KINDS = { + 'b': Symbol.KIND.VARIABLE, # uninitialized + 'd': Symbol.KIND.VARIABLE, # initialized + #'g': Symbol.KIND.VARIABLE, # uninitialized + #'s': Symbol.KIND.VARIABLE, # initialized + 't': Symbol.KIND.FUNCTION, + } + +SPECIAL_SYMBOLS = { + # binary format (e.g. 
ELF) + '__bss_start', + '__data_start', + '__dso_handle', + '_DYNAMIC', + '_edata', + '_end', + '__environ@@GLIBC_2.2.5', + '_GLOBAL_OFFSET_TABLE_', + '__JCR_END__', + '__JCR_LIST__', + '__TMC_END__', + } + + +def _is_special_symbol(name): + if name in SPECIAL_SYMBOLS: + return True + if '@@GLIBC' in name: + return True + return False + + +def iter_symbols(binfile, *, + nm=None, + handle_id=None, + _which=shutil.which, + _run=util.run_cmd, + ): + """Yield a Symbol for each relevant entry reported by the "nm" command.""" + if nm is None: + nm = _which('nm') + if not nm: + raise NotImplementedError + if handle_id is None: + handle_id = info.ID + + argv = [nm, + '--line-numbers', + binfile, + ] + try: + output = _run(argv) + except Exception: + if nm is None: + # XXX Use dumpbin.exe /SYMBOLS on Windows. + raise NotImplementedError + raise + for line in output.splitlines(): + (name, kind, external, filename, funcname, + ) = _parse_nm_line(line) + if kind != Symbol.KIND.VARIABLE: + continue + elif _is_special_symbol(name): + continue + yield Symbol( + id=handle_id(filename, funcname, name), + kind=kind, + external=external, + ) + + +def _parse_nm_line(line): + _origline = line + _, _, line = line.partition(' ') # strip off the address + line = line.strip() + + kind, _, line = line.partition(' ') + line = line.strip() + external = kind.isupper() + kind = NM_KINDS.get(kind.lower(), Symbol.KIND.OTHER) + + name, _, filename = line.partition('\t') + name = name.strip() + if filename: + filename = os.path.relpath(filename.partition(':')[0]) + else: + filename = info.UNKNOWN + + name, islocal = _parse_nm_name(name, kind) + funcname = info.UNKNOWN if islocal else None + return name, kind, external, filename, funcname + + +def _parse_nm_name(name, kind): + if kind != Symbol.KIND.VARIABLE: + return name, None + if _is_special_symbol(name): + return name, None + + actual, sep, digits = name.partition('.') + if not sep: + return name, False + + if not digits.isdigit(): + raise 
Exception(f'got bogus name {name}') + return actual, True diff --git a/Tools/c-analyzer/c_analyzer/symbols/find.py b/Tools/c-analyzer/c_analyzer/symbols/find.py new file mode 100644 index 0000000..8564652 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/find.py @@ -0,0 +1,175 @@ +import os +import os.path +import shutil + +from ..common import files +from ..common.info import UNKNOWN, ID +from ..parser import find as p_find + +from . import _nm +from .info import Symbol + +# XXX need tests: +# * get_resolver() +# * get_resolver_from_dirs() +# * symbol() +# * symbols() +# * variables() + + +def _resolve_known(symbol, knownvars): + for varid in knownvars: + if symbol.match(varid): + break + else: + return None + return knownvars.pop(varid) + + +def get_resolver(filenames=None, known=None, *, + handle_var, + check_filename=None, + perfilecache=None, + preprocessed=False, + _from_source=p_find.variable_from_id, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "handle_var" is a callable that takes (ID, decl) and returns a + Variable. Variable.from_id is a suitable callable. + + The returned func takes a single Symbol and returns a corresponding + Variable. If the symbol was located then the variable will be + valid, populated with the corresponding information. Otherwise None + is returned. + """ + knownvars = (known or {}).get('variables') + if knownvars: + knownvars = dict(knownvars) # a copy + if filenames: + if check_filename is None: + filenames = list(filenames) + def check_filename(filename): + return filename in filenames + def resolve(symbol): + # XXX Check "found" instead? 
+ if not check_filename(symbol.filename): + return None + found = _resolve_known(symbol, knownvars) + if found is None: + #return None + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + found = handle_var(varid, decl) + return found + else: + def resolve(symbol): + return _resolve_known(symbol, knownvars) + elif filenames: + def resolve(symbol): + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + return handle_var(varid, decl) + else: + def resolve(symbol): + return None + return resolve + + +def get_resolver_from_dirs(dirnames, known=None, *, + handle_var, + suffixes=('.c',), + perfilecache=None, + preprocessed=False, + _iter_files=files.iter_files_by_suffix, + _get_resolver=get_resolver, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "dirnames" should be absolute paths. If not then they will be + resolved relative to CWD. + + See get_resolver(). + """ + dirnames = [d if d.endswith(os.path.sep) else d + os.path.sep + for d in dirnames] + filenames = _iter_files(dirnames, suffixes) + def check_filename(filename): + for dirname in dirnames: + if filename.startswith(dirname): + return True + else: + return False + return _get_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + + +def symbol(symbol, filenames, known=None, *, + perfilecache=None, + preprocessed=False, + handle_id=None, + _get_resolver=get_resolver, + ): + """Return a Variable for the one matching the given symbol. + + "symbol" can be one of several objects: + + * Symbol - use the contained info + * name (str) - look for a global variable with that name + * (filename, name) - look for named global in file + * (filename, funcname, name) - look for named local in file + + A name is always required. 
def symbol(symbol, filenames, known=None, *,
           perfilecache=None,
           preprocessed=False,
           handle_id=None,
           _get_resolver=get_resolver,
           ):
    """Return a Variable for the one matching the given symbol.

    "symbol" can be one of several objects:

    * Symbol - use the contained info
    * name (str) - look for a global variable with that name
    * (filename, name) - look for named global in file
    * (filename, funcname, name) - look for named local in file

    A name is always required.  If the filename is None, "", or
    "UNKNOWN" then all files will be searched.  If the funcname is
    "" or "UNKNOWN" then only local variables will be searched for.
    """
    # NOTE(review): get_resolver() is declared as
    # get_resolver(filenames=None, known=None, *, handle_var, ...); this
    # call passes (known, filenames) in the opposite order and a
    # "handle_id" keyword that get_resolver() does not accept, so it
    # raises TypeError when reached.  Preserved as-is -- fixing it needs
    # an upstream decision about where "handle_var" should come from.
    resolve = _get_resolver(known, filenames,
                            handle_id=handle_id,
                            perfilecache=perfilecache,
                            preprocessed=preprocessed,
                            )
    return resolve(symbol)


def _get_platform_tool():
    """Return a callable that yields symbols from a binary, per-platform."""
    if os.name == 'nt':
        # XXX Support this (e.g. dumpbin.exe /SYMBOLS).
        raise NotImplementedError
    elif nm := shutil.which('nm'):
        return lambda binfile, hid: _nm.iter_symbols(binfile, nm=nm, handle_id=hid)
    else:
        raise NotImplementedError


def symbols(binfile, *,
            handle_id=None,
            _file_exists=os.path.exists,
            _get_platform_tool=_get_platform_tool,
            ):
    """Yield a Symbol for each one found in the binary."""
    if not _file_exists(binfile):
        raise Exception('executable missing (need to build it first?)')

    iter_syms = _get_platform_tool()
    yield from iter_syms(binfile, handle_id)


def variables(binfile, *,
              resolve,
              handle_id=None,
              _iter_symbols=symbols,
              ):
    """Yield (Variable, Symbol) for each found symbol."""
    for sym in _iter_symbols(binfile, handle_id=handle_id):
        if sym.kind != Symbol.KIND.VARIABLE:
            continue
        yield (resolve(sym) or None), sym
return cls(id, kind, external) + + def __new__(cls, id, kind=KIND.VARIABLE, external=None): + self = super().__new__( + cls, + id=ID.from_raw(id), + kind=str(kind) if kind else None, + external=bool(external) if external is not None else None, + ) + return self + + def __hash__(self): + return hash(self.id) + + def __getattr__(self, name): + return getattr(self.id, name) + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + if not self.id: + raise TypeError('missing id') + else: + self.id.validate() + + if not self.kind: + raise TypeError('missing kind') + elif self.kind not in vars(self.KIND).values(): + raise ValueError(f'unsupported kind {self.kind}') + + if self.external is None: + raise TypeError('missing external') diff --git a/Tools/c-analyzer/c_analyzer/variables/__init__.py b/Tools/c-analyzer/c_analyzer/variables/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/variables/find.py b/Tools/c-analyzer/c_analyzer/variables/find.py new file mode 100644 index 0000000..3fe7284 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/find.py @@ -0,0 +1,75 @@ +from ..common import files +from ..common.info import UNKNOWN +from ..parser import ( + find as p_find, + ) +from ..symbols import ( + info as s_info, + find as s_find, + ) +from .info import Variable + +# XXX need tests: +# * vars_from_source + + +def _remove_cached(cache, var): + if not cache: + return + try: + cached = cache[var.filename] + cached.remove(var) + except (KeyError, IndexError): + pass + + +def vars_from_binary(binfile, *, + known=None, + filenames=None, + handle_id=None, + check_filename=None, + handle_var=Variable.from_id, + _iter_vars=s_find.variables, + _get_symbol_resolver=s_find.get_resolver, + ): + """Yield a Variable for each found Symbol. + + Details are filled in from the given "known" variables and types. 
+ """ + cache = {} + resolve = _get_symbol_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=cache, + ) + for var, symbol in _iter_vars(binfile, + resolve=resolve, + handle_id=handle_id, + ): + if var is None: + var = Variable(symbol.id, UNKNOWN, UNKNOWN) + yield var + _remove_cached(cache, var) + + +def vars_from_source(filenames, *, + preprocessed=None, + known=None, + handle_id=None, + handle_var=Variable.from_id, + iter_vars=p_find.variables, + ): + """Yield a Variable for each declaration in the raw source code. + + Details are filled in from the given "known" variables and types. + """ + cache = {} + for varid, decl in iter_vars(filenames or (), + perfilecache=cache, + preprocessed=preprocessed, + known=known, + handle_id=handle_id, + ): + var = handle_var(varid, decl) + yield var + _remove_cached(cache, var) diff --git a/Tools/c-analyzer/c_analyzer/variables/info.py b/Tools/c-analyzer/c_analyzer/variables/info.py new file mode 100644 index 0000000..336a523 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/info.py @@ -0,0 +1,93 @@ +from collections import namedtuple + +from ..common.info import ID, UNKNOWN +from ..common.util import classonly, _NTBase + + +def normalize_vartype(vartype): + """Return the canonical form for a variable type (or func signature).""" + # We allow empty strring through for semantic reasons. + if vartype is None: + return None + + # XXX finish! + # XXX Return (modifiers, type, pointer)? + return str(vartype) + + +# XXX Variable.vartype -> decl (Declaration). 
class Variable(_NTBase,
               namedtuple('Variable', 'id storage vartype')):
    """Information about a single variable declaration."""

    __slots__ = ()

    # The supported storage classes (enforced by validate()).
    STORAGE = (
            'static',
            'extern',
            'implicit',
            'local',
            )

    @classonly
    def from_parts(cls, filename, funcname, name, decl, storage=None):
        """Return a Variable built from the parts of its ID plus the decl.

        If "storage" is not given it is inferred from the declaration.
        """
        varid = ID(filename, funcname, name)
        if storage is None:
            self = cls.from_id(varid, decl)
        else:
            self = cls(varid, storage, decl)
        return self

    @classonly
    def from_id(cls, varid, decl):
        """Return a Variable with the storage extracted from the decl."""
        # Imported lazily to avoid a circular import with the parser.
        from ..parser.declarations import extract_storage
        storage = extract_storage(decl, infunc=varid.funcname)
        return cls(varid, storage, decl)

    def __new__(cls, id, storage, vartype):
        # Coerce each field to its canonical form (None for falsy values).
        self = super().__new__(
                cls,
                id=ID.from_raw(id),
                storage=str(storage) if storage else None,
                vartype=normalize_vartype(vartype) if vartype else None,
                )
        return self

    def __hash__(self):
        return hash(self.id)

    def __getattr__(self, name):
        # Expose the ID's fields (filename, funcname, name) directly.
        return getattr(self.id, name)

    def _validate_id(self):
        # Fail if the ID is missing or only partially known.
        if not self.id:
            raise TypeError('missing id')

        if not self.filename or self.filename == UNKNOWN:
            raise TypeError(f'id missing filename ({self.id})')

        if self.funcname and self.funcname == UNKNOWN:
            raise TypeError(f'id missing funcname ({self.id})')

        self.id.validate()

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        self._validate_id()

        if self.storage is None or self.storage == UNKNOWN:
            raise TypeError('missing storage')
        elif self.storage not in self.STORAGE:
            # BUG FIX: the original used "{self.storage:r}", which is an
            # invalid format spec and raised "ValueError: Unknown format
            # code 'r'" instead of this message; "!r" is the repr conversion.
            raise ValueError(f'unsupported storage {self.storage!r}')

        if self.vartype is None or self.vartype == UNKNOWN:
            raise TypeError('missing vartype')

    @property
    def isglobal(self):
        # Anything that is not function-local counts as global.
        return self.storage != 'local'

    @property
    def isconst(self):
        # A whitespace-delimited "const" anywhere in the type counts.
        return 'const' in self.vartype.split()


# ===== Tools/c-analyzer/c_analyzer/variables/known.py (new) =====

import csv

from ..common.info import ID, UNKNOWN
from ..common.util import read_tsv
from .info import Variable


# XXX need tests:
# * read_file()
# * look_up_variable()


# The column layout of the known-variables TSV data file.
COLUMNS = ('filename', 'funcname', 'name', 'kind', 'declaration')
HEADER = '\t'.join(COLUMNS)
+ """ + for row in _read_tsv(infile, HEADER): + filename, funcname, name, kind, declaration = row + if not funcname or funcname == '-': + funcname = None + id = ID(filename, funcname, name) + yield kind, id, declaration + + +def from_file(infile, *, + handle_var=Variable.from_id, + _read_file=read_file, + ): + """Return the info for known declarations in the given file.""" + known = { + 'variables': {}, + #'types': {}, + #'constants': {}, + #'macros': {}, + } + for kind, id, decl in _read_file(infile): + if kind == 'variable': + values = known['variables'] + value = handle_var(id, decl) + else: + raise ValueError(f'unsupported kind in row {row}') + value.validate() + values[id] = value + return known + + +def look_up_variable(varid, knownvars, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return the known Variable matching the given ID. + + "knownvars" is a mapping of ID to Variable. + + "match_files" is used to verify if two filenames point to + the same file. + + If no match is found then None is returned. + """ + if not knownvars: + return None + + if varid.funcname == UNKNOWN: + if not varid.filename or varid.filename == UNKNOWN: + for varid in knownvars: + if not varid.funcname: + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + else: + for varid in knownvars: + if not varid.funcname: + continue + if not match_files(varid.filename, varid.filename): + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + elif not varid.filename or varid.filename == UNKNOWN: + raise NotImplementedError + else: + return knownvars.get(varid.id) |