diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2019-10-19 02:00:04 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-19 02:00:04 (GMT) |
commit | e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1 (patch) | |
tree | 071224bbded262901b9742eb82c5d82d2f744fe1 /Tools/c-analyzer/c_analyzer | |
parent | ea55c51bd937f6019c35b39b87029644e469c059 (diff) | |
download | cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.zip cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.gz cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.bz2 |
bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It is also preparation for getting the variables from the source (cross-platform) rather than from the symbols.
The change only touches the tool (and its tests).
Diffstat (limited to 'Tools/c-analyzer/c_analyzer')
20 files changed, 2284 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_analyzer/__init__.py b/Tools/c-analyzer/c_analyzer/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/common/__init__.py b/Tools/c-analyzer/c_analyzer/common/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/common/files.py b/Tools/c-analyzer/c_analyzer/common/files.py new file mode 100644 index 0000000..ab551a8 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/files.py @@ -0,0 +1,120 @@ +import glob +import os +import os.path + +# XXX need tests: +# * walk_tree() +# * glob_tree() +# * iter_files_by_suffix() + + +C_SOURCE_SUFFIXES = ('.c', '.h') + + +def _walk_tree(root, *, + _walk=os.walk, + ): + # A wrapper around os.walk that resolves the filenames. + for parent, _, names in _walk(root): + for name in names: + yield os.path.join(parent, name) + + +def walk_tree(root, *, + suffix=None, + walk=_walk_tree, + ): + """Yield each file in the tree under the given directory name. + + If "suffix" is provided then only files with that suffix will + be included. + """ + if suffix and not isinstance(suffix, str): + raise ValueError('suffix must be a string') + + for filename in walk(root): + if suffix and not filename.endswith(suffix): + continue + yield filename + + +def glob_tree(root, *, + suffix=None, + _glob=glob.iglob, + ): + """Yield each file in the tree under the given directory name. + + If "suffix" is provided then only files with that suffix will + be included. 
+ """ + suffix = suffix or '' + if not isinstance(suffix, str): + raise ValueError('suffix must be a string') + + for filename in _glob(f'{root}/*{suffix}'): + yield filename + for filename in _glob(f'{root}/**/*{suffix}'): + yield filename + + +def iter_files(root, suffix=None, relparent=None, *, + get_files=os.walk, + _glob=glob_tree, + _walk=walk_tree, + ): + """Yield each file in the tree under the given directory name. + + If "root" is a non-string iterable then do the same for each of + those trees. + + If "suffix" is provided then only files with that suffix will + be included. + + if "relparent" is provided then it is used to resolve each + filename as a relative path. + """ + if not isinstance(root, str): + roots = root + for root in roots: + yield from iter_files(root, suffix, relparent, + get_files=get_files, + _glob=_glob, _walk=_walk) + return + + # Use the right "walk" function. + if get_files in (glob.glob, glob.iglob, glob_tree): + get_files = _glob + else: + _files = _walk_tree if get_files in (os.walk, walk_tree) else get_files + get_files = (lambda *a, **k: _walk(*a, walk=_files, **k)) + + # Handle a single suffix. + if suffix and not isinstance(suffix, str): + filenames = get_files(root) + suffix = tuple(suffix) + else: + filenames = get_files(root, suffix=suffix) + suffix = None + + for filename in filenames: + if suffix and not isinstance(suffix, str): # multiple suffixes + if not filename.endswith(suffix): + continue + if relparent: + filename = os.path.relpath(filename, relparent) + yield filename + + +def iter_files_by_suffix(root, suffixes, relparent=None, *, + walk=walk_tree, + _iter_files=iter_files, + ): + """Yield each file in the tree that has the given suffixes. + + Unlike iter_files(), the results are in the original suffix order. + """ + if isinstance(suffixes, str): + suffixes = [suffixes] + # XXX Ignore repeated suffixes? 
+ for suffix in suffixes: + yield from _iter_files(root, suffix, relparent) diff --git a/Tools/c-analyzer/c_analyzer/common/info.py b/Tools/c-analyzer/c_analyzer/common/info.py new file mode 100644 index 0000000..3f3f8c5 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/info.py @@ -0,0 +1,138 @@ +from collections import namedtuple +import re + +from .util import classonly, _NTBase + +# XXX need tests: +# * ID.match() + + +UNKNOWN = '???' + +NAME_RE = re.compile(r'^([a-zA-Z]|_\w*[a-zA-Z]\w*|[a-zA-Z]\w*)$') + + +class ID(_NTBase, namedtuple('ID', 'filename funcname name')): + """A unique ID for a single symbol or declaration.""" + + __slots__ = () + # XXX Add optional conditions (tuple of strings) field. + #conditions = Slot() + + @classonly + def from_raw(cls, raw): + if not raw: + return None + if isinstance(raw, str): + return cls(None, None, raw) + try: + name, = raw + filename = None + except ValueError: + try: + filename, name = raw + except ValueError: + return super().from_raw(raw) + return cls(filename, None, name) + + def __new__(cls, filename, funcname, name): + self = super().__new__( + cls, + filename=str(filename) if filename else None, + funcname=str(funcname) if funcname else None, + name=str(name) if name else None, + ) + #cls.conditions.set(self, tuple(str(s) if s else None + # for s in conditions or ())) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + if not self.name: + raise TypeError('missing name') + else: + if not NAME_RE.match(self.name): + raise ValueError( + f'name must be an identifier, got {self.name!r}') + + # Symbols from a binary might not have filename/funcname info. + + if self.funcname: + if not self.filename: + raise TypeError('missing filename') + if not NAME_RE.match(self.funcname) and self.funcname != UNKNOWN: + raise ValueError( + f'name must be an identifier, got {self.funcname!r}') + + # XXX Require the filename (at least UNKONWN)? + # XXX Check the filename? 
+ + @property + def islocal(self): + return self.funcname is not None + + def match(self, other, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return True if the two match. + + At least one of the two must be completely valid (no UNKNOWN + anywhere). Otherwise False is returned. The remaining one + *may* have UNKNOWN for both funcname and filename. It must + have a valid name though. + + The caller is responsible for knowing which of the two is valid + (and which to use if both are valid). + """ + # First check the name. + if self.name is None: + return False + if other.name != self.name: + return False + + # Then check the filename. + if self.filename is None: + return False + if other.filename is None: + return False + if self.filename == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + elif self.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif other.filename == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + elif other.funcname != UNKNOWN: + # XXX Try matching funcname even though we don't + # know the filename? + raise NotImplementedError + else: + return True + elif not match_files(self.filename, other.filename): + return False + + # Finally, check the funcname. + if self.funcname == UNKNOWN: + # "other" must be the valid one. + if other.funcname == UNKNOWN: + return False + else: + return other.funcname is not None + elif other.funcname == UNKNOWN: + # "self" must be the valid one. + if self.funcname == UNKNOWN: + return False + else: + return self.funcname is not None + elif self.funcname == other.funcname: + # Both are valid. 
+ return True + + return False diff --git a/Tools/c-analyzer/c_analyzer/common/show.py b/Tools/c-analyzer/c_analyzer/common/show.py new file mode 100644 index 0000000..5f3cb1c --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/show.py @@ -0,0 +1,11 @@ + +def basic(variables, *, + _print=print): + """Print each row simply.""" + for var in variables: + if var.funcname: + line = f'{var.filename}:{var.funcname}():{var.name}' + else: + line = f'{var.filename}:{var.name}' + line = f'{line:<64} {var.vartype}' + _print(line) diff --git a/Tools/c-analyzer/c_analyzer/common/util.py b/Tools/c-analyzer/c_analyzer/common/util.py new file mode 100644 index 0000000..43d0bb6 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/common/util.py @@ -0,0 +1,243 @@ +import csv +import subprocess + + +_NOT_SET = object() + + +def run_cmd(argv, **kwargs): + proc = subprocess.run( + argv, + #capture_output=True, + #stderr=subprocess.STDOUT, + stdout=subprocess.PIPE, + text=True, + check=True, + **kwargs + ) + return proc.stdout + + +def read_tsv(infile, header, *, + _open=open, + _get_reader=csv.reader, + ): + """Yield each row of the given TSV (tab-separated) file.""" + if isinstance(infile, str): + with _open(infile, newline='') as infile: + yield from read_tsv(infile, header, + _open=_open, + _get_reader=_get_reader, + ) + return + lines = iter(infile) + + # Validate the header. 
+ try: + actualheader = next(lines).strip() + except StopIteration: + actualheader = '' + if actualheader != header: + raise ValueError(f'bad header {actualheader!r}') + + for row in _get_reader(lines, delimiter='\t'): + yield tuple(v.strip() for v in row) + + +def write_tsv(outfile, header, rows, *, + _open=open, + _get_writer=csv.writer, + ): + """Write each of the rows to the given TSV (tab-separated) file.""" + if isinstance(outfile, str): + with _open(outfile, 'w', newline='') as outfile: + return write_tsv(outfile, header, rows, + _open=_open, + _get_writer=_get_writer, + ) + + if isinstance(header, str): + header = header.split('\t') + writer = _get_writer(outfile, delimiter='\t') + writer.writerow(header) + for row in rows: + writer.writerow('' if v is None else str(v) + for v in row) + + +class Slot: + """A descriptor that provides a slot. + + This is useful for types that can't have slots via __slots__, + e.g. tuple subclasses. + """ + + __slots__ = ('initial', 'default', 'readonly', 'instances', 'name') + + def __init__(self, initial=_NOT_SET, *, + default=_NOT_SET, + readonly=False, + ): + self.initial = initial + self.default = default + self.readonly = readonly + + # The instance cache is not inherently tied to the normal + # lifetime of the instances. So must do something in order to + # avoid keeping the instances alive by holding a reference here. + # Ideally we would use weakref.WeakValueDictionary to do this. + # However, most builtin types do not support weakrefs. So + # instead we monkey-patch __del__ on the attached class to clear + # the instance. 
+ self.instances = {} + self.name = None + + def __set_name__(self, cls, name): + if self.name is not None: + raise TypeError('already used') + self.name = name + try: + slotnames = cls.__slot_names__ + except AttributeError: + slotnames = cls.__slot_names__ = [] + slotnames.append(name) + self._ensure___del__(cls, slotnames) + + def __get__(self, obj, cls): + if obj is None: # called on the class + return self + try: + value = self.instances[id(obj)] + except KeyError: + if self.initial is _NOT_SET: + value = self.default + else: + value = self.initial + self.instances[id(obj)] = value + if value is _NOT_SET: + raise AttributeError(self.name) + # XXX Optionally make a copy? + return value + + def __set__(self, obj, value): + if self.readonly: + raise AttributeError(f'{self.name} is readonly') + # XXX Optionally coerce? + self.instances[id(obj)] = value + + def __delete__(self, obj): + if self.readonly: + raise AttributeError(f'{self.name} is readonly') + self.instances[id(obj)] = self.default # XXX refleak? + + def _ensure___del__(self, cls, slotnames): # See the comment in __init__(). + try: + old___del__ = cls.__del__ + except AttributeError: + old___del__ = (lambda s: None) + else: + if getattr(old___del__, '_slotted', False): + return + + def __del__(_self): + for name in slotnames: + delattr(_self, name) + old___del__(_self) + __del__._slotted = True + cls.__del__ = __del__ + + def set(self, obj, value): + """Update the cached value for an object. + + This works even if the descriptor is read-only. This is + particularly useful when initializing the object (e.g. in + its __new__ or __init__). + """ + self.instances[id(obj)] = value + + +class classonly: + """A non-data descriptor that makes a value only visible on the class. + + This is like the "classmethod" builtin, but does not show up on + instances of the class. It may be used as a decorator. 
+ """ + + def __init__(self, value): + self.value = value + self.getter = classmethod(value).__get__ + self.name = None + + def __set_name__(self, cls, name): + if self.name is not None: + raise TypeError('already used') + self.name = name + + def __get__(self, obj, cls): + if obj is not None: + raise AttributeError(self.name) + # called on the class + return self.getter(None, cls) + + +class _NTBase: + + __slots__ = () + + @classonly + def from_raw(cls, raw): + if not raw: + return None + elif isinstance(raw, cls): + return raw + elif isinstance(raw, str): + return cls.from_string(raw) + else: + if hasattr(raw, 'items'): + return cls(**raw) + try: + args = tuple(raw) + except TypeError: + pass + else: + return cls(*args) + raise NotImplementedError + + @classonly + def from_string(cls, value): + """Return a new instance based on the given string.""" + raise NotImplementedError + + @classmethod + def _make(cls, iterable): # The default _make() is not subclass-friendly. + return cls.__new__(cls, *iterable) + + # XXX Always validate? + #def __init__(self, *args, **kwargs): + # self.validate() + + # XXX The default __repr__() is not subclass-friendly (where the name changes). + #def __repr__(self): + # _, _, sig = super().__repr__().partition('(') + # return f'{self.__class__.__name__}({sig}' + + # To make sorting work with None: + def __lt__(self, other): + try: + return super().__lt__(other) + except TypeError: + if None in self: + return True + elif None in other: + return False + else: + raise + + def validate(self): + return + + # XXX Always validate? 
+ #def _replace(self, **kwargs): + # obj = super()._replace(**kwargs) + # obj.validate() + # return obj diff --git a/Tools/c-analyzer/c_analyzer/parser/__init__.py b/Tools/c-analyzer/c_analyzer/parser/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/parser/declarations.py b/Tools/c-analyzer/c_analyzer/parser/declarations.py new file mode 100644 index 0000000..f37072c --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/declarations.py @@ -0,0 +1,339 @@ +import re +import shlex +import subprocess + +from ..common.info import UNKNOWN + +from . import source + + +IDENTIFIER = r'(?:[a-zA-z]|_+[a-zA-Z0-9]\w*)' + +TYPE_QUAL = r'(?:const|volatile)' + +VAR_TYPE_SPEC = r'''(?: + void | + (?: + (?:(?:un)?signed\s+)? + (?: + char | + short | + int | + long | + long\s+int | + long\s+long + ) | + ) | + float | + double | + {IDENTIFIER} | + (?:struct|union)\s+{IDENTIFIER} + )''' + +POINTER = rf'''(?: + (?:\s+const)?\s*[*] + )''' + +#STRUCT = r'''(?: +# (?:struct|(struct\s+%s))\s*[{] +# [^}]* +# [}] +# )''' % (IDENTIFIER) +#UNION = r'''(?: +# (?:union|(union\s+%s))\s*[{] +# [^}]* +# [}] +# )''' % (IDENTIFIER) +#DECL_SPEC = rf'''(?: +# ({VAR_TYPE_SPEC}) | +# ({STRUCT}) | +# ({UNION}) +# )''' + +FUNC_START = rf'''(?: + (?: + (?: + extern | + static | + static\s+inline + )\s+ + )? + #(?:const\s+)? + {VAR_TYPE_SPEC} + )''' +#GLOBAL_VAR_START = rf'''(?: +# (?: +# (?: +# extern | +# static +# )\s+ +# )? +# (?: +# {TYPE_QUAL} +# (?:\s+{TYPE_QUAL})? +# )?\s+ +# {VAR_TYPE_SPEC} +# )''' +GLOBAL_DECL_START_RE = re.compile(rf''' + ^ + (?: + ({FUNC_START}) + ) + ''', re.VERBOSE) + +LOCAL_VAR_START = rf'''(?: + (?: + (?: + register | + static + )\s+ + )? + (?: + (?: + {TYPE_QUAL} + (?:\s+{TYPE_QUAL})? + )\s+ + )? + {VAR_TYPE_SPEC} + {POINTER}? 
+ )''' +LOCAL_STMT_START_RE = re.compile(rf''' + ^ + (?: + ({LOCAL_VAR_START}) + ) + ''', re.VERBOSE) + + +def iter_global_declarations(lines): + """Yield (decl, body) for each global declaration in the given lines. + + For function definitions the header is reduced to one line and + the body is provided as-is. For other compound declarations (e.g. + struct) the entire declaration is reduced to one line and "body" + is None. Likewise for simple declarations (e.g. variables). + + Declarations inside function bodies are ignored, though their text + is provided in the function body. + """ + # XXX Bail out upon bogus syntax. + lines = source.iter_clean_lines(lines) + for line in lines: + if not GLOBAL_DECL_START_RE.match(line): + continue + # We only need functions here, since we only need locals for now. + if line.endswith(';'): + continue + if line.endswith('{') and '(' not in line: + continue + + # Capture the function. + # (assume no func is a one-liner) + decl = line + while '{' not in line: # assume no inline structs, etc. + try: + line = next(lines) + except StopIteration: + return + decl += ' ' + line + + body, end = _extract_block(lines) + if end is None: + return + assert end == '}' + yield (f'{decl}\n{body}\n{end}', body) + + +def iter_local_statements(lines): + """Yield (lines, blocks) for each statement in the given lines. + + For simple statements, "blocks" is None and the statement is reduced + to a single line. For compound statements, "blocks" is a pair of + (header, body) for each block in the statement. The headers are + reduced to a single line each, but the bpdies are provided as-is. + """ + # XXX Bail out upon bogus syntax. + lines = source.iter_clean_lines(lines) + for line in lines: + if not LOCAL_STMT_START_RE.match(line): + continue + + stmt = line + blocks = None + if not line.endswith(';'): + # XXX Support compound & multiline simple statements. 
+ #blocks = [] + continue + + yield (stmt, blocks) + + +def _extract_block(lines): + end = None + depth = 1 + body = [] + for line in lines: + depth += line.count('{') - line.count('}') + if depth == 0: + end = line + break + body.append(line) + return '\n'.join(body), end + + +def parse_func(stmt, body): + """Return (name, signature) for the given function definition.""" + header, _, end = stmt.partition(body) + assert end.strip() == '}' + assert header.strip().endswith('{') + header, _, _= header.rpartition('{') + + signature = ' '.join(header.strip().splitlines()) + + _, _, name = signature.split('(')[0].strip().rpartition(' ') + assert name + + return name, signature + + +#TYPE_SPEC = rf'''(?: +# )''' +#VAR_DECLARATOR = rf'''(?: +# )''' +#VAR_DECL = rf'''(?: +# {TYPE_SPEC}+ +# {VAR_DECLARATOR} +# \s* +# )''' +#VAR_DECLARATION = rf'''(?: +# {VAR_DECL} +# (?: = [^=] [^;]* )? +# ; +# )''' +# +# +#def parse_variable(decl, *, inFunc=False): +# """Return [(name, storage, vartype)] for the given variable declaration.""" +# ... + + +def _parse_var(stmt): + """Return (name, vartype) for the given variable declaration.""" + stmt = stmt.rstrip(';') + m = LOCAL_STMT_START_RE.match(stmt) + assert m + vartype = m.group(0) + name = stmt[len(vartype):].partition('=')[0].strip() + + if name.startswith('('): + name, _, after = name[1:].partition(')') + assert after + name = name.replace('*', '* ') + inside, _, name = name.strip().rpartition(' ') + vartype = f'{vartype} ({inside.strip()}){after}' + else: + name = name.replace('*', '* ') + before, _, name = name.rpartition(' ') + vartype = f'{vartype} {before}' + + vartype = vartype.strip() + while ' ' in vartype: + vartype = vartype.replace(' ', ' ') + + return name, vartype + + +def extract_storage(decl, *, infunc=None): + """Return (storage, vartype) based on the given declaration. + + The default storage is "implicit" (or "local" if infunc is True). 
+ """ + if decl == UNKNOWN: + return decl + if decl.startswith('static '): + return 'static' + #return 'static', decl.partition(' ')[2].strip() + elif decl.startswith('extern '): + return 'extern' + #return 'extern', decl.partition(' ')[2].strip() + elif re.match('.*\b(static|extern)\b', decl): + raise NotImplementedError + elif infunc: + return 'local' + else: + return 'implicit' + + +def parse_compound(stmt, blocks): + """Return (headers, bodies) for the given compound statement.""" + # XXX Identify declarations inside compound statements + # (if/switch/for/while). + raise NotImplementedError + + +def iter_variables(filename, *, + preprocessed=False, + _iter_source_lines=source.iter_lines, + _iter_global=iter_global_declarations, + _iter_local=iter_local_statements, + _parse_func=parse_func, + _parse_var=_parse_var, + _parse_compound=parse_compound, + ): + """Yield (funcname, name, vartype) for every variable in the given file.""" + if preprocessed: + raise NotImplementedError + lines = _iter_source_lines(filename) + for stmt, body in _iter_global(lines): + # At the file top-level we only have to worry about vars & funcs. 
+ if not body: + name, vartype = _parse_var(stmt) + if name: + yield (None, name, vartype) + else: + funcname, _ = _parse_func(stmt, body) + localvars = _iter_locals(body, + _iter_statements=_iter_local, + _parse_var=_parse_var, + _parse_compound=_parse_compound, + ) + for name, vartype in localvars: + yield (funcname, name, vartype) + + +def _iter_locals(lines, *, + _iter_statements=iter_local_statements, + _parse_var=_parse_var, + _parse_compound=parse_compound, + ): + compound = [lines] + while compound: + body = compound.pop(0) + bodylines = body.splitlines() + for stmt, blocks in _iter_statements(bodylines): + if not blocks: + name, vartype = _parse_var(stmt) + if name: + yield (name, vartype) + else: + headers, bodies = _parse_compound(stmt, blocks) + for header in headers: + for line in header: + name, vartype = _parse_var(line) + if name: + yield (name, vartype) + compound.extend(bodies) + + +def iter_all(filename, *, + preprocessed=False, + ): + """Yield a Declaration for each one found. + + If there are duplicates, due to preprocessor conditionals, then + they are checked to make sure they are the same. + """ + # XXX For the moment we cheat. + for funcname, name, decl in iter_variables(filename, + preprocessed=preprocessed): + yield 'variable', funcname, name, decl diff --git a/Tools/c-analyzer/c_analyzer/parser/find.py b/Tools/c-analyzer/c_analyzer/parser/find.py new file mode 100644 index 0000000..3860d3d --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/find.py @@ -0,0 +1,107 @@ +from ..common.info import UNKNOWN, ID + +from . 
# ---- Tools/c-analyzer/c_analyzer/parser/find.py (new file in this patch) ----

from ..common.info import UNKNOWN, ID

from . import declarations

# XXX need tests:
# * variables
# * variable
# * variable_from_id


def _iter_vars(filenames, preprocessed, *,
               handle_id=None,
               _iter_decls=declarations.iter_all,
               ):
    """Yield (varid, decl) for every variable declared in the files."""
    if handle_id is None:
        handle_id = ID

    for filename in filenames or ():
        for kind, funcname, name, decl in _iter_decls(filename,
                                                      preprocessed=preprocessed,
                                                      ):
            if kind != 'variable':
                continue
            varid = handle_id(filename, funcname, name)
            yield varid, decl


# XXX Add a "handle_var" arg like we did for get_resolver()?

def variables(*filenames,
              perfilecache=None,
              preprocessed=False,
              known=None,  # for types
              handle_id=None,
              _iter_vars=_iter_vars,
              ):
    """Yield (varid, decl) for each variable found in the given files.

    If "preprocessed" is provided (and not False/None) then it is used
    to decide which tool to use to parse the source code after it runs
    through the C preprocessor.  Otherwise the raw source is parsed.
    """
    # BUG FIX: the original tested "not (filenames[0], str)"; a 2-tuple
    # is always truthy, so the branch never ran and a single iterable
    # argument was never unpacked.  The intent is clearly isinstance().
    if len(filenames) == 1 and not isinstance(filenames[0], str):
        filenames, = filenames

    if perfilecache is None:
        yield from _iter_vars(filenames, preprocessed)
    else:
        # XXX Cache per-file variables (e.g. `{filename: [(varid, decl)]}`).
        raise NotImplementedError


def variable(name, filenames, *,
             local=False,
             perfilecache=None,
             preprocessed=False,
             handle_id=None,
             _iter_vars=variables,
             ):
    """Return (varid, decl) for the first found variable that matches.

    If "local" is True then the first matching local variable in the
    file will always be returned.  To avoid that, pass perfilecache and
    pop each variable from the cache after using it.
    """
    for varid, decl in _iter_vars(filenames,
                                  perfilecache=perfilecache,
                                  preprocessed=preprocessed,
                                  ):
        if varid.name != name:
            continue
        if local:
            if varid.funcname:
                if varid.funcname == UNKNOWN:
                    raise NotImplementedError
                return varid, decl
        elif not varid.funcname:
            return varid, decl
    else:
        return None, None  # No matching variable was found.
def variable_from_id(id, filenames, *,
                     perfilecache=None,
                     preprocessed=False,
                     handle_id=None,
                     _get_var=variable,
                     ):
    """Return (varid, decl) for the first found variable that matches."""
    if isinstance(id, str):
        # A bare name: search every given file for a global.
        name = id
        local = False
    else:
        # An ID-like object may narrow the search.
        if id.funcname == UNKNOWN:
            local = True
        elif id.funcname:
            # Searching for a specific named function isn't supported yet.
            raise NotImplementedError
        else:
            local = False
        name = id.name
        if id.filename and id.filename != UNKNOWN:
            filenames = [id.filename]
    return _get_var(name, filenames,
                    local=local,
                    perfilecache=perfilecache,
                    preprocessed=preprocessed,
                    handle_id=handle_id,
                    )


# ---- Tools/c-analyzer/c_analyzer/parser/naive.py (new file in this patch) ----

import re

from ..common.info import UNKNOWN, ID

from .preprocessor import _iter_clean_lines


_NOT_SET = object()


def get_srclines(filename, *,
                 cache=None,
                 _open=open,
                 _iter_lines=_iter_clean_lines,
                 ):
    """Return the file's lines as a list.

    Each line will have trailing whitespace removed (including newline).

    If a cache is given then it is used.
    """
    if cache is not None:
        try:
            return cache[filename]
        except KeyError:
            pass

    # Preprocessor directives are dropped; the rest is kept, stripped
    # of trailing whitespace.
    with _open(filename) as srcfile:
        srclines = [line.rstrip()
                    for _, line in _iter_lines(srcfile)
                    if not line.startswith('#')]

    if cache is not None:
        cache[filename] = srclines
    return srclines
def parse_variable_declaration(srcline):
    """Return (name, decl) for the given declaration line."""
    # XXX possible false negatives...
    before_eq, assign, _ = srcline.partition('=')
    if not assign:
        if not srcline.endswith(';'):
            return None, None
        before_eq = before_eq.strip(';')
    cleaned = before_eq.strip()
    # The declared name is the last identifier, allowing for a trailing
    # array suffix (e.g. "int spam[3]").
    matched = re.match(r'.*\b(\w+)\s*(?:\[[^\]]*\])?$', cleaned)
    if matched is None:
        return None, None
    return matched.group(1), cleaned


def parse_variable(srcline, funcname=None):
    """Return (varid, decl) for the variable declared on the line (or None)."""
    stripped = srcline.strip()

    # XXX Handle more than just static variables.
    if not stripped.startswith('static '):
        return None, None
    if '(' in stripped and '[' not in stripped:
        # Looks like a function, not a variable.
        return None, None
    return parse_variable_declaration(stripped)


def iter_variables(filename, *,
                   srccache=None,
                   parse_variable=None,
                   _get_srclines=get_srclines,
                   _default_parse_variable=parse_variable,
                   ):
    """Yield (varid, decl) for each variable in the given source file."""
    if parse_variable is None:
        parse_variable = _default_parse_variable

    indent = ''
    prev = ''
    funcname = None
    for line in _get_srclines(filename, cache=srccache):
        # Track whether we are currently inside a function body.
        # NOTE(review): the block structure below was reconstructed from
        # a flattened diff; confirm the placement of this "continue"
        # against the original file.
        if funcname:
            if line == indent + '}':
                funcname = None
            continue
        else:
            if '(' in prev and line == indent + '{':
                if not prev.startswith('__attribute__'):
                    funcname = prev.split('(')[0].split()[-1]
                prev = ''
                continue
            indent = line[:-len(line.lstrip())]
            prev = line

        info = parse_variable(line, funcname)
        if isinstance(info, list):
            # An injected parser may report several declarations for one
            # line, each with its own funcname.
            for name, _funcname, decl in info:
                yield ID(filename, _funcname, name), decl
            continue
        name, decl = info

        if name is None:
            continue
        yield ID(filename, funcname, name), decl


def _match_varid(variable, name, funcname, ignored=None):
    """Return True if the variable matches the search criteria."""
    if ignored and variable in ignored:
        return False

    if variable.name != name:
        return False

    if funcname == UNKNOWN:
        # Any local variable matches.
        if not variable.funcname:
            return False
    elif variable.funcname != funcname:
        return False

    return True
def find_variable(filename, funcname, name, *,
                  ignored=None,
                  srccache=None,  # {filename: lines}
                  parse_variable=None,
                  _iter_variables=iter_variables,
                  ):
    """Return the matching variable.

    Return None if the variable is not found.
    """
    for varid, decl in _iter_variables(filename,
                                       srccache=srccache,
                                       parse_variable=parse_variable,
                                       ):
        if _match_varid(varid, name, funcname, ignored):
            return varid, decl
    else:
        return None


def find_variables(varids, filenames=None, *,
                   srccache=_NOT_SET,
                   parse_variable=None,
                   _find_symbol=find_variable,
                   ):
    """Yield (varid, decl) for each ID.

    If the variable is not found then its decl will be UNKNOWN.  That
    way there will be one resulting variable per given ID.
    """
    if srccache is _NOT_SET:
        srccache = {}

    used = set()
    for varid in varids:
        if varid.filename and varid.filename != UNKNOWN:
            srcfiles = [varid.filename]
        else:
            if not filenames:
                yield varid, UNKNOWN
                continue
            srcfiles = filenames
        for filename in srcfiles:
            # BUG FIX: the original called _find_varid(), a name that is
            # not defined anywhere (the injected hook is "_find_symbol"),
            # so this loop always raised NameError.  It also unpacked the
            # result even though find_variable() returns a bare None on a
            # miss; guard against that as well.
            found = _find_symbol(filename, varid.funcname, varid.name,
                                 ignored=used,
                                 srccache=srccache,
                                 parse_variable=parse_variable,
                                 )
            if found:
                varid, decl = found
                yield varid, decl
                used.add(varid)
                break
        else:
            yield varid, UNKNOWN


# ---- Tools/c-analyzer/c_analyzer/parser/preprocessor.py (new file in this patch) ----

from collections import namedtuple
import shlex
import os
import re

from ..common import util, info


CONTINUATION = '\\' + os.linesep

IDENTIFIER = r'(?:\w*[a-zA-Z]\w*)'
IDENTIFIER_RE = re.compile('^' + IDENTIFIER + '$')


def _coerce_str(value):
    """Return value as a stripped string ('' for any falsy value)."""
    if not value:
        return ''
    return str(value).strip()


#############################
# directives

DIRECTIVE_START = r'''
    (?:
      ^ \s*
      [#] \s*
    )'''
DIRECTIVE_TEXT = r'''
    (?:
      (?: \s+ ( .*\S ) )?
      \s* $
    )'''
+ \s* $ + )''' +DIRECTIVE = rf''' + (?: + {DIRECTIVE_START} + ( + include | + error | warning | + pragma | + define | undef | + if | ifdef | ifndef | elseif | else | endif | + __FILE__ | __LINE__ | __DATE __ | __TIME__ | __TIMESTAMP__ + ) + {DIRECTIVE_TEXT} + )''' +# (?: +# [^\\\n] | +# \\ [^\n] | +# \\ \n +# )+ +# ) \n +# )''' +DIRECTIVE_RE = re.compile(DIRECTIVE, re.VERBOSE) + +DEFINE = rf''' + (?: + {DIRECTIVE_START} define \s+ + (?: + ( \w*[a-zA-Z]\w* ) + (?: \s* [(] ([^)]*) [)] )? + ) + {DIRECTIVE_TEXT} + )''' +DEFINE_RE = re.compile(DEFINE, re.VERBOSE) + + +def parse_directive(line): + """Return the appropriate directive for the given line.""" + line = line.strip() + if line.startswith('#'): + line = line[1:].lstrip() + line = '#' + line + directive = line + #directive = '#' + line + while ' ' in directive: + directive = directive.replace(' ', ' ') + return _parse_directive(directive) + + +def _parse_directive(line): + m = DEFINE_RE.match(line) + if m: + name, args, text = m.groups() + if args: + args = [a.strip() for a in args.split(',')] + return Macro(name, args, text) + else: + return Constant(name, text) + + m = DIRECTIVE_RE.match(line) + if not m: + raise ValueError(f'unsupported directive {line!r}') + kind, text = m.groups() + if not text: + if kind not in ('else', 'endif'): + raise ValueError(f'missing text in directive {line!r}') + elif kind in ('else', 'endif', 'define'): + raise ValueError(f'unexpected text in directive {line!r}') + if kind == 'include': + directive = Include(text) + elif kind in IfDirective.KINDS: + directive = IfDirective(kind, text) + else: + directive = OtherDirective(kind, text) + directive.validate() + return directive + + +class PreprocessorDirective(util._NTBase): + """The base class for directives.""" + + __slots__ = () + + KINDS = frozenset([ + 'include', + 'pragma', + 'error', 'warning', + 'define', 'undef', + 'if', 'ifdef', 'ifndef', 'elseif', 'else', 'endif', + '__FILE__', '__DATE__', '__LINE__', '__TIME__', 
'__TIMESTAMP__', + ]) + + @property + def text(self): + return ' '.join(v for v in self[1:] if v and v.strip()) or None + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.kind: + raise TypeError('missing kind') + elif self.kind not in self.KINDS: + raise ValueError + + # text can be anything, including None. + + +class Constant(PreprocessorDirective, + namedtuple('Constant', 'kind name value')): + """A single "constant" directive ("define").""" + + __slots__ = () + + def __new__(cls, name, value=None): + self = super().__new__( + cls, + 'define', + name=_coerce_str(name) or None, + value=_coerce_str(value) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.name: + raise TypeError('missing name') + elif not IDENTIFIER_RE.match(self.name): + raise ValueError(f'name must be identifier, got {self.name!r}') + + # value can be anything, including None + + +class Macro(PreprocessorDirective, + namedtuple('Macro', 'kind name args body')): + """A single "macro" directive ("define").""" + + __slots__ = () + + def __new__(cls, name, args, body=None): + # "args" must be a string or an iterable of strings (or "empty"). + if isinstance(args, str): + args = [v.strip() for v in args.split(',')] + if args: + args = tuple(_coerce_str(a) or None for a in args) + self = super().__new__( + cls, + kind='define', + name=_coerce_str(name) or None, + args=args if args else (), + body=_coerce_str(body) or None, + ) + return self + + @property + def text(self): + if self.body: + return f'{self.name}({", ".join(self.args)}) {self.body}' + else: + return f'{self.name}({", ".join(self.args)})' + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if not self.name: + raise TypeError('missing name') + elif not IDENTIFIER_RE.match(self.name): + raise ValueError(f'name must be identifier, got {self.name!r}') + + for arg in self.args: + if not arg: + raise ValueError(f'missing arg in {self.args}') + elif not IDENTIFIER_RE.match(arg): + raise ValueError(f'arg must be identifier, got {arg!r}') + + # body can be anything, including None + + +class IfDirective(PreprocessorDirective, + namedtuple('IfDirective', 'kind condition')): + """A single conditional directive (e.g. "if", "ifdef"). + + This only includes directives that actually provide conditions. The + related directives "else" and "endif" are covered by OtherDirective + instead. + """ + + __slots__ = () + + KINDS = frozenset([ + 'if', + 'ifdef', + 'ifndef', + 'elseif', + ]) + + @classmethod + def _condition_from_raw(cls, raw, kind): + #return Condition.from_raw(raw, _kind=kind) + condition = _coerce_str(raw) + if not condition: + return None + + if kind == 'ifdef': + condition = f'defined({condition})' + elif kind == 'ifndef': + condition = f'! defined({condition})' + + return condition + + def __new__(cls, kind, condition): + kind = _coerce_str(kind) + self = super().__new__( + cls, + kind=kind or None, + condition=cls._condition_from_raw(condition, kind), + ) + return self + + @property + def text(self): + if self.kind == 'ifdef': + return self.condition[8:-1] # strip "defined(" + elif self.kind == 'ifndef': + return self.condition[10:-1] # strip "! defined(" + else: + return self.condition + #return str(self.condition) + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if not self.condition: + raise TypeError('missing condition') + #else: + # for cond in self.condition: + # if not cond: + # raise ValueError(f'missing condition in {self.condition}') + # cond.validate() + # if self.kind in ('ifdef', 'ifndef'): + # if len(self.condition) != 1: + # raise ValueError('too many condition') + # if self.kind == 'ifdef': + # if not self.condition[0].startswith('defined '): + # raise ValueError('bad condition') + # else: + # if not self.condition[0].startswith('! defined '): + # raise ValueError('bad condition') + + +class Include(PreprocessorDirective, + namedtuple('Include', 'kind file')): + """A single "include" directive. + + Supported "file" values are either follow the bracket style + (<stdio>) or double quotes ("spam.h"). + """ + + __slots__ = () + + def __new__(cls, file): + self = super().__new__( + cls, + kind='include', + file=_coerce_str(file) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + super().validate() + + if not self.file: + raise TypeError('missing file') + + +class OtherDirective(PreprocessorDirective, + namedtuple('OtherDirective', 'kind text')): + """A single directive not covered by another class. + + This includes the "else", "endif", and "undef" directives, which are + otherwise inherently related to the directives covered by the + Constant, Macro, and IfCondition classes. + + Note that all directives must have a text value, except for "else" + and "endif" (which must have no text). + """ + + __slots__ = () + + KINDS = PreprocessorDirective.KINDS - {'include', 'define'} - IfDirective.KINDS + + def __new__(cls, kind, text): + self = super().__new__( + cls, + kind=_coerce_str(kind) or None, + text=_coerce_str(text) or None, + ) + return self + + def validate(self): + """Fail if the object is invalid (i.e. 
init with bad data).""" + super().validate() + + if self.text: + if self.kind in ('else', 'endif'): + raise ValueError('unexpected text in directive') + elif self.kind not in ('else', 'endif'): + raise TypeError('missing text') + + +############################# +# iterating lines + +def _recompute_conditions(directive, ifstack): + if directive.kind in ('if', 'ifdef', 'ifndef'): + ifstack.append( + ([], directive.condition)) + elif directive.kind == 'elseif': + if ifstack: + negated, active = ifstack.pop() + if active: + negated.append(active) + else: + negated = [] + ifstack.append( + (negated, directive.condition)) + elif directive.kind == 'else': + if ifstack: + negated, active = ifstack.pop() + if active: + negated.append(active) + ifstack.append( + (negated, None)) + elif directive.kind == 'endif': + if ifstack: + ifstack.pop() + + conditions = [] + for negated, active in ifstack: + for condition in negated: + conditions.append(f'! ({condition})') + if active: + conditions.append(active) + return tuple(conditions) + + +def _iter_clean_lines(lines): + lines = iter(enumerate(lines, 1)) + for lno, line in lines: + # Handle line continuations. + while line.endswith(CONTINUATION): + try: + lno, _line = next(lines) + except StopIteration: + break + line = line[:-len(CONTINUATION)] + ' ' + _line + + # Deal with comments. + after = line + line = '' + while True: + # Look for a comment. + before, begin, remainder = after.partition('/*') + if '//' in before: + before, _, _ = before.partition('//') + line += before + ' ' # per the C99 spec + break + line += before + if not begin: + break + line += ' ' # per the C99 spec + + # Go until we find the end of the comment. 
+ _, end, after = remainder.partition('*/') + while not end: + try: + lno, remainder = next(lines) + except StopIteration: + raise Exception('unterminated comment') + _, end, after = remainder.partition('*/') + + yield lno, line + + +def iter_lines(lines, *, + _iter_clean_lines=_iter_clean_lines, + _parse_directive=_parse_directive, + _recompute_conditions=_recompute_conditions, + ): + """Yield (lno, line, directive, active conditions) for each given line. + + This is effectively a subset of the operations taking place in + translation phases 2-4 from the C99 spec (ISO/IEC 9899:TC2); see + section 5.1.1.2. Line continuations are removed and comments + replaced with a single space. (In both cases "lno" will be the last + line involved.) Otherwise each line is returned as-is. + + "lno" is the (1-indexed) line number for the line. + + "directive" will be a PreprocessorDirective or None, depending on + whether or not there is a directive on the line. + + "active conditions" is the set of preprocessor conditions (e.g. + "defined()") under which the current line of code will be included + in compilation. That set is derived from every conditional + directive block (e.g. "if defined()", "ifdef", "else") containing + that line. That includes nested directives. Note that the + current line does not affect the active conditions for iteself. + It only impacts subsequent lines. That applies to directives + that close blocks (e.g. "endif") just as much as conditional + directvies. Also note that "else" and "elseif" directives + update the active conditions (for later lines), rather than + adding to them. 
+ """ + ifstack = [] + conditions = () + for lno, line in _iter_clean_lines(lines): + stripped = line.strip() + if not stripped.startswith('#'): + yield lno, line, None, conditions + continue + + directive = '#' + stripped[1:].lstrip() + while ' ' in directive: + directive = directive.replace(' ', ' ') + directive = _parse_directive(directive) + yield lno, line, directive, conditions + + if directive.kind in ('else', 'endif'): + conditions = _recompute_conditions(directive, ifstack) + elif isinstance(directive, IfDirective): + conditions = _recompute_conditions(directive, ifstack) + + +############################# +# running (platform-specific?) + +def _gcc(filename, *, + _get_argv=(lambda: _get_gcc_argv()), + _run=util.run_cmd, + ): + argv = _get_argv() + argv.extend([ + '-E', filename, + ]) + output = _run(argv) + return output + + +def _get_gcc_argv(*, + _open=open, + _run=util.run_cmd, + ): + with _open('/tmp/print.mk', 'w') as tmpfile: + tmpfile.write('print-%:\n') + #tmpfile.write('\t@echo $* = $($*)\n') + tmpfile.write('\t@echo $($*)\n') + argv = ['/usr/bin/make', + '-f', 'Makefile', + '-f', '/tmp/print.mk', + 'print-CC', + 'print-PY_CORE_CFLAGS', + ] + output = _run(argv) + gcc, cflags = output.strip().splitlines() + argv = shlex.split(gcc.strip()) + cflags = shlex.split(cflags.strip()) + return argv + cflags + + +def run(filename, *, + _gcc=_gcc, + ): + """Return the text of the given file after running the preprocessor.""" + return _gcc(filename) diff --git a/Tools/c-analyzer/c_analyzer/parser/source.py b/Tools/c-analyzer/c_analyzer/parser/source.py new file mode 100644 index 0000000..f8998c8 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/parser/source.py @@ -0,0 +1,34 @@ +from . import preprocessor + + +def iter_clean_lines(lines): + incomment = False + for line in lines: + # Deal with comments. 
+ if incomment: + _, sep, line = line.partition('*/') + if sep: + incomment = False + continue + line, _, _ = line.partition('//') + line, sep, remainder = line.partition('/*') + if sep: + _, sep, after = remainder.partition('*/') + if not sep: + incomment = True + continue + line += ' ' + after + + # Ignore blank lines and leading/trailing whitespace. + line = line.strip() + if not line: + continue + + yield line + + +def iter_lines(filename, *, + preprocess=preprocessor.run, + ): + content = preprocess(filename) + return iter(content.splitlines()) diff --git a/Tools/c-analyzer/c_analyzer/symbols/__init__.py b/Tools/c-analyzer/c_analyzer/symbols/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/symbols/_nm.py b/Tools/c-analyzer/c_analyzer/symbols/_nm.py new file mode 100644 index 0000000..f3a75a6 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/_nm.py @@ -0,0 +1,117 @@ +import os.path +import shutil + +from c_analyzer.common import util, info + +from .info import Symbol + + +# XXX need tests: +# * iter_symbols + +NM_KINDS = { + 'b': Symbol.KIND.VARIABLE, # uninitialized + 'd': Symbol.KIND.VARIABLE, # initialized + #'g': Symbol.KIND.VARIABLE, # uninitialized + #'s': Symbol.KIND.VARIABLE, # initialized + 't': Symbol.KIND.FUNCTION, + } + +SPECIAL_SYMBOLS = { + # binary format (e.g. 
ELF) + '__bss_start', + '__data_start', + '__dso_handle', + '_DYNAMIC', + '_edata', + '_end', + '__environ@@GLIBC_2.2.5', + '_GLOBAL_OFFSET_TABLE_', + '__JCR_END__', + '__JCR_LIST__', + '__TMC_END__', + } + + +def _is_special_symbol(name): + if name in SPECIAL_SYMBOLS: + return True + if '@@GLIBC' in name: + return True + return False + + +def iter_symbols(binfile, *, + nm=None, + handle_id=None, + _which=shutil.which, + _run=util.run_cmd, + ): + """Yield a Symbol for each relevant entry reported by the "nm" command.""" + if nm is None: + nm = _which('nm') + if not nm: + raise NotImplementedError + if handle_id is None: + handle_id = info.ID + + argv = [nm, + '--line-numbers', + binfile, + ] + try: + output = _run(argv) + except Exception: + if nm is None: + # XXX Use dumpbin.exe /SYMBOLS on Windows. + raise NotImplementedError + raise + for line in output.splitlines(): + (name, kind, external, filename, funcname, + ) = _parse_nm_line(line) + if kind != Symbol.KIND.VARIABLE: + continue + elif _is_special_symbol(name): + continue + yield Symbol( + id=handle_id(filename, funcname, name), + kind=kind, + external=external, + ) + + +def _parse_nm_line(line): + _origline = line + _, _, line = line.partition(' ') # strip off the address + line = line.strip() + + kind, _, line = line.partition(' ') + line = line.strip() + external = kind.isupper() + kind = NM_KINDS.get(kind.lower(), Symbol.KIND.OTHER) + + name, _, filename = line.partition('\t') + name = name.strip() + if filename: + filename = os.path.relpath(filename.partition(':')[0]) + else: + filename = info.UNKNOWN + + name, islocal = _parse_nm_name(name, kind) + funcname = info.UNKNOWN if islocal else None + return name, kind, external, filename, funcname + + +def _parse_nm_name(name, kind): + if kind != Symbol.KIND.VARIABLE: + return name, None + if _is_special_symbol(name): + return name, None + + actual, sep, digits = name.partition('.') + if not sep: + return name, False + + if not digits.isdigit(): + raise 
Exception(f'got bogus name {name}') + return actual, True diff --git a/Tools/c-analyzer/c_analyzer/symbols/find.py b/Tools/c-analyzer/c_analyzer/symbols/find.py new file mode 100644 index 0000000..8564652 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/symbols/find.py @@ -0,0 +1,175 @@ +import os +import os.path +import shutil + +from ..common import files +from ..common.info import UNKNOWN, ID +from ..parser import find as p_find + +from . import _nm +from .info import Symbol + +# XXX need tests: +# * get_resolver() +# * get_resolver_from_dirs() +# * symbol() +# * symbols() +# * variables() + + +def _resolve_known(symbol, knownvars): + for varid in knownvars: + if symbol.match(varid): + break + else: + return None + return knownvars.pop(varid) + + +def get_resolver(filenames=None, known=None, *, + handle_var, + check_filename=None, + perfilecache=None, + preprocessed=False, + _from_source=p_find.variable_from_id, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "handle_var" is a callable that takes (ID, decl) and returns a + Variable. Variable.from_id is a suitable callable. + + The returned func takes a single Symbol and returns a corresponding + Variable. If the symbol was located then the variable will be + valid, populated with the corresponding information. Otherwise None + is returned. + """ + knownvars = (known or {}).get('variables') + if knownvars: + knownvars = dict(knownvars) # a copy + if filenames: + if check_filename is None: + filenames = list(filenames) + def check_filename(filename): + return filename in filenames + def resolve(symbol): + # XXX Check "found" instead? 
+ if not check_filename(symbol.filename): + return None + found = _resolve_known(symbol, knownvars) + if found is None: + #return None + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + found = handle_var(varid, decl) + return found + else: + def resolve(symbol): + return _resolve_known(symbol, knownvars) + elif filenames: + def resolve(symbol): + varid, decl = _from_source(symbol, filenames, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + return handle_var(varid, decl) + else: + def resolve(symbol): + return None + return resolve + + +def get_resolver_from_dirs(dirnames, known=None, *, + handle_var, + suffixes=('.c',), + perfilecache=None, + preprocessed=False, + _iter_files=files.iter_files_by_suffix, + _get_resolver=get_resolver, + ): + """Return a "resolver" func for the given known vars/types and filenames. + + "dirnames" should be absolute paths. If not then they will be + resolved relative to CWD. + + See get_resolver(). + """ + dirnames = [d if d.endswith(os.path.sep) else d + os.path.sep + for d in dirnames] + filenames = _iter_files(dirnames, suffixes) + def check_filename(filename): + for dirname in dirnames: + if filename.startswith(dirname): + return True + else: + return False + return _get_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=perfilecache, + preprocessed=preprocessed, + ) + + +def symbol(symbol, filenames, known=None, *, + perfilecache=None, + preprocessed=False, + handle_id=None, + _get_resolver=get_resolver, + ): + """Return a Variable for the one matching the given symbol. + + "symbol" can be one of several objects: + + * Symbol - use the contained info + * name (str) - look for a global variable with that name + * (filename, name) - look for named global in file + * (filename, funcname, name) - look for named local in file + + A name is always required. 
def symbol(symbol, filenames, known=None, *,
           perfilecache=None,
           preprocessed=False,
           handle_id=None,
           _get_resolver=get_resolver,
           ):
    """Return a Variable for the one matching the given symbol.

    "symbol" can be one of several objects:

    * Symbol - use the contained info
    * name (str) - look for a global variable with that name
    * (filename, name) - look for named global in file
    * (filename, funcname, name) - look for named local in file

    A name is always required.  If the filename is None, "", or
    "UNKNOWN" then all files will be searched.  If the funcname is
    "" or "UNKNOWN" then only local variables will be searched for.
    """
    # NOTE(review): get_resolver() is declared as
    # get_resolver(filenames=None, known=None, *, handle_var, ...); this
    # call passes (known, filenames) in the opposite order and a
    # "handle_id" keyword that get_resolver() does not accept, so it
    # raises TypeError when reached.  Preserved as-is -- fixing it needs
    # an upstream decision about where "handle_var" should come from.
    resolve = _get_resolver(known, filenames,
                            handle_id=handle_id,
                            perfilecache=perfilecache,
                            preprocessed=preprocessed,
                            )
    return resolve(symbol)


def _get_platform_tool():
    """Return a callable that yields symbols from a binary, per-platform."""
    if os.name == 'nt':
        # XXX Support this (e.g. dumpbin.exe /SYMBOLS).
        raise NotImplementedError
    elif nm := shutil.which('nm'):
        return lambda binfile, hid: _nm.iter_symbols(binfile, nm=nm, handle_id=hid)
    else:
        raise NotImplementedError


def symbols(binfile, *,
            handle_id=None,
            _file_exists=os.path.exists,
            _get_platform_tool=_get_platform_tool,
            ):
    """Yield a Symbol for each one found in the binary."""
    if not _file_exists(binfile):
        raise Exception('executable missing (need to build it first?)')

    iter_syms = _get_platform_tool()
    yield from iter_syms(binfile, handle_id)


def variables(binfile, *,
              resolve,
              handle_id=None,
              _iter_symbols=symbols,
              ):
    """Yield (Variable, Symbol) for each found symbol."""
    for sym in _iter_symbols(binfile, handle_id=handle_id):
        if sym.kind != Symbol.KIND.VARIABLE:
            continue
        yield (resolve(sym) or None), sym
return cls(id, kind, external) + + def __new__(cls, id, kind=KIND.VARIABLE, external=None): + self = super().__new__( + cls, + id=ID.from_raw(id), + kind=str(kind) if kind else None, + external=bool(external) if external is not None else None, + ) + return self + + def __hash__(self): + return hash(self.id) + + def __getattr__(self, name): + return getattr(self.id, name) + + def validate(self): + """Fail if the object is invalid (i.e. init with bad data).""" + if not self.id: + raise TypeError('missing id') + else: + self.id.validate() + + if not self.kind: + raise TypeError('missing kind') + elif self.kind not in vars(self.KIND).values(): + raise ValueError(f'unsupported kind {self.kind}') + + if self.external is None: + raise TypeError('missing external') diff --git a/Tools/c-analyzer/c_analyzer/variables/__init__.py b/Tools/c-analyzer/c_analyzer/variables/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/__init__.py diff --git a/Tools/c-analyzer/c_analyzer/variables/find.py b/Tools/c-analyzer/c_analyzer/variables/find.py new file mode 100644 index 0000000..3fe7284 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/find.py @@ -0,0 +1,75 @@ +from ..common import files +from ..common.info import UNKNOWN +from ..parser import ( + find as p_find, + ) +from ..symbols import ( + info as s_info, + find as s_find, + ) +from .info import Variable + +# XXX need tests: +# * vars_from_source + + +def _remove_cached(cache, var): + if not cache: + return + try: + cached = cache[var.filename] + cached.remove(var) + except (KeyError, IndexError): + pass + + +def vars_from_binary(binfile, *, + known=None, + filenames=None, + handle_id=None, + check_filename=None, + handle_var=Variable.from_id, + _iter_vars=s_find.variables, + _get_symbol_resolver=s_find.get_resolver, + ): + """Yield a Variable for each found Symbol. + + Details are filled in from the given "known" variables and types. 
+ """ + cache = {} + resolve = _get_symbol_resolver(filenames, known, + handle_var=handle_var, + check_filename=check_filename, + perfilecache=cache, + ) + for var, symbol in _iter_vars(binfile, + resolve=resolve, + handle_id=handle_id, + ): + if var is None: + var = Variable(symbol.id, UNKNOWN, UNKNOWN) + yield var + _remove_cached(cache, var) + + +def vars_from_source(filenames, *, + preprocessed=None, + known=None, + handle_id=None, + handle_var=Variable.from_id, + iter_vars=p_find.variables, + ): + """Yield a Variable for each declaration in the raw source code. + + Details are filled in from the given "known" variables and types. + """ + cache = {} + for varid, decl in iter_vars(filenames or (), + perfilecache=cache, + preprocessed=preprocessed, + known=known, + handle_id=handle_id, + ): + var = handle_var(varid, decl) + yield var + _remove_cached(cache, var) diff --git a/Tools/c-analyzer/c_analyzer/variables/info.py b/Tools/c-analyzer/c_analyzer/variables/info.py new file mode 100644 index 0000000..336a523 --- /dev/null +++ b/Tools/c-analyzer/c_analyzer/variables/info.py @@ -0,0 +1,93 @@ +from collections import namedtuple + +from ..common.info import ID, UNKNOWN +from ..common.util import classonly, _NTBase + + +def normalize_vartype(vartype): + """Return the canonical form for a variable type (or func signature).""" + # We allow empty strring through for semantic reasons. + if vartype is None: + return None + + # XXX finish! + # XXX Return (modifiers, type, pointer)? + return str(vartype) + + +# XXX Variable.vartype -> decl (Declaration). 
class Variable(_NTBase,
               namedtuple('Variable', 'id storage vartype')):
    """Information about a single variable declaration."""

    __slots__ = ()

    # The supported storage classes (enforced by validate()).
    STORAGE = (
            'static',
            'extern',
            'implicit',
            'local',
            )

    @classonly
    def from_parts(cls, filename, funcname, name, decl, storage=None):
        """Return a Variable built from the parts of its ID plus the decl.

        If "storage" is not given it is inferred from the declaration.
        """
        varid = ID(filename, funcname, name)
        if storage is None:
            self = cls.from_id(varid, decl)
        else:
            self = cls(varid, storage, decl)
        return self

    @classonly
    def from_id(cls, varid, decl):
        """Return a Variable with the storage extracted from the decl."""
        # Imported lazily to avoid a circular import with the parser.
        from ..parser.declarations import extract_storage
        storage = extract_storage(decl, infunc=varid.funcname)
        return cls(varid, storage, decl)

    def __new__(cls, id, storage, vartype):
        # Coerce each field to its canonical form (None for falsy values).
        self = super().__new__(
                cls,
                id=ID.from_raw(id),
                storage=str(storage) if storage else None,
                vartype=normalize_vartype(vartype) if vartype else None,
                )
        return self

    def __hash__(self):
        return hash(self.id)

    def __getattr__(self, name):
        # Expose the ID's fields (filename, funcname, name) directly.
        return getattr(self.id, name)

    def _validate_id(self):
        # Fail if the ID is missing or only partially known.
        if not self.id:
            raise TypeError('missing id')

        if not self.filename or self.filename == UNKNOWN:
            raise TypeError(f'id missing filename ({self.id})')

        if self.funcname and self.funcname == UNKNOWN:
            raise TypeError(f'id missing funcname ({self.id})')

        self.id.validate()

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        self._validate_id()

        if self.storage is None or self.storage == UNKNOWN:
            raise TypeError('missing storage')
        elif self.storage not in self.STORAGE:
            # BUG FIX: the original used "{self.storage:r}", which is an
            # invalid format spec and raised "ValueError: Unknown format
            # code 'r'" instead of this message; "!r" is the repr conversion.
            raise ValueError(f'unsupported storage {self.storage!r}')

        if self.vartype is None or self.vartype == UNKNOWN:
            raise TypeError('missing vartype')

    @property
    def isglobal(self):
        # Anything that is not function-local counts as global.
        return self.storage != 'local'

    @property
    def isconst(self):
        # A whitespace-delimited "const" anywhere in the type counts.
        return 'const' in self.vartype.split()


# ===== Tools/c-analyzer/c_analyzer/variables/known.py (new) =====

import csv

from ..common.info import ID, UNKNOWN
from ..common.util import read_tsv
from .info import Variable


# XXX need tests:
# * read_file()
# * look_up_variable()


# The column layout of the known-variables TSV data file.
COLUMNS = ('filename', 'funcname', 'name', 'kind', 'declaration')
HEADER = '\t'.join(COLUMNS)
+ """ + for row in _read_tsv(infile, HEADER): + filename, funcname, name, kind, declaration = row + if not funcname or funcname == '-': + funcname = None + id = ID(filename, funcname, name) + yield kind, id, declaration + + +def from_file(infile, *, + handle_var=Variable.from_id, + _read_file=read_file, + ): + """Return the info for known declarations in the given file.""" + known = { + 'variables': {}, + #'types': {}, + #'constants': {}, + #'macros': {}, + } + for kind, id, decl in _read_file(infile): + if kind == 'variable': + values = known['variables'] + value = handle_var(id, decl) + else: + raise ValueError(f'unsupported kind in row {row}') + value.validate() + values[id] = value + return known + + +def look_up_variable(varid, knownvars, *, + match_files=(lambda f1, f2: f1 == f2), + ): + """Return the known Variable matching the given ID. + + "knownvars" is a mapping of ID to Variable. + + "match_files" is used to verify if two filenames point to + the same file. + + If no match is found then None is returned. + """ + if not knownvars: + return None + + if varid.funcname == UNKNOWN: + if not varid.filename or varid.filename == UNKNOWN: + for varid in knownvars: + if not varid.funcname: + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + else: + for varid in knownvars: + if not varid.funcname: + continue + if not match_files(varid.filename, varid.filename): + continue + if varid.name == varid.name: + return knownvars[varid] + else: + return None + elif not varid.filename or varid.filename == UNKNOWN: + raise NotImplementedError + else: + return knownvars.get(varid.id) |