summaryrefslogtreecommitdiffstats
path: root/Tools/c-analyzer/c_analyzer
diff options
context:
space:
mode:
authorEric Snow <ericsnowcurrently@gmail.com>2019-10-19 02:00:04 (GMT)
committerGitHub <noreply@github.com>2019-10-19 02:00:04 (GMT)
commite4c431ecf50def40eb93c3969c1e4eeaf7bf32f1 (patch)
tree071224bbded262901b9742eb82c5d82d2f744fe1 /Tools/c-analyzer/c_analyzer
parentea55c51bd937f6019c35b39b87029644e469c059 (diff)
downloadcpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.zip
cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.gz
cpython-e4c431ecf50def40eb93c3969c1e4eeaf7bf32f1.tar.bz2
bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It also is preparation for getting the variables from the source (cross-platform) rather than from the symbols. The change only touches the tool (and its tests).
Diffstat (limited to 'Tools/c-analyzer/c_analyzer')
-rw-r--r--Tools/c-analyzer/c_analyzer/__init__.py0
-rw-r--r--Tools/c-analyzer/c_analyzer/common/__init__.py0
-rw-r--r--Tools/c-analyzer/c_analyzer/common/files.py120
-rw-r--r--Tools/c-analyzer/c_analyzer/common/info.py138
-rw-r--r--Tools/c-analyzer/c_analyzer/common/show.py11
-rw-r--r--Tools/c-analyzer/c_analyzer/common/util.py243
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/__init__.py0
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/declarations.py339
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/find.py107
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/naive.py179
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/preprocessor.py511
-rw-r--r--Tools/c-analyzer/c_analyzer/parser/source.py34
-rw-r--r--Tools/c-analyzer/c_analyzer/symbols/__init__.py0
-rw-r--r--Tools/c-analyzer/c_analyzer/symbols/_nm.py117
-rw-r--r--Tools/c-analyzer/c_analyzer/symbols/find.py175
-rw-r--r--Tools/c-analyzer/c_analyzer/symbols/info.py51
-rw-r--r--Tools/c-analyzer/c_analyzer/variables/__init__.py0
-rw-r--r--Tools/c-analyzer/c_analyzer/variables/find.py75
-rw-r--r--Tools/c-analyzer/c_analyzer/variables/info.py93
-rw-r--r--Tools/c-analyzer/c_analyzer/variables/known.py91
20 files changed, 2284 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_analyzer/__init__.py b/Tools/c-analyzer/c_analyzer/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/__init__.py
diff --git a/Tools/c-analyzer/c_analyzer/common/__init__.py b/Tools/c-analyzer/c_analyzer/common/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/common/__init__.py
diff --git a/Tools/c-analyzer/c_analyzer/common/files.py b/Tools/c-analyzer/c_analyzer/common/files.py
new file mode 100644
index 0000000..ab551a8
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/common/files.py
@@ -0,0 +1,120 @@
+import glob
+import os
+import os.path
+
+# XXX need tests:
+# * walk_tree()
+# * glob_tree()
+# * iter_files_by_suffix()
+
+
+C_SOURCE_SUFFIXES = ('.c', '.h')
+
+
+def _walk_tree(root, *,
+ _walk=os.walk,
+ ):
+ # A wrapper around os.walk that resolves the filenames.
+ for parent, _, names in _walk(root):
+ for name in names:
+ yield os.path.join(parent, name)
+
+
+def walk_tree(root, *,
+ suffix=None,
+ walk=_walk_tree,
+ ):
+ """Yield each file in the tree under the given directory name.
+
+ If "suffix" is provided then only files with that suffix will
+ be included.
+ """
+ if suffix and not isinstance(suffix, str):
+ raise ValueError('suffix must be a string')
+
+ for filename in walk(root):
+ if suffix and not filename.endswith(suffix):
+ continue
+ yield filename
+
+
+def glob_tree(root, *,
+ suffix=None,
+ _glob=glob.iglob,
+ ):
+ """Yield each file in the tree under the given directory name.
+
+ If "suffix" is provided then only files with that suffix will
+ be included.
+ """
+ suffix = suffix or ''
+ if not isinstance(suffix, str):
+ raise ValueError('suffix must be a string')
+
+ for filename in _glob(f'{root}/*{suffix}'):
+ yield filename
+ for filename in _glob(f'{root}/**/*{suffix}'):
+ yield filename
+
+
+def iter_files(root, suffix=None, relparent=None, *,
+ get_files=os.walk,
+ _glob=glob_tree,
+ _walk=walk_tree,
+ ):
+ """Yield each file in the tree under the given directory name.
+
+ If "root" is a non-string iterable then do the same for each of
+ those trees.
+
+ If "suffix" is provided then only files with that suffix will
+ be included.
+
+ if "relparent" is provided then it is used to resolve each
+ filename as a relative path.
+ """
+ if not isinstance(root, str):
+ roots = root
+ for root in roots:
+ yield from iter_files(root, suffix, relparent,
+ get_files=get_files,
+ _glob=_glob, _walk=_walk)
+ return
+
+ # Use the right "walk" function.
+ if get_files in (glob.glob, glob.iglob, glob_tree):
+ get_files = _glob
+ else:
+ _files = _walk_tree if get_files in (os.walk, walk_tree) else get_files
+ get_files = (lambda *a, **k: _walk(*a, walk=_files, **k))
+
+ # Handle a single suffix.
+ if suffix and not isinstance(suffix, str):
+ filenames = get_files(root)
+ suffix = tuple(suffix)
+ else:
+ filenames = get_files(root, suffix=suffix)
+ suffix = None
+
+ for filename in filenames:
+ if suffix and not isinstance(suffix, str): # multiple suffixes
+ if not filename.endswith(suffix):
+ continue
+ if relparent:
+ filename = os.path.relpath(filename, relparent)
+ yield filename
+
+
+def iter_files_by_suffix(root, suffixes, relparent=None, *,
+ walk=walk_tree,
+ _iter_files=iter_files,
+ ):
+ """Yield each file in the tree that has the given suffixes.
+
+ Unlike iter_files(), the results are in the original suffix order.
+ """
+ if isinstance(suffixes, str):
+ suffixes = [suffixes]
+ # XXX Ignore repeated suffixes?
+ for suffix in suffixes:
+ yield from _iter_files(root, suffix, relparent)
diff --git a/Tools/c-analyzer/c_analyzer/common/info.py b/Tools/c-analyzer/c_analyzer/common/info.py
new file mode 100644
index 0000000..3f3f8c5
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/common/info.py
@@ -0,0 +1,138 @@
+from collections import namedtuple
+import re
+
+from .util import classonly, _NTBase
+
+# XXX need tests:
+# * ID.match()
+
+
+UNKNOWN = '???'
+
+NAME_RE = re.compile(r'^([a-zA-Z]|_\w*[a-zA-Z]\w*|[a-zA-Z]\w*)$')
+
+
+class ID(_NTBase, namedtuple('ID', 'filename funcname name')):
+    """A unique ID for a single symbol or declaration.
+
+    "filename" and "funcname" may be None (e.g. for symbols pulled
+    from a binary) or the UNKNOWN marker; "name" is required.
+    """
+
+    __slots__ = ()
+    # XXX Add optional conditions (tuple of strings) field.
+    #conditions = Slot()
+
+    @classonly
+    def from_raw(cls, raw):
+        """Coerce a string, 1-tuple, or 2-tuple into an ID."""
+        if not raw:
+            return None
+        if isinstance(raw, str):
+            return cls(None, None, raw)
+        try:
+            name, = raw
+            filename = None
+        except ValueError:
+            try:
+                filename, name = raw
+            except ValueError:
+                return super().from_raw(raw)
+        return cls(filename, None, name)
+
+    def __new__(cls, filename, funcname, name):
+        # Coerce each field to str (or None if falsy).
+        self = super().__new__(
+                cls,
+                filename=str(filename) if filename else None,
+                funcname=str(funcname) if funcname else None,
+                name=str(name) if name else None,
+                )
+        #cls.conditions.set(self, tuple(str(s) if s else None
+        #                               for s in conditions or ()))
+        return self
+
+    def validate(self):
+        """Fail if the object is invalid (i.e. init with bad data)."""
+        if not self.name:
+            raise TypeError('missing name')
+        else:
+            if not NAME_RE.match(self.name):
+                raise ValueError(
+                        f'name must be an identifier, got {self.name!r}')
+
+        # Symbols from a binary might not have filename/funcname info.
+
+        if self.funcname:
+            if not self.filename:
+                raise TypeError('missing filename')
+            if not NAME_RE.match(self.funcname) and self.funcname != UNKNOWN:
+                raise ValueError(
+                        f'funcname must be an identifier, got {self.funcname!r}')
+
+        # XXX Require the filename (at least UNKNOWN)?
+        # XXX Check the filename?
+
+    @property
+    def islocal(self):
+        # A variable is function-local iff it has a funcname.
+        return self.funcname is not None
+
+    def match(self, other, *,
+              match_files=(lambda f1, f2: f1 == f2),
+              ):
+        """Return True if the two match.
+
+        At least one of the two must be completely valid (no UNKNOWN
+        anywhere). Otherwise False is returned. The remaining one
+        *may* have UNKNOWN for both funcname and filename. It must
+        have a valid name though.
+
+        The caller is responsible for knowing which of the two is valid
+        (and which to use if both are valid).
+        """
+        # First check the name.
+        if self.name is None:
+            return False
+        if other.name != self.name:
+            return False
+
+        # Then check the filename.
+        if self.filename is None:
+            return False
+        if other.filename is None:
+            return False
+        if self.filename == UNKNOWN:
+            # "other" must be the valid one.
+            if other.funcname == UNKNOWN:
+                return False
+            elif self.funcname != UNKNOWN:
+                # XXX Try matching funcname even though we don't
+                # know the filename?
+                raise NotImplementedError
+            else:
+                return True
+        elif other.filename == UNKNOWN:
+            # "self" must be the valid one.
+            if self.funcname == UNKNOWN:
+                return False
+            elif other.funcname != UNKNOWN:
+                # XXX Try matching funcname even though we don't
+                # know the filename?
+                raise NotImplementedError
+            else:
+                return True
+        elif not match_files(self.filename, other.filename):
+            return False
+
+        # Finally, check the funcname.
+        if self.funcname == UNKNOWN:
+            # "other" must be the valid one.
+            if other.funcname == UNKNOWN:
+                return False
+            else:
+                return other.funcname is not None
+        elif other.funcname == UNKNOWN:
+            # "self" must be the valid one.
+            if self.funcname == UNKNOWN:
+                return False
+            else:
+                return self.funcname is not None
+        elif self.funcname == other.funcname:
+            # Both are valid.
+            return True
+
+        return False
diff --git a/Tools/c-analyzer/c_analyzer/common/show.py b/Tools/c-analyzer/c_analyzer/common/show.py
new file mode 100644
index 0000000..5f3cb1c
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/common/show.py
@@ -0,0 +1,11 @@
+
+def basic(variables, *,
+          _print=print):
+    """Print each row simply."""
+    for var in variables:
+        if var.funcname:
+            # Local variable: "<filename>:<funcname>():<name>".
+            line = f'{var.filename}:{var.funcname}():{var.name}'
+        else:
+            # Global variable: "<filename>:<name>".
+            line = f'{var.filename}:{var.name}'
+        # Left-pad to 64 columns so the vartype column lines up.
+        line = f'{line:<64} {var.vartype}'
+        _print(line)
diff --git a/Tools/c-analyzer/c_analyzer/common/util.py b/Tools/c-analyzer/c_analyzer/common/util.py
new file mode 100644
index 0000000..43d0bb6
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/common/util.py
@@ -0,0 +1,243 @@
+import csv
+import subprocess
+
+
+_NOT_SET = object()
+
+
+def run_cmd(argv, **kwargs):
+ proc = subprocess.run(
+ argv,
+ #capture_output=True,
+ #stderr=subprocess.STDOUT,
+ stdout=subprocess.PIPE,
+ text=True,
+ check=True,
+ **kwargs
+ )
+ return proc.stdout
+
+
+def read_tsv(infile, header, *,
+ _open=open,
+ _get_reader=csv.reader,
+ ):
+ """Yield each row of the given TSV (tab-separated) file."""
+ if isinstance(infile, str):
+ with _open(infile, newline='') as infile:
+ yield from read_tsv(infile, header,
+ _open=_open,
+ _get_reader=_get_reader,
+ )
+ return
+ lines = iter(infile)
+
+ # Validate the header.
+ try:
+ actualheader = next(lines).strip()
+ except StopIteration:
+ actualheader = ''
+ if actualheader != header:
+ raise ValueError(f'bad header {actualheader!r}')
+
+ for row in _get_reader(lines, delimiter='\t'):
+ yield tuple(v.strip() for v in row)
+
+
+def write_tsv(outfile, header, rows, *,
+ _open=open,
+ _get_writer=csv.writer,
+ ):
+ """Write each of the rows to the given TSV (tab-separated) file."""
+ if isinstance(outfile, str):
+ with _open(outfile, 'w', newline='') as outfile:
+ return write_tsv(outfile, header, rows,
+ _open=_open,
+ _get_writer=_get_writer,
+ )
+
+ if isinstance(header, str):
+ header = header.split('\t')
+ writer = _get_writer(outfile, delimiter='\t')
+ writer.writerow(header)
+ for row in rows:
+ writer.writerow('' if v is None else str(v)
+ for v in row)
+
+
+class Slot:
+ """A descriptor that provides a slot.
+
+ This is useful for types that can't have slots via __slots__,
+ e.g. tuple subclasses.
+ """
+
+ __slots__ = ('initial', 'default', 'readonly', 'instances', 'name')
+
+ def __init__(self, initial=_NOT_SET, *,
+ default=_NOT_SET,
+ readonly=False,
+ ):
+ self.initial = initial
+ self.default = default
+ self.readonly = readonly
+
+ # The instance cache is not inherently tied to the normal
+ # lifetime of the instances. So must do something in order to
+ # avoid keeping the instances alive by holding a reference here.
+ # Ideally we would use weakref.WeakValueDictionary to do this.
+ # However, most builtin types do not support weakrefs. So
+ # instead we monkey-patch __del__ on the attached class to clear
+ # the instance.
+ self.instances = {}
+ self.name = None
+
+ def __set_name__(self, cls, name):
+ if self.name is not None:
+ raise TypeError('already used')
+ self.name = name
+ try:
+ slotnames = cls.__slot_names__
+ except AttributeError:
+ slotnames = cls.__slot_names__ = []
+ slotnames.append(name)
+ self._ensure___del__(cls, slotnames)
+
+ def __get__(self, obj, cls):
+ if obj is None: # called on the class
+ return self
+ try:
+ value = self.instances[id(obj)]
+ except KeyError:
+ if self.initial is _NOT_SET:
+ value = self.default
+ else:
+ value = self.initial
+ self.instances[id(obj)] = value
+ if value is _NOT_SET:
+ raise AttributeError(self.name)
+ # XXX Optionally make a copy?
+ return value
+
+ def __set__(self, obj, value):
+ if self.readonly:
+ raise AttributeError(f'{self.name} is readonly')
+ # XXX Optionally coerce?
+ self.instances[id(obj)] = value
+
+ def __delete__(self, obj):
+ if self.readonly:
+ raise AttributeError(f'{self.name} is readonly')
+ self.instances[id(obj)] = self.default # XXX refleak?
+
+ def _ensure___del__(self, cls, slotnames): # See the comment in __init__().
+ try:
+ old___del__ = cls.__del__
+ except AttributeError:
+ old___del__ = (lambda s: None)
+ else:
+ if getattr(old___del__, '_slotted', False):
+ return
+
+ def __del__(_self):
+ for name in slotnames:
+ delattr(_self, name)
+ old___del__(_self)
+ __del__._slotted = True
+ cls.__del__ = __del__
+
+ def set(self, obj, value):
+ """Update the cached value for an object.
+
+ This works even if the descriptor is read-only. This is
+ particularly useful when initializing the object (e.g. in
+ its __new__ or __init__).
+ """
+ self.instances[id(obj)] = value
+
+
+class classonly:
+ """A non-data descriptor that makes a value only visible on the class.
+
+ This is like the "classmethod" builtin, but does not show up on
+ instances of the class. It may be used as a decorator.
+ """
+
+ def __init__(self, value):
+ self.value = value
+ self.getter = classmethod(value).__get__
+ self.name = None
+
+ def __set_name__(self, cls, name):
+ if self.name is not None:
+ raise TypeError('already used')
+ self.name = name
+
+ def __get__(self, obj, cls):
+ if obj is not None:
+ raise AttributeError(self.name)
+ # called on the class
+ return self.getter(None, cls)
+
+
+class _NTBase:
+
+ __slots__ = ()
+
+ @classonly
+ def from_raw(cls, raw):
+ if not raw:
+ return None
+ elif isinstance(raw, cls):
+ return raw
+ elif isinstance(raw, str):
+ return cls.from_string(raw)
+ else:
+ if hasattr(raw, 'items'):
+ return cls(**raw)
+ try:
+ args = tuple(raw)
+ except TypeError:
+ pass
+ else:
+ return cls(*args)
+ raise NotImplementedError
+
+ @classonly
+ def from_string(cls, value):
+ """Return a new instance based on the given string."""
+ raise NotImplementedError
+
+ @classmethod
+ def _make(cls, iterable): # The default _make() is not subclass-friendly.
+ return cls.__new__(cls, *iterable)
+
+ # XXX Always validate?
+ #def __init__(self, *args, **kwargs):
+ # self.validate()
+
+ # XXX The default __repr__() is not subclass-friendly (where the name changes).
+ #def __repr__(self):
+ # _, _, sig = super().__repr__().partition('(')
+ # return f'{self.__class__.__name__}({sig}'
+
+ # To make sorting work with None:
+ def __lt__(self, other):
+ try:
+ return super().__lt__(other)
+ except TypeError:
+ if None in self:
+ return True
+ elif None in other:
+ return False
+ else:
+ raise
+
+ def validate(self):
+ return
+
+ # XXX Always validate?
+ #def _replace(self, **kwargs):
+ # obj = super()._replace(**kwargs)
+ # obj.validate()
+ # return obj
diff --git a/Tools/c-analyzer/c_analyzer/parser/__init__.py b/Tools/c-analyzer/c_analyzer/parser/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/__init__.py
diff --git a/Tools/c-analyzer/c_analyzer/parser/declarations.py b/Tools/c-analyzer/c_analyzer/parser/declarations.py
new file mode 100644
index 0000000..f37072c
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/declarations.py
@@ -0,0 +1,339 @@
+import re
+import shlex
+import subprocess
+
+from ..common.info import UNKNOWN
+
+from . import source
+
+
+IDENTIFIER = r'(?:[a-zA-Z]|_+[a-zA-Z0-9]\w*)'  # NOTE: [a-zA-z] would also match "[\]^_`"
+
+TYPE_QUAL = r'(?:const|volatile)'
+
+VAR_TYPE_SPEC = r'''(?:
+ void |
+ (?:
+ (?:(?:un)?signed\s+)?
+ (?:
+ char |
+ short |
+ int |
+ long |
+ long\s+int |
+ long\s+long
+ ) |
+ ) |
+ float |
+ double |
+ {IDENTIFIER} |
+ (?:struct|union)\s+{IDENTIFIER}
+ )'''
+
+POINTER = rf'''(?:
+ (?:\s+const)?\s*[*]
+ )'''
+
+#STRUCT = r'''(?:
+# (?:struct|(struct\s+%s))\s*[{]
+# [^}]*
+# [}]
+# )''' % (IDENTIFIER)
+#UNION = r'''(?:
+# (?:union|(union\s+%s))\s*[{]
+# [^}]*
+# [}]
+# )''' % (IDENTIFIER)
+#DECL_SPEC = rf'''(?:
+# ({VAR_TYPE_SPEC}) |
+# ({STRUCT}) |
+# ({UNION})
+# )'''
+
+FUNC_START = rf'''(?:
+ (?:
+ (?:
+ extern |
+ static |
+ static\s+inline
+ )\s+
+ )?
+ #(?:const\s+)?
+ {VAR_TYPE_SPEC}
+ )'''
+#GLOBAL_VAR_START = rf'''(?:
+# (?:
+# (?:
+# extern |
+# static
+# )\s+
+# )?
+# (?:
+# {TYPE_QUAL}
+# (?:\s+{TYPE_QUAL})?
+# )?\s+
+# {VAR_TYPE_SPEC}
+# )'''
+GLOBAL_DECL_START_RE = re.compile(rf'''
+ ^
+ (?:
+ ({FUNC_START})
+ )
+ ''', re.VERBOSE)
+
+LOCAL_VAR_START = rf'''(?:
+ (?:
+ (?:
+ register |
+ static
+ )\s+
+ )?
+ (?:
+ (?:
+ {TYPE_QUAL}
+ (?:\s+{TYPE_QUAL})?
+ )\s+
+ )?
+ {VAR_TYPE_SPEC}
+ {POINTER}?
+ )'''
+LOCAL_STMT_START_RE = re.compile(rf'''
+ ^
+ (?:
+ ({LOCAL_VAR_START})
+ )
+ ''', re.VERBOSE)
+
+
+def iter_global_declarations(lines):
+ """Yield (decl, body) for each global declaration in the given lines.
+
+ For function definitions the header is reduced to one line and
+ the body is provided as-is. For other compound declarations (e.g.
+ struct) the entire declaration is reduced to one line and "body"
+ is None. Likewise for simple declarations (e.g. variables).
+
+ Declarations inside function bodies are ignored, though their text
+ is provided in the function body.
+ """
+ # XXX Bail out upon bogus syntax.
+ lines = source.iter_clean_lines(lines)
+ for line in lines:
+ if not GLOBAL_DECL_START_RE.match(line):
+ continue
+ # We only need functions here, since we only need locals for now.
+ if line.endswith(';'):
+ continue
+ if line.endswith('{') and '(' not in line:
+ continue
+
+ # Capture the function.
+ # (assume no func is a one-liner)
+ decl = line
+ while '{' not in line: # assume no inline structs, etc.
+ try:
+ line = next(lines)
+ except StopIteration:
+ return
+ decl += ' ' + line
+
+ body, end = _extract_block(lines)
+ if end is None:
+ return
+ assert end == '}'
+ yield (f'{decl}\n{body}\n{end}', body)
+
+
+def iter_local_statements(lines):
+    """Yield (lines, blocks) for each statement in the given lines.
+
+    For simple statements, "blocks" is None and the statement is reduced
+    to a single line. For compound statements, "blocks" is a pair of
+    (header, body) for each block in the statement. The headers are
+    reduced to a single line each, but the bodies are provided as-is.
+    """
+    # XXX Bail out upon bogus syntax.
+    lines = source.iter_clean_lines(lines)
+    for line in lines:
+        if not LOCAL_STMT_START_RE.match(line):
+            continue
+
+        stmt = line
+        blocks = None
+        if not line.endswith(';'):
+            # Compound and multiline simple statements are skipped
+            # entirely for now, so "blocks" is always None below.
+            # XXX Support compound & multiline simple statements.
+            #blocks = []
+            continue
+
+        yield (stmt, blocks)
+
+
+def _extract_block(lines):
+ end = None
+ depth = 1
+ body = []
+ for line in lines:
+ depth += line.count('{') - line.count('}')
+ if depth == 0:
+ end = line
+ break
+ body.append(line)
+ return '\n'.join(body), end
+
+
+def parse_func(stmt, body):
+ """Return (name, signature) for the given function definition."""
+ header, _, end = stmt.partition(body)
+ assert end.strip() == '}'
+ assert header.strip().endswith('{')
+ header, _, _= header.rpartition('{')
+
+ signature = ' '.join(header.strip().splitlines())
+
+ _, _, name = signature.split('(')[0].strip().rpartition(' ')
+ assert name
+
+ return name, signature
+
+
+#TYPE_SPEC = rf'''(?:
+# )'''
+#VAR_DECLARATOR = rf'''(?:
+# )'''
+#VAR_DECL = rf'''(?:
+# {TYPE_SPEC}+
+# {VAR_DECLARATOR}
+# \s*
+# )'''
+#VAR_DECLARATION = rf'''(?:
+# {VAR_DECL}
+# (?: = [^=] [^;]* )?
+# ;
+# )'''
+#
+#
+#def parse_variable(decl, *, inFunc=False):
+# """Return [(name, storage, vartype)] for the given variable declaration."""
+# ...
+
+
+def _parse_var(stmt):
+ """Return (name, vartype) for the given variable declaration."""
+ stmt = stmt.rstrip(';')
+ m = LOCAL_STMT_START_RE.match(stmt)
+ assert m
+ vartype = m.group(0)
+ name = stmt[len(vartype):].partition('=')[0].strip()
+
+ if name.startswith('('):
+ name, _, after = name[1:].partition(')')
+ assert after
+ name = name.replace('*', '* ')
+ inside, _, name = name.strip().rpartition(' ')
+ vartype = f'{vartype} ({inside.strip()}){after}'
+ else:
+ name = name.replace('*', '* ')
+ before, _, name = name.rpartition(' ')
+ vartype = f'{vartype} {before}'
+
+ vartype = vartype.strip()
+ while ' ' in vartype:
+ vartype = vartype.replace(' ', ' ')
+
+ return name, vartype
+
+
+def extract_storage(decl, *, infunc=None):
+    """Return the storage class for the given declaration.
+
+    The result is one of "static", "extern", "local", or "implicit"
+    (or the UNKNOWN marker, passed through unchanged).  The default
+    storage is "implicit" (or "local" if infunc is true).
+    """
+    if decl == UNKNOWN:
+        return decl
+    if decl.startswith('static '):
+        return 'static'
+        #return 'static', decl.partition(' ')[2].strip()
+    elif decl.startswith('extern '):
+        return 'extern'
+        #return 'extern', decl.partition(' ')[2].strip()
+    # A raw string is required here: in a plain string "\b" is the
+    # backspace character, not the regex word boundary, which made
+    # this branch unreachable.
+    elif re.match(r'.*\b(static|extern)\b', decl):
+        # The keyword is buried inside the declaration (e.g. after
+        # a qualifier), which we do not handle yet.
+        raise NotImplementedError
+    elif infunc:
+        return 'local'
+    else:
+        return 'implicit'
+
+
+def parse_compound(stmt, blocks):
+ """Return (headers, bodies) for the given compound statement."""
+ # XXX Identify declarations inside compound statements
+ # (if/switch/for/while).
+ raise NotImplementedError
+
+
+def iter_variables(filename, *,
+ preprocessed=False,
+ _iter_source_lines=source.iter_lines,
+ _iter_global=iter_global_declarations,
+ _iter_local=iter_local_statements,
+ _parse_func=parse_func,
+ _parse_var=_parse_var,
+ _parse_compound=parse_compound,
+ ):
+ """Yield (funcname, name, vartype) for every variable in the given file."""
+ if preprocessed:
+ raise NotImplementedError
+ lines = _iter_source_lines(filename)
+ for stmt, body in _iter_global(lines):
+ # At the file top-level we only have to worry about vars & funcs.
+ if not body:
+ name, vartype = _parse_var(stmt)
+ if name:
+ yield (None, name, vartype)
+ else:
+ funcname, _ = _parse_func(stmt, body)
+ localvars = _iter_locals(body,
+ _iter_statements=_iter_local,
+ _parse_var=_parse_var,
+ _parse_compound=_parse_compound,
+ )
+ for name, vartype in localvars:
+ yield (funcname, name, vartype)
+
+
+def _iter_locals(lines, *,
+ _iter_statements=iter_local_statements,
+ _parse_var=_parse_var,
+ _parse_compound=parse_compound,
+ ):
+ compound = [lines]
+ while compound:
+ body = compound.pop(0)
+ bodylines = body.splitlines()
+ for stmt, blocks in _iter_statements(bodylines):
+ if not blocks:
+ name, vartype = _parse_var(stmt)
+ if name:
+ yield (name, vartype)
+ else:
+ headers, bodies = _parse_compound(stmt, blocks)
+ for header in headers:
+ for line in header:
+ name, vartype = _parse_var(line)
+ if name:
+ yield (name, vartype)
+ compound.extend(bodies)
+
+
+def iter_all(filename, *,
+ preprocessed=False,
+ ):
+ """Yield a Declaration for each one found.
+
+ If there are duplicates, due to preprocessor conditionals, then
+ they are checked to make sure they are the same.
+ """
+ # XXX For the moment we cheat.
+ for funcname, name, decl in iter_variables(filename,
+ preprocessed=preprocessed):
+ yield 'variable', funcname, name, decl
diff --git a/Tools/c-analyzer/c_analyzer/parser/find.py b/Tools/c-analyzer/c_analyzer/parser/find.py
new file mode 100644
index 0000000..3860d3d
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/find.py
@@ -0,0 +1,107 @@
+from ..common.info import UNKNOWN, ID
+
+from . import declarations
+
+# XXX need tests:
+# * variables
+# * variable
+# * variable_from_id
+
+
+def _iter_vars(filenames, preprocessed, *,
+ handle_id=None,
+ _iter_decls=declarations.iter_all,
+ ):
+ if handle_id is None:
+ handle_id = ID
+
+ for filename in filenames or ():
+ for kind, funcname, name, decl in _iter_decls(filename,
+ preprocessed=preprocessed,
+ ):
+ if kind != 'variable':
+ continue
+ varid = handle_id(filename, funcname, name)
+ yield varid, decl
+
+
+# XXX Add a "handle_var" arg like we did for get_resolver()?
+
+def variables(*filenames,
+              perfilecache=None,
+              preprocessed=False,
+              known=None,  # for types
+              handle_id=None,
+              _iter_vars=_iter_vars,
+              ):
+    """Yield (varid, decl) for each variable found in the given files.
+
+    If "preprocessed" is provided (and not False/None) then it is used
+    to decide which tool to use to parse the source code after it runs
+    through the C preprocessor. Otherwise the raw source is parsed.
+    """
+    # A single non-string argument is treated as an iterable of
+    # filenames.  (The original check was missing "isinstance", so a
+    # 2-tuple was always truthy and the unwrap branch never ran.)
+    if len(filenames) == 1 and not isinstance(filenames[0], str):
+        filenames, = filenames
+
+    if perfilecache is None:
+        # Forward "handle_id" so a caller-provided ID factory is honored.
+        yield from _iter_vars(filenames, preprocessed, handle_id=handle_id)
+    else:
+        # XXX Cache per-file variables (e.g. `{filename: [(varid, decl)]}`).
+        raise NotImplementedError
+
+
+def variable(name, filenames, *,
+ local=False,
+ perfilecache=None,
+ preprocessed=False,
+ handle_id=None,
+ _iter_vars=variables,
+ ):
+ """Return (varid, decl) for the first found variable that matches.
+
+ If "local" is True then the first matching local variable in the
+ file will always be returned. To avoid that, pass perfilecache and
+ pop each variable from the cache after using it.
+ """
+ for varid, decl in _iter_vars(filenames,
+ perfilecache=perfilecache,
+ preprocessed=preprocessed,
+ ):
+ if varid.name != name:
+ continue
+ if local:
+ if varid.funcname:
+ if varid.funcname == UNKNOWN:
+ raise NotImplementedError
+ return varid, decl
+ elif not varid.funcname:
+ return varid, decl
+ else:
+ return None, None # No matching variable was found.
+
+
+def variable_from_id(id, filenames, *,
+ perfilecache=None,
+ preprocessed=False,
+ handle_id=None,
+ _get_var=variable,
+ ):
+ """Return (varid, decl) for the first found variable that matches."""
+ local = False
+ if isinstance(id, str):
+ name = id
+ else:
+ if id.funcname == UNKNOWN:
+ local = True
+ elif id.funcname:
+ raise NotImplementedError
+
+ name = id.name
+ if id.filename and id.filename != UNKNOWN:
+ filenames = [id.filename]
+ return _get_var(name, filenames,
+ local=local,
+ perfilecache=perfilecache,
+ preprocessed=preprocessed,
+ handle_id=handle_id,
+ )
diff --git a/Tools/c-analyzer/c_analyzer/parser/naive.py b/Tools/c-analyzer/c_analyzer/parser/naive.py
new file mode 100644
index 0000000..4a4822d
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/naive.py
@@ -0,0 +1,179 @@
+import re
+
+from ..common.info import UNKNOWN, ID
+
+from .preprocessor import _iter_clean_lines
+
+
+_NOT_SET = object()
+
+
+def get_srclines(filename, *,
+ cache=None,
+ _open=open,
+ _iter_lines=_iter_clean_lines,
+ ):
+ """Return the file's lines as a list.
+
+ Each line will have trailing whitespace removed (including newline).
+
+ If a cache is given the it is used.
+ """
+ if cache is not None:
+ try:
+ return cache[filename]
+ except KeyError:
+ pass
+
+ with _open(filename) as srcfile:
+ srclines = [line
+ for _, line in _iter_lines(srcfile)
+ if not line.startswith('#')]
+ for i, line in enumerate(srclines):
+ srclines[i] = line.rstrip()
+
+ if cache is not None:
+ cache[filename] = srclines
+ return srclines
+
+
+def parse_variable_declaration(srcline):
+ """Return (name, decl) for the given declaration line."""
+ # XXX possible false negatives...
+ decl, sep, _ = srcline.partition('=')
+ if not sep:
+ if not srcline.endswith(';'):
+ return None, None
+ decl = decl.strip(';')
+ decl = decl.strip()
+ m = re.match(r'.*\b(\w+)\s*(?:\[[^\]]*\])?$', decl)
+ if not m:
+ return None, None
+ name = m.group(1)
+ return name, decl
+
+
+def parse_variable(srcline, funcname=None):
+ """Return (varid, decl) for the variable declared on the line (or None)."""
+ line = srcline.strip()
+
+ # XXX Handle more than just static variables.
+ if line.startswith('static '):
+ if '(' in line and '[' not in line:
+ # a function
+ return None, None
+ return parse_variable_declaration(line)
+ else:
+ return None, None
+
+
+def iter_variables(filename, *,
+ srccache=None,
+ parse_variable=None,
+ _get_srclines=get_srclines,
+ _default_parse_variable=parse_variable,
+ ):
+ """Yield (varid, decl) for each variable in the given source file."""
+ if parse_variable is None:
+ parse_variable = _default_parse_variable
+
+ indent = ''
+ prev = ''
+ funcname = None
+ for line in _get_srclines(filename, cache=srccache):
+ # remember current funcname
+ if funcname:
+ if line == indent + '}':
+ funcname = None
+ continue
+ else:
+ if '(' in prev and line == indent + '{':
+ if not prev.startswith('__attribute__'):
+ funcname = prev.split('(')[0].split()[-1]
+ prev = ''
+ continue
+ indent = line[:-len(line.lstrip())]
+ prev = line
+
+ info = parse_variable(line, funcname)
+ if isinstance(info, list):
+ for name, _funcname, decl in info:
+ yield ID(filename, _funcname, name), decl
+ continue
+ name, decl = info
+
+ if name is None:
+ continue
+ yield ID(filename, funcname, name), decl
+
+
+def _match_varid(variable, name, funcname, ignored=None):
+ if ignored and variable in ignored:
+ return False
+
+ if variable.name != name:
+ return False
+
+ if funcname == UNKNOWN:
+ if not variable.funcname:
+ return False
+ elif variable.funcname != funcname:
+ return False
+
+ return True
+
+
+def find_variable(filename, funcname, name, *,
+ ignored=None,
+ srccache=None, # {filename: lines}
+ parse_variable=None,
+ _iter_variables=iter_variables,
+ ):
+ """Return the matching variable.
+
+ Return None if the variable is not found.
+ """
+ for varid, decl in _iter_variables(filename,
+ srccache=srccache,
+ parse_variable=parse_variable,
+ ):
+ if _match_varid(varid, name, funcname, ignored):
+ return varid, decl
+ else:
+ return None
+
+
+def find_variables(varids, filenames=None, *,
+                   srccache=_NOT_SET,
+                   parse_variable=None,
+                   _find_symbol=find_variable,
+                   ):
+    """Yield (varid, decl) for each ID.
+
+    If the variable is not found then its decl will be UNKNOWN.  That
+    way there will be one resulting variable per given ID.
+    """
+    if srccache is _NOT_SET:
+        # Share one fresh cache across all lookups in this call.
+        srccache = {}
+
+    used = set()
+    for varid in varids:
+        if varid.filename and varid.filename != UNKNOWN:
+            srcfiles = [varid.filename]
+        else:
+            if not filenames:
+                yield varid, UNKNOWN
+                continue
+            srcfiles = filenames
+        for filename in srcfiles:
+            # "_find_symbol" (find_variable) returns None when there is
+            # no match, so guard before unpacking.  (The original called
+            # an undefined "_find_varid" and unpacked unconditionally.)
+            found = _find_symbol(filename, varid.funcname, varid.name,
+                                 ignored=used,
+                                 srccache=srccache,
+                                 parse_variable=parse_variable,
+                                 )
+            if found:
+                varid, decl = found
+                yield varid, decl
+                used.add(varid)
+                break
+        else:
+            yield varid, UNKNOWN
diff --git a/Tools/c-analyzer/c_analyzer/parser/preprocessor.py b/Tools/c-analyzer/c_analyzer/parser/preprocessor.py
new file mode 100644
index 0000000..41f306e
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/preprocessor.py
@@ -0,0 +1,511 @@
+from collections import namedtuple
+import shlex
+import os
+import re
+
+from ..common import util, info
+
+
# A C line continuation: a backslash followed by the platform line
# ending.  NOTE(review): lines produced by str.splitlines() do not
# keep their line endings, so continuation handling in
# _iter_clean_lines() only triggers when callers pass lines that still
# end with os.linesep — confirm upstream callers.
CONTINUATION = '\\' + os.linesep

# A C identifier: word characters containing at least one ASCII letter.
IDENTIFIER = r'(?:\w*[a-zA-Z]\w*)'
# Matches a string that is exactly one identifier (nothing else).
IDENTIFIER_RE = re.compile('^' + IDENTIFIER + '$')
+
+
+def _coerce_str(value):
+ if not value:
+ return ''
+ return str(value).strip()
+
+
+#############################
+# directives
+
# The leading "#" of a directive, optionally preceded by whitespace.
DIRECTIVE_START = r'''
    (?:
      ^ \s*
      [#] \s*
    )'''
# The directive's argument text (captured), up to end of line.
DIRECTIVE_TEXT = r'''
    (?:
      (?: \s+ ( .*\S ) )?
      \s* $
    )'''
# A whole directive: "#" + kind (captured) + text (captured).
# Bug fix: the kind alternation spelled "__DATE __" with an embedded
# space.  Under re.VERBOSE that whitespace is ignored so the pattern
# still matched "__DATE__", but the spelling was misleading and
# inconsistent with PreprocessorDirective.KINDS; it is corrected here.
DIRECTIVE = rf'''
    (?:
      {DIRECTIVE_START}
      (
        include |
        error | warning |
        pragma |
        define | undef |
        if | ifdef | ifndef | elseif | else | endif |
        __FILE__ | __LINE__ | __DATE__ | __TIME__ | __TIMESTAMP__
      )
      {DIRECTIVE_TEXT}
    )'''
DIRECTIVE_RE = re.compile(DIRECTIVE, re.VERBOSE)

# A "#define", capturing the name, the optional macro parameter list
# (without the parentheses), and the replacement text.
DEFINE = rf'''
    (?:
      {DIRECTIVE_START} define \s+
      (?:
        ( \w*[a-zA-Z]\w* )
        (?: \s* [(] ([^)]*) [)] )?
      )
      {DIRECTIVE_TEXT}
    )'''
DEFINE_RE = re.compile(DEFINE, re.VERBOSE)
+
+
def parse_directive(line):
    """Return the appropriate directive for the given line.

    Raises ValueError for unsupported or malformed directives.
    """
    line = line.strip()
    if line.startswith('#'):
        # Normalize to a single "#" immediately followed by the keyword.
        line = '#' + line[1:].lstrip()
    directive = line
    # Collapse runs of spaces so the directive regexes see a canonical
    # form.  Bug fix: as previously written the loop tested for a
    # single space and replaced a space with a space, which never
    # terminates once any space is present; it must collapse double
    # spaces into single ones.
    while '  ' in directive:
        directive = directive.replace('  ', ' ')
    return _parse_directive(directive)
+
+
def _parse_directive(line):
    """Convert a normalized directive line into a directive object.

    Raises ValueError when the directive is unsupported or its text
    is missing/unexpected.
    """
    # "#define" has its own pattern (name + optional args + body).
    define = DEFINE_RE.match(line)
    if define:
        name, args, text = define.groups()
        if args:
            return Macro(name, [a.strip() for a in args.split(',')], text)
        return Constant(name, text)

    matched = DIRECTIVE_RE.match(line)
    if matched is None:
        raise ValueError(f'unsupported directive {line!r}')
    kind, text = matched.groups()

    # Only "else" and "endif" may (and must) have no text.
    if not text:
        if kind not in ('else', 'endif'):
            raise ValueError(f'missing text in directive {line!r}')
    elif kind in ('else', 'endif', 'define'):
        raise ValueError(f'unexpected text in directive {line!r}')

    if kind == 'include':
        directive = Include(text)
    elif kind in IfDirective.KINDS:
        directive = IfDirective(kind, text)
    else:
        directive = OtherDirective(kind, text)
    directive.validate()
    return directive
+
+
class PreprocessorDirective(util._NTBase):
    """The base class for preprocessor directives.

    Concrete directives are namedtuples whose first field is always
    "kind"; the remaining fields hold the directive's parsed parts.
    """

    __slots__ = ()

    # Every directive keyword the parser recognizes (see DIRECTIVE).
    KINDS = frozenset([
        'include',
        'pragma',
        'error', 'warning',
        'define', 'undef',
        'if', 'ifdef', 'ifndef', 'elseif', 'else', 'endif',
        '__FILE__', '__DATE__', '__LINE__', '__TIME__', '__TIMESTAMP__',
        ])

    @property
    def text(self):
        # Everything after "kind", joined; None when there is nothing.
        return ' '.join(v for v in self[1:] if v and v.strip()) or None

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        if not self.kind:
            raise TypeError('missing kind')
        elif self.kind not in self.KINDS:
            # NOTE(review): raised without a message, unlike the other
            # validate() errors in this module.
            raise ValueError

        # text can be anything, including None.
+
+
class Constant(PreprocessorDirective,
               namedtuple('Constant', 'kind name value')):
    """An object-like "#define" (one with no parameter list)."""

    __slots__ = ()

    def __new__(cls, name, value=None):
        # "kind" is fixed; name/value are normalized to str-or-None.
        return super().__new__(
                cls,
                'define',
                name=_coerce_str(name) or None,
                value=_coerce_str(value) or None,
                )

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        if not self.name:
            raise TypeError('missing name')
        if not IDENTIFIER_RE.match(self.name):
            raise ValueError(f'name must be identifier, got {self.name!r}')

        # Any value is acceptable, including None.
+
+
class Macro(PreprocessorDirective,
            namedtuple('Macro', 'kind name args body')):
    """A function-like "#define" (one with a parameter list)."""

    __slots__ = ()

    def __new__(cls, name, args, body=None):
        # "args" may be a comma-separated string or an iterable of
        # names (or "empty", meaning no parameters at all).
        if isinstance(args, str):
            args = [v.strip() for v in args.split(',')]
        normalized = tuple(_coerce_str(a) or None for a in args) if args else ()
        return super().__new__(
                cls,
                kind='define',
                name=_coerce_str(name) or None,
                args=normalized,
                body=_coerce_str(body) or None,
                )

    @property
    def text(self):
        # Reconstruct the "NAME(a, b) body" form of the directive.
        signature = f'{self.name}({", ".join(self.args)})'
        if not self.body:
            return signature
        return f'{signature} {self.body}'

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        if not self.name:
            raise TypeError('missing name')
        if not IDENTIFIER_RE.match(self.name):
            raise ValueError(f'name must be identifier, got {self.name!r}')

        for arg in self.args:
            if not arg:
                raise ValueError(f'missing arg in {self.args}')
            if not IDENTIFIER_RE.match(arg):
                raise ValueError(f'arg must be identifier, got {arg!r}')

        # Any body is acceptable, including None.
+
+
class IfDirective(PreprocessorDirective,
                  namedtuple('IfDirective', 'kind condition')):
    """A conditional directive that carries a condition.

    This covers "if", "ifdef", "ifndef", and "elseif".  The related
    "else" and "endif" directives have no condition of their own and
    are handled by OtherDirective instead.
    """

    __slots__ = ()

    KINDS = frozenset([
            'if',
            'ifdef',
            'ifndef',
            'elseif',
            ])

    @classmethod
    def _condition_from_raw(cls, raw, kind):
        # Normalize "ifdef X"/"ifndef X" into explicit defined() exprs.
        condition = _coerce_str(raw)
        if not condition:
            return None
        if kind == 'ifdef':
            return f'defined({condition})'
        if kind == 'ifndef':
            return f'! defined({condition})'
        return condition

    def __new__(cls, kind, condition):
        kind = _coerce_str(kind)
        return super().__new__(
                cls,
                kind=kind or None,
                condition=cls._condition_from_raw(condition, kind),
                )

    @property
    def text(self):
        # Undo the normalization done in _condition_from_raw().
        if self.kind == 'ifdef':
            return self.condition[8:-1]    # strip "defined(" and ")"
        if self.kind == 'ifndef':
            return self.condition[10:-1]   # strip "! defined(" and ")"
        return self.condition

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        if not self.condition:
            raise TypeError('missing condition')
+
+
class Include(PreprocessorDirective,
              namedtuple('Include', 'kind file')):
    """An "#include" directive.

    The "file" value keeps its delimiters, either the bracket style
    (<stdio.h>) or double quotes ("spam.h").
    """

    __slots__ = ()

    def __new__(cls, file):
        return super().__new__(
                cls,
                kind='include',
                file=_coerce_str(file) or None,
                )

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        if not self.file:
            raise TypeError('missing file')
+
+
class OtherDirective(PreprocessorDirective,
                     namedtuple('OtherDirective', 'kind text')):
    """Any directive not covered by a more specific class.

    This includes "else", "endif", "undef", "pragma", "error", and
    "warning".  All of them require a text value except "else" and
    "endif", which must have none.
    """

    __slots__ = ()

    KINDS = PreprocessorDirective.KINDS - {'include', 'define'} - IfDirective.KINDS

    def __new__(cls, kind, text):
        return super().__new__(
                cls,
                kind=_coerce_str(kind) or None,
                text=_coerce_str(text) or None,
                )

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        super().validate()

        bare = self.kind in ('else', 'endif')
        if self.text:
            if bare:
                raise ValueError('unexpected text in directive')
        elif not bare:
            raise TypeError('missing text')
+
+
+#############################
+# iterating lines
+
+def _recompute_conditions(directive, ifstack):
+ if directive.kind in ('if', 'ifdef', 'ifndef'):
+ ifstack.append(
+ ([], directive.condition))
+ elif directive.kind == 'elseif':
+ if ifstack:
+ negated, active = ifstack.pop()
+ if active:
+ negated.append(active)
+ else:
+ negated = []
+ ifstack.append(
+ (negated, directive.condition))
+ elif directive.kind == 'else':
+ if ifstack:
+ negated, active = ifstack.pop()
+ if active:
+ negated.append(active)
+ ifstack.append(
+ (negated, None))
+ elif directive.kind == 'endif':
+ if ifstack:
+ ifstack.pop()
+
+ conditions = []
+ for negated, active in ifstack:
+ for condition in negated:
+ conditions.append(f'! ({condition})')
+ if active:
+ conditions.append(active)
+ return tuple(conditions)
+
+
def _iter_clean_lines(lines):
    """Yield (lno, line) with continuations joined and comments removed.

    "lno" is the 1-indexed number of the *last* physical line that
    contributed to the yielded line.  Each comment (// or /*...*/,
    including multi-line ones) is replaced by a single space, per the
    C99 spec.  Raises for a /* comment that never terminates.
    """
    lines = iter(enumerate(lines, 1))
    for lno, line in lines:
        # Handle line continuations.
        # NOTE(review): CONTINUATION ends with os.linesep, so this only
        # fires if the incoming lines retain their line endings
        # (str.splitlines() strips them) — confirm upstream callers.
        while line.endswith(CONTINUATION):
            try:
                lno, _line = next(lines)
            except StopIteration:
                break
            line = line[:-len(CONTINUATION)] + ' ' + _line

        # Deal with comments: rebuild "line" from the comment-free
        # pieces of "after".
        after = line
        line = ''
        while True:
            # Look for a comment.
            before, begin, remainder = after.partition('/*')
            if '//' in before:
                # A // before any /* comments out the rest of the line.
                before, _, _ = before.partition('//')
                line += before + ' '  # per the C99 spec
                break
            line += before
            if not begin:
                break
            line += ' '  # per the C99 spec

            # Go until we find the end of the comment.
            _, end, after = remainder.partition('*/')
            while not end:
                try:
                    lno, remainder = next(lines)
                except StopIteration:
                    raise Exception('unterminated comment')
                _, end, after = remainder.partition('*/')

        yield lno, line
+
+
def iter_lines(lines, *,
               _iter_clean_lines=_iter_clean_lines,
               _parse_directive=_parse_directive,
               _recompute_conditions=_recompute_conditions,
               ):
    """Yield (lno, line, directive, active conditions) for each given line.

    This is effectively a subset of the operations taking place in
    translation phases 2-4 from the C99 spec (ISO/IEC 9899:TC2); see
    section 5.1.1.2.  Line continuations are removed and comments
    replaced with a single space.  (In both cases "lno" will be the
    last line involved.)  Otherwise each line is returned as-is.

    "lno" is the (1-indexed) line number for the line.

    "directive" will be a PreprocessorDirective or None, depending on
    whether or not there is a directive on the line.

    "active conditions" is the tuple of preprocessor conditions (e.g.
    "defined()") under which the current line of code will be included
    in compilation, derived from every conditional directive block
    containing that line (including nested ones).  Note that a
    directive on the current line does not affect the conditions for
    that line itself — only for subsequent lines.  That applies to
    block-closing directives ("endif") just as much as conditional
    ones, and "else"/"elseif" *replace* the enclosing block's active
    condition rather than adding to it.
    """
    ifstack = []
    conditions = ()
    for lno, line in _iter_clean_lines(lines):
        stripped = line.strip()
        if not stripped.startswith('#'):
            yield lno, line, None, conditions
            continue

        # Normalize the directive for parsing: a single "#" and single
        # spaces between tokens.  Bug fix: as previously written the
        # loop tested for a single space and replaced a space with a
        # space, which never terminates once any space is present; it
        # must collapse double spaces into single ones.
        directive = '#' + stripped[1:].lstrip()
        while '  ' in directive:
            directive = directive.replace('  ', ' ')
        directive = _parse_directive(directive)
        yield lno, line, directive, conditions

        # Conditional directives change the conditions for later lines.
        if directive.kind in ('else', 'endif'):
            conditions = _recompute_conditions(directive, ifstack)
        elif isinstance(directive, IfDirective):
            conditions = _recompute_conditions(directive, ifstack)
+
+
+#############################
+# running (platform-specific?)
+
def _gcc(filename, *,
         _get_argv=(lambda: _get_gcc_argv()),
         _run=util.run_cmd,
         ):
    """Return gcc's preprocessor (-E) output for the given file."""
    argv = _get_argv()
    argv += ['-E', filename]
    return _run(argv)
+
+
def _get_gcc_argv(*,
                  _open=open,
                  _run=util.run_cmd,
                  ):
    """Return the argv (compiler plus CFLAGS) to use for preprocessing.

    The values are pulled out of the project's Makefile by running
    "make" with a tiny helper makefile whose "print-X" target echoes
    the value of make variable X.
    """
    import tempfile
    # Security/robustness fix: previously this wrote to the fixed,
    # predictable path /tmp/print.mk, which another user could
    # pre-create or tamper with; use a securely created temp file and
    # clean it up afterwards.
    fd, makefile = tempfile.mkstemp(suffix='.mk', text=True)
    os.close(fd)
    try:
        with _open(makefile, 'w') as tmpfile:
            tmpfile.write('print-%:\n')
            #tmpfile.write('\t@echo $* = $($*)\n')
            tmpfile.write('\t@echo $($*)\n')
        argv = ['/usr/bin/make',
                '-f', 'Makefile',
                '-f', makefile,
                'print-CC',
                'print-PY_CORE_CFLAGS',
                ]
        output = _run(argv)
    finally:
        os.unlink(makefile)
    # One line per requested variable, in order.
    gcc, cflags = output.strip().splitlines()
    argv = shlex.split(gcc.strip())
    cflags = shlex.split(cflags.strip())
    return argv + cflags
+
+
def run(filename, *,
        _gcc=_gcc,
        ):
    """Return the text of the given file after running the preprocessor.

    Currently this always delegates to gcc (see _gcc()); other
    compilers/platforms are not supported yet.
    """
    return _gcc(filename)
diff --git a/Tools/c-analyzer/c_analyzer/parser/source.py b/Tools/c-analyzer/c_analyzer/parser/source.py
new file mode 100644
index 0000000..f8998c8
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/parser/source.py
@@ -0,0 +1,34 @@
+from . import preprocessor
+
+
def iter_clean_lines(lines):
    """Yield the non-blank lines with comments and outer whitespace removed.

    Both "//" (to end of line) and "/* ... */" comments (including
    multi-line ones) are dropped; an inline "/*...*/" is replaced so
    the code on either side is preserved.

    Bug fix: previously any code following the "*/" that closed a
    multi-line comment was discarded (the loop continued
    unconditionally); that trailing code is now processed normally.

    NOTE(review): only the first "/*...*/" pair on a line is handled;
    a second comment opening on the same line is left in place —
    confirm whether callers can encounter that.
    """
    incomment = False
    for line in lines:
        # Deal with comments.
        if incomment:
            # Look for the end of the multi-line comment.
            _, sep, line = line.partition('*/')
            if not sep:
                continue
            incomment = False
            # Fall through to process any code after the "*/".
        line, _, _ = line.partition('//')
        line, sep, remainder = line.partition('/*')
        if sep:
            _, sep, after = remainder.partition('*/')
            if not sep:
                incomment = True
                continue
            line += ' ' + after

        # Ignore blank lines and leading/trailing whitespace.
        line = line.strip()
        if not line:
            continue

        yield line
+
+
def iter_lines(filename, *,
               preprocess=preprocessor.run,
               ):
    """Return an iterator over the preprocessed lines of the file."""
    text = preprocess(filename)
    return iter(text.splitlines())
diff --git a/Tools/c-analyzer/c_analyzer/symbols/__init__.py b/Tools/c-analyzer/c_analyzer/symbols/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/symbols/__init__.py
diff --git a/Tools/c-analyzer/c_analyzer/symbols/_nm.py b/Tools/c-analyzer/c_analyzer/symbols/_nm.py
new file mode 100644
index 0000000..f3a75a6
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/symbols/_nm.py
@@ -0,0 +1,117 @@
+import os.path
+import shutil
+
+from c_analyzer.common import util, info
+
+from .info import Symbol
+
+
+# XXX need tests:
+# * iter_symbols
+
# Map an "nm" symbol-type code (lowercased first; uppercase means the
# symbol is external) to the kind of symbol it represents.  Codes
# missing from the map are treated as Symbol.KIND.OTHER (see
# _parse_nm_line()).
NM_KINDS = {
    'b': Symbol.KIND.VARIABLE,  # uninitialized
    'd': Symbol.KIND.VARIABLE,  # initialized
    #'g': Symbol.KIND.VARIABLE,  # uninitialized
    #'s': Symbol.KIND.VARIABLE,  # initialized
    't': Symbol.KIND.FUNCTION,
    }

# Toolchain/loader artifacts that show up in "nm" output but are not
# program variables; _is_special_symbol() filters these out (plus any
# "@@GLIBC"-versioned name).
SPECIAL_SYMBOLS = {
    # binary format (e.g. ELF)
    '__bss_start',
    '__data_start',
    '__dso_handle',
    '_DYNAMIC',
    '_edata',
    '_end',
    '__environ@@GLIBC_2.2.5',
    '_GLOBAL_OFFSET_TABLE_',
    '__JCR_END__',
    '__JCR_LIST__',
    '__TMC_END__',
    }
+
+
def _is_special_symbol(name):
    """Return True if *name* is a toolchain artifact, not program data."""
    return name in SPECIAL_SYMBOLS or '@@GLIBC' in name
+
+
def iter_symbols(binfile, *,
                 nm=None,
                 handle_id=None,
                 _which=shutil.which,
                 _run=util.run_cmd,
                 ):
    """Yield a Symbol for each relevant entry reported by the "nm" command.

    Only variable symbols are yielded; functions, other kinds, and
    special toolchain symbols are skipped.  "handle_id" is a callable
    (filename, funcname, name) -> ID, defaulting to info.ID.
    Raises NotImplementedError when no "nm" executable is available.
    """
    if nm is None:
        nm = _which('nm')
        if not nm:
            raise NotImplementedError
    if handle_id is None:
        handle_id = info.ID

    argv = [nm,
            '--line-numbers',
            binfile,
            ]
    try:
        output = _run(argv)
    except Exception:
        if nm is None:
            # NOTE(review): unreachable — "nm" cannot be None here; it
            # was either passed in or resolved (and checked) above.
            # XXX Use dumpbin.exe /SYMBOLS on Windows.
            raise NotImplementedError
        raise
    for line in output.splitlines():
        (name, kind, external, filename, funcname,
         ) = _parse_nm_line(line)
        if kind != Symbol.KIND.VARIABLE:
            continue
        elif _is_special_symbol(name):
            continue
        yield Symbol(
                id=handle_id(filename, funcname, name),
                kind=kind,
                external=external,
                )
+
+
def _parse_nm_line(line):
    """Split one line of "nm --line-numbers" output into its parts.

    Returns (name, kind, external, filename, funcname).  The expected
    format is "<address> <type-code> <name>\t<filename>:<lineno>";
    the filename part may be absent, in which case it is UNKNOWN.
    """
    _, _, rest = line.partition(' ')  # strip off the address
    rest = rest.strip()

    code, _, rest = rest.partition(' ')
    rest = rest.strip()
    # Uppercase type codes mean the symbol is external.
    external = code.isupper()
    kind = NM_KINDS.get(code.lower(), Symbol.KIND.OTHER)

    name, _, filename = rest.partition('\t')
    name = name.strip()
    if filename:
        filename = os.path.relpath(filename.partition(':')[0])
    else:
        filename = info.UNKNOWN

    name, islocal = _parse_nm_name(name, kind)
    funcname = info.UNKNOWN if islocal else None
    return name, kind, external, filename, funcname
+
+
def _parse_nm_name(name, kind):
    """Return (name, islocal) for a symbol name reported by "nm".

    "islocal" is None when the question does not apply (non-variable
    kinds and special symbols).  A variable named "<name>.<NNN>" is
    treated as a function-local static; the numeric suffix is dropped
    and islocal is True.
    """
    if kind != Symbol.KIND.VARIABLE:
        return name, None
    if _is_special_symbol(name):
        return name, None

    actual, sep, digits = name.partition('.')
    if not sep:
        return name, False
    if not digits.isdigit():
        raise Exception(f'got bogus name {name}')
    return actual, True
diff --git a/Tools/c-analyzer/c_analyzer/symbols/find.py b/Tools/c-analyzer/c_analyzer/symbols/find.py
new file mode 100644
index 0000000..8564652
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/symbols/find.py
@@ -0,0 +1,175 @@
+import os
+import os.path
+import shutil
+
+from ..common import files
+from ..common.info import UNKNOWN, ID
+from ..parser import find as p_find
+
+from . import _nm
+from .info import Symbol
+
+# XXX need tests:
+# * get_resolver()
+# * get_resolver_from_dirs()
+# * symbol()
+# * symbols()
+# * variables()
+
+
+def _resolve_known(symbol, knownvars):
+ for varid in knownvars:
+ if symbol.match(varid):
+ break
+ else:
+ return None
+ return knownvars.pop(varid)
+
+
def get_resolver(filenames=None, known=None, *,
                 handle_var,
                 check_filename=None,
                 perfilecache=None,
                 preprocessed=False,
                 _from_source=p_find.variable_from_id,
                 ):
    """Return a "resolver" func for the given known vars/types and filenames.

    "handle_var" is a callable that takes (ID, decl) and returns a
    Variable.  Variable.from_id is a suitable callable.

    The returned func takes a single Symbol and returns a corresponding
    Variable.  If the symbol was located then the variable will be
    valid, populated with the corresponding information.  Otherwise None
    is returned.
    """
    knownvars = (known or {}).get('variables')

    def from_source(symbol):
        # Fall back to parsing the variable out of the source files.
        varid, decl = _from_source(symbol, filenames,
                                   perfilecache=perfilecache,
                                   preprocessed=preprocessed,
                                   )
        return handle_var(varid, decl)

    if knownvars:
        # Work on a copy so matches can be popped without touching the
        # caller's mapping.
        knownvars = dict(knownvars)
        if not filenames:
            # Only the known variables are available.
            def resolve(symbol):
                return _resolve_known(symbol, knownvars)
            return resolve
        if check_filename is None:
            # Materialize so both the membership test and the source
            # lookup can reuse the same list.
            filenames = list(filenames)
            def check_filename(filename):
                return filename in filenames
        def resolve(symbol):
            # XXX Check "found" instead?
            if not check_filename(symbol.filename):
                return None
            found = _resolve_known(symbol, knownvars)
            if found is None:
                found = from_source(symbol)
            return found
        return resolve

    if filenames:
        return from_source

    def resolve(symbol):
        return None
    return resolve
+
+
def get_resolver_from_dirs(dirnames, known=None, *,
                           handle_var,
                           suffixes=('.c',),
                           perfilecache=None,
                           preprocessed=False,
                           _iter_files=files.iter_files_by_suffix,
                           _get_resolver=get_resolver,
                           ):
    """Return a "resolver" func for the known vars/types under the dirs.

    "dirnames" should be absolute paths.  If not then they will be
    resolved relative to CWD.

    See get_resolver().
    """
    # Normalize with a trailing separator so a plain prefix test
    # identifies containment.
    sep = os.path.sep
    dirnames = [d if d.endswith(sep) else d + sep
                for d in dirnames]
    filenames = _iter_files(dirnames, suffixes)

    def check_filename(filename):
        return any(filename.startswith(dirname) for dirname in dirnames)

    return _get_resolver(filenames, known,
                         handle_var=handle_var,
                         check_filename=check_filename,
                         perfilecache=perfilecache,
                         preprocessed=preprocessed,
                         )
+
+
def symbol(symbol, filenames, known=None, *,
           perfilecache=None,
           preprocessed=False,
           handle_id=None,
           _get_resolver=get_resolver,
           ):
    """Return a Variable for the one matching the given symbol.

    "symbol" can be one of several objects:

    * Symbol - use the contained info
    * name (str) - look for a global variable with that name
    * (filename, name) - look for named global in file
    * (filename, funcname, name) - look for named local in file

    A name is always required.  If the filename is None, "", or
    "UNKNOWN" then all files will be searched.  If the funcname is
    "" or "UNKNOWN" then only local variables will be searched for.
    """
    # Bug fix: get_resolver() takes (filenames, known) in that order
    # and a "handle_var" factory keyword; the original call passed the
    # positional args swapped and an unsupported "handle_id" keyword,
    # so every call raised TypeError.  The caller-supplied callable is
    # forwarded as the Variable factory.
    resolve = _get_resolver(filenames, known,
                            handle_var=handle_id,
                            perfilecache=perfilecache,
                            preprocessed=preprocessed,
                            )
    return resolve(symbol)
+
+
def _get_platform_tool():
    """Return a callable (binfile, handle_id) that yields symbols.

    Raises NotImplementedError on unsupported platforms (including
    Windows, for now) or when no "nm" executable can be found.
    """
    if os.name == 'nt':
        # XXX Support this.
        raise NotImplementedError
    nm = shutil.which('nm')
    if not nm:
        raise NotImplementedError

    def iter_symbols(binfile, handle_id):
        return _nm.iter_symbols(binfile, nm=nm, handle_id=handle_id)
    return iter_symbols
+
+
def symbols(binfile, *,
            handle_id=None,
            _file_exists=os.path.exists,
            _get_platform_tool=_get_platform_tool,
            ):
    """Yield a Symbol for each one found in the binary."""
    if not _file_exists(binfile):
        raise Exception('executable missing (need to build it first?)')

    iter_symbols = _get_platform_tool()
    yield from iter_symbols(binfile, handle_id)
+
+
def variables(binfile, *,
              resolve,
              handle_id=None,
              _iter_symbols=symbols,
              ):
    """Yield (Variable, Symbol) for each variable symbol in the binary.

    The Variable is None when the resolver cannot identify the symbol.
    """
    for found in _iter_symbols(binfile, handle_id=handle_id):
        if found.kind != Symbol.KIND.VARIABLE:
            continue
        yield (resolve(found) or None), found
diff --git a/Tools/c-analyzer/c_analyzer/symbols/info.py b/Tools/c-analyzer/c_analyzer/symbols/info.py
new file mode 100644
index 0000000..96a251a
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/symbols/info.py
@@ -0,0 +1,51 @@
+from collections import namedtuple
+
+from c_analyzer.common.info import ID
+from c_analyzer.common.util import classonly, _NTBase
+
+
class Symbol(_NTBase, namedtuple('Symbol', 'id kind external')):
    """Info for a single compilation symbol.

    "id" is the symbol's ID (filename, funcname, name).
    "kind" is one of the KIND values.
    "external" is whether the symbol is external (as reported by the
    symbols tool, e.g. an uppercase type code from "nm").
    """

    __slots__ = ()

    # The kinds of symbol this tool distinguishes.
    class KIND:
        VARIABLE = 'variable'
        FUNCTION = 'function'
        OTHER = 'other'

    @classonly
    def from_name(cls, name, filename=None, kind=KIND.VARIABLE, external=None):
        """Return a new symbol based on the given name."""
        id = ID(filename, None, name)
        return cls(id, kind, external)

    def __new__(cls, id, kind=KIND.VARIABLE, external=None):
        # Normalize: id becomes an ID, kind a str (or None), external
        # a bool (or None when unknown).
        self = super().__new__(
                cls,
                id=ID.from_raw(id),
                kind=str(kind) if kind else None,
                external=bool(external) if external is not None else None,
                )
        return self

    def __hash__(self):
        # Identity follows the ID alone.
        return hash(self.id)

    def __getattr__(self, name):
        # Expose the ID's fields (filename, funcname, name) directly.
        return getattr(self.id, name)

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        if not self.id:
            raise TypeError('missing id')
        else:
            self.id.validate()

        if not self.kind:
            raise TypeError('missing kind')
        elif self.kind not in vars(self.KIND).values():
            raise ValueError(f'unsupported kind {self.kind}')

        if self.external is None:
            raise TypeError('missing external')
diff --git a/Tools/c-analyzer/c_analyzer/variables/__init__.py b/Tools/c-analyzer/c_analyzer/variables/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/variables/__init__.py
diff --git a/Tools/c-analyzer/c_analyzer/variables/find.py b/Tools/c-analyzer/c_analyzer/variables/find.py
new file mode 100644
index 0000000..3fe7284
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/variables/find.py
@@ -0,0 +1,75 @@
+from ..common import files
+from ..common.info import UNKNOWN
+from ..parser import (
+ find as p_find,
+ )
+from ..symbols import (
+ info as s_info,
+ find as s_find,
+ )
+from .info import Variable
+
+# XXX need tests:
+# * vars_from_source
+
+
+def _remove_cached(cache, var):
+ if not cache:
+ return
+ try:
+ cached = cache[var.filename]
+ cached.remove(var)
+ except (KeyError, IndexError):
+ pass
+
+
def vars_from_binary(binfile, *,
                     known=None,
                     filenames=None,
                     handle_id=None,
                     check_filename=None,
                     handle_var=Variable.from_id,
                     _iter_vars=s_find.variables,
                     _get_symbol_resolver=s_find.get_resolver,
                     ):
    """Yield a Variable for each found Symbol.

    Details are filled in from the given "known" variables and types.
    Symbols that cannot be resolved still produce a Variable, with
    UNKNOWN storage and vartype.
    """
    cache = {}
    resolve = _get_symbol_resolver(filenames, known,
                                   handle_var=handle_var,
                                   check_filename=check_filename,
                                   perfilecache=cache,
                                   )
    for var, symbol in _iter_vars(binfile,
                                  resolve=resolve,
                                  handle_id=handle_id,
                                  ):
        if var is None:
            # Unresolved symbol: emit a placeholder so there is one
            # Variable per symbol.
            var = Variable(symbol.id, UNKNOWN, UNKNOWN)
        yield var
        # Drop the yielded var from the per-file cache (see
        # _remove_cached()).
        _remove_cached(cache, var)
+
+
def vars_from_source(filenames, *,
                     preprocessed=None,
                     known=None,
                     handle_id=None,
                     handle_var=Variable.from_id,
                     iter_vars=p_find.variables,
                     ):
    """Yield a Variable for each declaration in the raw source code.

    Details are filled in from the given "known" variables and types.
    """
    cache = {}
    found = iter_vars(filenames or (),
                      perfilecache=cache,
                      preprocessed=preprocessed,
                      known=known,
                      handle_id=handle_id,
                      )
    for varid, decl in found:
        var = handle_var(varid, decl)
        yield var
        # Drop the yielded var from the per-file cache (see
        # _remove_cached()).
        _remove_cached(cache, var)
diff --git a/Tools/c-analyzer/c_analyzer/variables/info.py b/Tools/c-analyzer/c_analyzer/variables/info.py
new file mode 100644
index 0000000..336a523
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/variables/info.py
@@ -0,0 +1,93 @@
+from collections import namedtuple
+
+from ..common.info import ID, UNKNOWN
+from ..common.util import classonly, _NTBase
+
+
def normalize_vartype(vartype):
    """Return the canonical form for a variable type (or func signature).

    None stays None; an empty string is allowed through for semantic
    reasons (it is not the same as "no vartype").
    """
    # XXX finish!
    # XXX Return (modifiers, type, pointer)?
    return None if vartype is None else str(vartype)
+
+
+# XXX Variable.vartype -> decl (Declaration).
+
class Variable(_NTBase,
               namedtuple('Variable', 'id storage vartype')):
    """Information about a single variable declaration.

    "id" is the variable's ID (filename, funcname, name).
    "storage" is one of the STORAGE values.
    "vartype" is the normalized declaration/type text.
    """

    __slots__ = ()

    # The supported storage classes.
    STORAGE = (
        'static',
        'extern',
        'implicit',
        'local',
        )

    @classonly
    def from_parts(cls, filename, funcname, name, decl, storage=None):
        """Return a Variable built from the separate ID parts.

        When "storage" is not given it is inferred from the decl.
        """
        varid = ID(filename, funcname, name)
        if storage is None:
            self = cls.from_id(varid, decl)
        else:
            self = cls(varid, storage, decl)
        return self

    @classonly
    def from_id(cls, varid, decl):
        """Return a Variable with storage extracted from the decl."""
        from ..parser.declarations import extract_storage
        storage = extract_storage(decl, infunc=varid.funcname)
        return cls(varid, storage, decl)

    def __new__(cls, id, storage, vartype):
        self = super().__new__(
                cls,
                id=ID.from_raw(id),
                storage=str(storage) if storage else None,
                vartype=normalize_vartype(vartype) if vartype else None,
                )
        return self

    def __hash__(self):
        # Identity follows the ID alone.
        return hash(self.id)

    def __getattr__(self, name):
        # Expose the ID's fields (filename, funcname, name) directly.
        return getattr(self.id, name)

    def _validate_id(self):
        # The ID must be complete: a real filename and, when there is
        # a funcname at all, a real one.
        if not self.id:
            raise TypeError('missing id')

        if not self.filename or self.filename == UNKNOWN:
            raise TypeError(f'id missing filename ({self.id})')

        if self.funcname and self.funcname == UNKNOWN:
            raise TypeError(f'id missing funcname ({self.id})')

        self.id.validate()

    def validate(self):
        """Fail if the object is invalid (i.e. init with bad data)."""
        self._validate_id()

        if self.storage is None or self.storage == UNKNOWN:
            raise TypeError('missing storage')
        elif self.storage not in self.STORAGE:
            # Bug fix: this used "{self.storage:r}", an invalid format
            # spec ("r" is a conversion, spelled "!r"), so reporting
            # the error raised "ValueError: Unknown format code 'r'"
            # instead of showing the unsupported value.
            raise ValueError(f'unsupported storage {self.storage!r}')

        if self.vartype is None or self.vartype == UNKNOWN:
            raise TypeError('missing vartype')

    @property
    def isglobal(self):
        # Everything except function-locals is "global".
        return self.storage != 'local'

    @property
    def isconst(self):
        return 'const' in self.vartype.split()
diff --git a/Tools/c-analyzer/c_analyzer/variables/known.py b/Tools/c-analyzer/c_analyzer/variables/known.py
new file mode 100644
index 0000000..aa2934a
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/variables/known.py
@@ -0,0 +1,91 @@
+import csv
+
+from ..common.info import ID, UNKNOWN
+from ..common.util import read_tsv
+from .info import Variable
+
+
+# XXX need tests:
+# * read_file()
+# * look_up_variable()
+
+
# The columns of the known-variables TSV data file, and the tab-joined
# header line used to validate it (passed to read_tsv()).
COLUMNS = ('filename', 'funcname', 'name', 'kind', 'declaration')
HEADER = '\t'.join(COLUMNS)
+
+
def read_file(infile, *,
              _read_tsv=read_tsv,
              ):
    """Yield (kind, id, decl) for each row in the data file.

    The caller is responsible for validating each row.
    """
    for filename, funcname, name, kind, declaration in _read_tsv(infile, HEADER):
        # "-" (or an empty funcname) marks a global variable.
        if not funcname or funcname == '-':
            funcname = None
        yield kind, ID(filename, funcname, name), declaration
+
+
def from_file(infile, *,
              handle_var=Variable.from_id,
              _read_file=read_file,
              ):
    """Return the info for known declarations in the given file.

    The result maps each supported kind (currently only "variables")
    to a {id: value} mapping; every value is validated.
    """
    known = {
        'variables': {},
        #'types': {},
        #'constants': {},
        #'macros': {},
        }
    for kind, id, decl in _read_file(infile):
        if kind == 'variable':
            values = known['variables']
            value = handle_var(id, decl)
        else:
            # Bug fix: the message referenced "row", which is not
            # defined in this scope, so hitting this branch raised
            # NameError instead of the intended ValueError.
            raise ValueError(f'unsupported kind {kind!r} in row for {id}')
        value.validate()
        values[id] = value
    return known
+
+
def look_up_variable(varid, knownvars, *,
                     match_files=(lambda f1, f2: f1 == f2),
                     ):
    """Return the known Variable matching the given ID.

    "knownvars" is a mapping of ID to Variable.

    "match_files" is used to verify if two filenames point to
    the same file.

    If no match is found then None is returned.
    """
    if not knownvars:
        return None

    if varid.funcname == UNKNOWN:
        # The requested variable is a function-local, but we don't
        # know which function.  Search the known locals by name (and
        # filename, when we have one).
        #
        # Bug fix: the original loops re-used the name "varid" for the
        # loop variable, shadowing the argument.  As a result
        # "varid.name == varid.name" compared each candidate with
        # itself (always True) and match_files() was called with the
        # same filename twice, so the first known local was always
        # returned regardless of the requested name/file.
        if not varid.filename or varid.filename == UNKNOWN:
            for candidate in knownvars:
                if not candidate.funcname:
                    continue
                if candidate.name == varid.name:
                    return knownvars[candidate]
            return None
        else:
            for candidate in knownvars:
                if not candidate.funcname:
                    continue
                if not match_files(candidate.filename, varid.filename):
                    continue
                if candidate.name == varid.name:
                    return knownvars[candidate]
            return None
    elif not varid.filename or varid.filename == UNKNOWN:
        raise NotImplementedError
    else:
        # NOTE(review): "varid.id" only exists when callers pass a
        # Variable (whose __getattr__ exposes the contained ID); a
        # bare ID has no "id" attribute and would raise AttributeError
        # here — confirm what callers actually pass.
        return knownvars.get(varid.id)