Diffstat (limited to 'Parser/pgen')
-rw-r--r--    Parser/pgen/__init__.py      0
-rw-r--r--    Parser/pgen/__main__.py     34
-rw-r--r--    Parser/pgen/grammar.py     160
-rw-r--r--    Parser/pgen/pgen.py        408
-rw-r--r--    Parser/pgen/token.py        40
5 files changed, 642 insertions, 0 deletions
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Parser/pgen/__init__.py
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
new file mode 100644
index 0000000..965b08f
--- /dev/null
+++ b/Parser/pgen/__main__.py
@@ -0,0 +1,34 @@
+import argparse
+
+from .pgen import ParserGenerator
+
+def main():
+ parser = argparse.ArgumentParser(description="Parser generator main program.")
+ parser.add_argument(
+ "grammar", type=str, help="The file with the grammar definition in EBNF format"
+ )
+ parser.add_argument(
+ "tokens", type=str, help="The file with the token definitions"
+ )
+ parser.add_argument(
+ "graminit_h",
+ type=argparse.FileType('w'),
+ help="The path to write the grammar's non-terminals as #defines",
+ )
+ parser.add_argument(
+ "graminit_c",
+ type=argparse.FileType('w'),
+ help="The path to write the grammar as initialized data",
+ )
+
+ parser.add_argument("--verbose", "-v", action="count")
+ args = parser.parse_args()
+
+ p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
+ grammar = p.make_grammar()
+ grammar.produce_graminit_h(args.graminit_h.write)
+ grammar.produce_graminit_c(args.graminit_c.write)
+
+
+if __name__ == "__main__":
+ main()
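
__main__.py makes the package runnable as a module so the build can regenerate the parser tables (in the CPython tree the regen-grammar make target runs it in this module form). The same code path can also be exercised directly from Python. A minimal sketch, assuming it is run from the top of a CPython checkout; the input paths and output file names below are illustrative:

    # Rough equivalent of "python -m Parser.pgen <grammar> <tokens> <graminit.h> <graminit.c>".
    from Parser.pgen.pgen import ParserGenerator

    p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
    grammar = p.make_grammar()
    with open("graminit.h", "w") as h, open("graminit.c", "w") as c:
        grammar.produce_graminit_h(h.write)
        grammar.produce_graminit_c(c.write)
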
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
new file mode 100644
index 0000000..bd405e6
--- /dev/null
+++ b/Parser/pgen/grammar.py
@@ -0,0 +1,160 @@
+import collections
+
+class Grammar:
+    """Pgen parsing tables class.
+
+    Once initialized, this class holds the grammar tables used by the
+    parsing engine implemented in Parser/parser.c.  The class does not
+    build the tables itself; ParserGenerator (see the pgen module in
+    this package) fills in the instance variables directly.
+
+    The produce_graminit_h() method writes the grammar's non-terminal
+    symbols as C #defines (the generated graminit.h header), and
+    produce_graminit_c() writes the tables as initialized C data (the
+    generated graminit.c source).  Both methods take a writer callable
+    that accepts the output text, typically the write method of an
+    open file object.
+
+ The instance variables are as follows:
+
+ symbol2number -- a dict mapping symbol names to numbers. Symbol
+ numbers are always 256 or higher, to distinguish
+ them from token numbers, which are between 0 and
+ 255 (inclusive).
+
+ number2symbol -- a dict mapping numbers to symbol names;
+ these two are each other's inverse.
+
+ states -- a list of DFAs, where each DFA is a list of
+ states, each state is a list of arcs, and each
+ arc is a (i, j) pair where i is a label and j is
+ a state number. The DFA number is the index into
+ this list. (This name is slightly confusing.)
+ Final states are represented by a special arc of
+ the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of label numbers that
+                     can begin this grammar rule; print_dfas() packs
+                     it into the first-set bitmap in graminit.c.
+
+ labels -- a list of (x, y) pairs where x is either a token
+ number or a symbol number, and y is either None
+ or a string; the strings are keywords. The label
+ number is the index in this list; label numbers
+ are used to mark state transitions (arcs) in the
+ DFAs.
+
+ start -- the number of the grammar's start symbol.
+
+ keywords -- a dict mapping keyword strings to arc labels.
+
+ tokens -- a dict mapping token numbers to arc labels.
+
+ """
+
+ def __init__(self):
+ self.symbol2number = collections.OrderedDict()
+ self.number2symbol = collections.OrderedDict()
+ self.states = []
+ self.dfas = collections.OrderedDict()
+ self.labels = [(0, "EMPTY")]
+ self.keywords = collections.OrderedDict()
+ self.tokens = collections.OrderedDict()
+ self.symbol2label = collections.OrderedDict()
+ self.start = 256
+
+ def produce_graminit_h(self, writer):
+ writer("/* Generated by Parser/pgen */\n\n")
+ for number, symbol in self.number2symbol.items():
+ writer("#define {} {}\n".format(symbol, number))
+
+ def produce_graminit_c(self, writer):
+ writer("/* Generated by Parser/pgen */\n\n")
+
+ writer('#include "pgenheaders.h"\n')
+ writer('#include "grammar.h"\n')
+ writer("grammar _PyParser_Grammar;\n")
+
+ self.print_dfas(writer)
+ self.print_labels(writer)
+
+ writer("grammar _PyParser_Grammar = {\n")
+ writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
+ writer(" dfas,\n")
+ writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
+ writer(" {start_number}\n".format(start_number=self.start))
+ writer("};\n")
+
+ def print_labels(self, writer):
+ writer(
+ "static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
+ )
+ for label, name in self.labels:
+ if name is None:
+ writer(" {{{label}, 0}},\n".format(label=label))
+ else:
+ writer(
+ ' {{{label}, "{label_name}"}},\n'.format(
+ label=label, label_name=name
+ )
+ )
+ writer("};\n")
+
+ def print_dfas(self, writer):
+ self.print_states(writer)
+ writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
+ for dfaindex, dfa_elem in enumerate(self.dfas.items()):
+ symbol, (dfa, first_sets) = dfa_elem
+ writer(
+ ' {{{dfa_symbol}, "{symbol_name}", '.format(
+ dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
+ )
+ + "0, {n_states}, states_{dfa_index},\n".format(
+ n_states=len(dfa), dfa_index=dfaindex
+ )
+ )
+ writer(' "')
+
+ k = [name for label, name in self.labels if label in first_sets]
+ bitset = bytearray((len(self.labels) >> 3) + 1)
+ for token in first_sets:
+ bitset[token >> 3] |= 1 << (token & 7)
+ for byte in bitset:
+ writer("\\%03o" % (byte & 0xFF))
+ writer('"},\n')
+ writer("};\n")
+
+ def print_states(self, write):
+ for dfaindex, dfa in enumerate(self.states):
+ self.print_arcs(write, dfaindex, dfa)
+ write(
+ "static state states_{dfa_index}[{n_states}] = {{\n".format(
+ dfa_index=dfaindex, n_states=len(dfa)
+ )
+ )
+ for stateindex, state in enumerate(dfa):
+ narcs = len(state)
+ write(
+ " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
+ n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
+ )
+ )
+ write("};\n")
+
+ def print_arcs(self, write, dfaindex, states):
+ for stateindex, state in enumerate(states):
+ narcs = len(state)
+ write(
+ "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
+ dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
+ )
+ )
+ for a, b in state:
+ write(
+ " {{{from_label}, {to_state}}},\n".format(
+ from_label=a, to_state=b
+ )
+ )
+ write("};\n")
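
The least obvious part of grammar.py is the first-set encoding in print_dfas(): the label numbers that can begin a rule are packed into a bitmap, one bit per label and eight labels per byte, and written into graminit.c as octal escape sequences. A small standalone sketch of that packing (the label numbers in the example are invented):

    def pack_first_set(first_labels, n_labels):
        # One bit per label, eight labels per byte, as in print_dfas() above.
        bitset = bytearray((n_labels >> 3) + 1)
        for label in first_labels:
            bitset[label >> 3] |= 1 << (label & 7)
        # graminit.c embeds each byte as a three-digit octal escape.
        return "".join("\\%03o" % byte for byte in bitset)

    # Suppose labels 1, 9 and 11 can start the rule, out of 16 labels in total.
    print(pack_first_set({1, 9, 11}, 16))   # prints \002\012\000
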
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
new file mode 100644
index 0000000..c878919
--- /dev/null
+++ b/Parser/pgen/pgen.py
@@ -0,0 +1,408 @@
+import collections
+import tokenize # from stdlib
+
+from . import grammar, token
+
+class ParserGenerator(object):
+
+ def __init__(self, grammar_file, token_file, stream=None, verbose=False):
+ close_stream = None
+ if stream is None:
+ stream = open(grammar_file)
+ close_stream = stream.close
+ with open(token_file) as tok_file:
+ token_lines = tok_file.readlines()
+ self.tokens = dict(token.generate_tokens(token_lines))
+ self.opmap = dict(token.generate_opmap(token_lines))
+ # Manually add <> so it does not collide with !=
+ self.opmap['<>'] = "NOTEQUAL"
+ self.verbose = verbose
+ self.filename = grammar_file
+ self.stream = stream
+ self.generator = tokenize.generate_tokens(stream.readline)
+ self.gettoken() # Initialize lookahead
+ self.dfas, self.startsymbol = self.parse()
+ if close_stream is not None:
+ close_stream()
+ self.first = {} # map from symbol name to set of tokens
+ self.addfirstsets()
+
+ def make_grammar(self):
+ c = grammar.Grammar()
+ names = list(self.dfas.keys())
+ names.remove(self.startsymbol)
+ names.insert(0, self.startsymbol)
+ for name in names:
+ i = 256 + len(c.symbol2number)
+ c.symbol2number[name] = i
+ c.number2symbol[i] = name
+ for name in names:
+ self.make_label(c, name)
+ dfa = self.dfas[name]
+ states = []
+ for state in dfa:
+ arcs = []
+ for label, next in sorted(state.arcs.items()):
+ arcs.append((self.make_label(c, label), dfa.index(next)))
+ if state.isfinal:
+ arcs.append((0, dfa.index(state)))
+ states.append(arcs)
+ c.states.append(states)
+ c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+ c.start = c.symbol2number[self.startsymbol]
+
+ if self.verbose:
+ print("")
+ print("Grammar summary")
+ print("===============")
+
+ print("- {n_labels} labels".format(n_labels=len(c.labels)))
+ print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
+ print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
+ print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
+ print(
+ "- Start symbol: {start_symbol}".format(
+ start_symbol=c.number2symbol[c.start]
+ )
+ )
+ return c
+
+ def make_first(self, c, name):
+ rawfirst = self.first[name]
+ first = set()
+ for label in sorted(rawfirst):
+ ilabel = self.make_label(c, label)
+ ##assert ilabel not in first # XXX failed on <> ... !=
+ first.add(ilabel)
+ return first
+
+ def make_label(self, c, label):
+ # XXX Maybe this should be a method on a subclass of converter?
+ ilabel = len(c.labels)
+ if label[0].isalpha():
+ # Either a symbol name or a named token
+ if label in c.symbol2number:
+ # A symbol name (a non-terminal)
+ if label in c.symbol2label:
+ return c.symbol2label[label]
+ else:
+ c.labels.append((c.symbol2number[label], None))
+ c.symbol2label[label] = ilabel
+ return ilabel
+ else:
+ # A named token (NAME, NUMBER, STRING)
+ itoken = self.tokens.get(label, None)
+ assert isinstance(itoken, int), label
+ assert itoken in self.tokens.values(), label
+ if itoken in c.tokens:
+ return c.tokens[itoken]
+ else:
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+ else:
+ # Either a keyword or an operator
+ assert label[0] in ('"', "'"), label
+ value = eval(label)
+ if value[0].isalpha():
+ # A keyword
+ if value in c.keywords:
+ return c.keywords[value]
+ else:
+ c.labels.append((self.tokens["NAME"], value))
+ c.keywords[value] = ilabel
+ return ilabel
+ else:
+ # An operator (any non-numeric token)
+ tok_name = self.opmap[value] # Fails if unknown token
+ itoken = self.tokens[tok_name]
+ if itoken in c.tokens:
+ return c.tokens[itoken]
+ else:
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+
+ def addfirstsets(self):
+ names = list(self.dfas.keys())
+ for name in names:
+ if name not in self.first:
+ self.calcfirst(name)
+
+ if self.verbose:
+ print("First set for {dfa_name}".format(dfa_name=name))
+ for item in self.first[name]:
+ print(" - {terminal}".format(terminal=item))
+
+ def calcfirst(self, name):
+ dfa = self.dfas[name]
+ self.first[name] = None # dummy to detect left recursion
+ state = dfa[0]
+ totalset = set()
+ overlapcheck = {}
+ for label, next in state.arcs.items():
+ if label in self.dfas:
+ if label in self.first:
+ fset = self.first[label]
+ if fset is None:
+ raise ValueError("recursion for rule %r" % name)
+ else:
+ self.calcfirst(label)
+ fset = self.first[label]
+ totalset.update(fset)
+ overlapcheck[label] = fset
+ else:
+ totalset.add(label)
+ overlapcheck[label] = {label}
+ inverse = {}
+ for label, itsfirst in overlapcheck.items():
+ for symbol in itsfirst:
+ if symbol in inverse:
+ raise ValueError("rule %s is ambiguous; %s is in the"
+ " first sets of %s as well as %s" %
+ (name, symbol, label, inverse[symbol]))
+ inverse[symbol] = label
+ self.first[name] = totalset
+
+ def parse(self):
+ dfas = collections.OrderedDict()
+ startsymbol = None
+ # MSTART: (NEWLINE | RULE)* ENDMARKER
+ while self.type != tokenize.ENDMARKER:
+ while self.type == tokenize.NEWLINE:
+ self.gettoken()
+ # RULE: NAME ':' RHS NEWLINE
+ name = self.expect(tokenize.NAME)
+ if self.verbose:
+ print("Processing rule {dfa_name}".format(dfa_name=name))
+ self.expect(tokenize.OP, ":")
+ a, z = self.parse_rhs()
+ self.expect(tokenize.NEWLINE)
+ if self.verbose:
+ self.dump_nfa(name, a, z)
+ dfa = self.make_dfa(a, z)
+ if self.verbose:
+ self.dump_dfa(name, dfa)
+ oldlen = len(dfa)
+ self.simplify_dfa(dfa)
+ newlen = len(dfa)
+ dfas[name] = dfa
+ #print name, oldlen, newlen
+ if startsymbol is None:
+ startsymbol = name
+ return dfas, startsymbol
+
+ def make_dfa(self, start, finish):
+        # To turn an NFA into a DFA, we define the states of the DFA
+        # to correspond to *sets* of states of the NFA.  Then do some
+        # state reduction (see simplify_dfa).  The sets of NFA states
+        # are represented here as Python set objects.
+ assert isinstance(start, NFAState)
+ assert isinstance(finish, NFAState)
+ def closure(state):
+ base = set()
+ addclosure(state, base)
+ return base
+ def addclosure(state, base):
+ assert isinstance(state, NFAState)
+ if state in base:
+ return
+ base.add(state)
+ for label, next in state.arcs:
+ if label is None:
+ addclosure(next, base)
+ states = [DFAState(closure(start), finish)]
+ for state in states: # NB states grows while we're iterating
+ arcs = {}
+ for nfastate in state.nfaset:
+ for label, next in nfastate.arcs:
+ if label is not None:
+ addclosure(next, arcs.setdefault(label, set()))
+ for label, nfaset in sorted(arcs.items()):
+ for st in states:
+ if st.nfaset == nfaset:
+ break
+ else:
+ st = DFAState(nfaset, finish)
+ states.append(st)
+ state.addarc(st, label)
+ return states # List of DFAState instances; first one is start
+
+ def dump_nfa(self, name, start, finish):
+ print("Dump of NFA for", name)
+ todo = [start]
+ for i, state in enumerate(todo):
+ print(" State", i, state is finish and "(final)" or "")
+ for label, next in state.arcs:
+ if next in todo:
+ j = todo.index(next)
+ else:
+ j = len(todo)
+ todo.append(next)
+ if label is None:
+ print(" -> %d" % j)
+ else:
+ print(" %s -> %d" % (label, j))
+
+ def dump_dfa(self, name, dfa):
+ print("Dump of DFA for", name)
+ for i, state in enumerate(dfa):
+ print(" State", i, state.isfinal and "(final)" or "")
+ for label, next in sorted(state.arcs.items()):
+ print(" %s -> %d" % (label, dfa.index(next)))
+
+ def simplify_dfa(self, dfa):
+ # This is not theoretically optimal, but works well enough.
+ # Algorithm: repeatedly look for two states that have the same
+ # set of arcs (same labels pointing to the same nodes) and
+ # unify them, until things stop changing.
+
+ # dfa is a list of DFAState instances
+ changes = True
+ while changes:
+ changes = False
+ for i, state_i in enumerate(dfa):
+ for j in range(i+1, len(dfa)):
+ state_j = dfa[j]
+ if state_i == state_j:
+ #print " unify", i, j
+ del dfa[j]
+ for state in dfa:
+ state.unifystate(state_j, state_i)
+ changes = True
+ break
+
+ def parse_rhs(self):
+ # RHS: ALT ('|' ALT)*
+ a, z = self.parse_alt()
+ if self.value != "|":
+ return a, z
+ else:
+ aa = NFAState()
+ zz = NFAState()
+ aa.addarc(a)
+ z.addarc(zz)
+ while self.value == "|":
+ self.gettoken()
+ a, z = self.parse_alt()
+ aa.addarc(a)
+ z.addarc(zz)
+ return aa, zz
+
+ def parse_alt(self):
+ # ALT: ITEM+
+ a, b = self.parse_item()
+ while (self.value in ("(", "[") or
+ self.type in (tokenize.NAME, tokenize.STRING)):
+ c, d = self.parse_item()
+ b.addarc(c)
+ b = d
+ return a, b
+
+ def parse_item(self):
+ # ITEM: '[' RHS ']' | ATOM ['+' | '*']
+ if self.value == "[":
+ self.gettoken()
+ a, z = self.parse_rhs()
+ self.expect(tokenize.OP, "]")
+ a.addarc(z)
+ return a, z
+ else:
+ a, z = self.parse_atom()
+ value = self.value
+ if value not in ("+", "*"):
+ return a, z
+ self.gettoken()
+ z.addarc(a)
+ if value == "+":
+ return a, z
+ else:
+ return a, a
+
+ def parse_atom(self):
+ # ATOM: '(' RHS ')' | NAME | STRING
+ if self.value == "(":
+ self.gettoken()
+ a, z = self.parse_rhs()
+ self.expect(tokenize.OP, ")")
+ return a, z
+ elif self.type in (tokenize.NAME, tokenize.STRING):
+ a = NFAState()
+ z = NFAState()
+ a.addarc(z, self.value)
+ self.gettoken()
+ return a, z
+ else:
+ self.raise_error("expected (...) or NAME or STRING, got %s/%s",
+ self.type, self.value)
+
+ def expect(self, type, value=None):
+ if self.type != type or (value is not None and self.value != value):
+ self.raise_error("expected %s/%s, got %s/%s",
+ type, value, self.type, self.value)
+ value = self.value
+ self.gettoken()
+ return value
+
+ def gettoken(self):
+ tup = next(self.generator)
+ while tup[0] in (tokenize.COMMENT, tokenize.NL):
+ tup = next(self.generator)
+ self.type, self.value, self.begin, self.end, self.line = tup
+ # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
+
+ def raise_error(self, msg, *args):
+ if args:
+ try:
+ msg = msg % args
+ except:
+ msg = " ".join([msg] + list(map(str, args)))
+ raise SyntaxError(msg, (self.filename, self.end[0],
+ self.end[1], self.line))
+
+class NFAState(object):
+
+ def __init__(self):
+ self.arcs = [] # list of (label, NFAState) pairs
+
+ def addarc(self, next, label=None):
+ assert label is None or isinstance(label, str)
+ assert isinstance(next, NFAState)
+ self.arcs.append((label, next))
+
+class DFAState(object):
+
+ def __init__(self, nfaset, final):
+ assert isinstance(nfaset, set)
+ assert isinstance(next(iter(nfaset)), NFAState)
+ assert isinstance(final, NFAState)
+ self.nfaset = nfaset
+ self.isfinal = final in nfaset
+ self.arcs = {} # map from label to DFAState
+
+ def addarc(self, next, label):
+ assert isinstance(label, str)
+ assert label not in self.arcs
+ assert isinstance(next, DFAState)
+ self.arcs[label] = next
+
+ def unifystate(self, old, new):
+ for label, next in self.arcs.items():
+ if next is old:
+ self.arcs[label] = new
+
+ def __eq__(self, other):
+ # Equality test -- ignore the nfaset instance variable
+ assert isinstance(other, DFAState)
+ if self.isfinal != other.isfinal:
+ return False
+ # Can't just return self.arcs == other.arcs, because that
+ # would invoke this method recursively, with cycles...
+ if len(self.arcs) != len(other.arcs):
+ return False
+ for label, next in self.arcs.items():
+ if next is not other.arcs.get(label):
+ return False
+ return True
+
+ __hash__ = None # For Py3 compatibility.
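
Most of pgen.py is a small recursive-descent parser for the EBNF metagrammar (parse_rhs, parse_alt, parse_item and parse_atom build one NFA per rule), followed by the textbook subset construction in make_dfa() and a state-merging pass in simplify_dfa(). The key idea of make_dfa(), namely that each DFA state is the epsilon-closure of a set of NFA states and that the list of DFA states grows while it is being iterated, is sketched below on a hand-built toy NFA; the example is independent of the classes above and the language "a b*" is invented for illustration:

    # Toy subset construction mirroring ParserGenerator.make_dfa();
    # None serves as the epsilon label, as in NFAState.addarc().
    nfa = {
        0: [("a", 1)],        # 0 --a--> 1
        1: [(None, 2)],       # epsilon arc
        2: [("b", 2)],        # 2 --b--> 2, i.e. "a" followed by any number of "b"
    }

    def closure(states):
        todo, seen = list(states), set(states)
        while todo:
            for label, nxt in nfa[todo.pop()]:
                if label is None and nxt not in seen:
                    seen.add(nxt)
                    todo.append(nxt)
        return frozenset(seen)

    dfa_states = [closure({0})]
    dfa_arcs = {}
    for state in dfa_states:              # the list grows while we iterate, like make_dfa()
        arcs = {}
        for s in state:
            for label, nxt in nfa[s]:
                if label is not None:
                    arcs.setdefault(label, set()).add(nxt)
        for label, targets in sorted(arcs.items()):
            target = closure(targets)
            if target not in dfa_states:
                dfa_states.append(target)
            dfa_arcs[(state, label)] = target

    print(len(dfa_states))   # 3; pgen's simplify_dfa() would then merge the two equivalent states
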
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
new file mode 100644
index 0000000..f9d45c4
--- /dev/null
+++ b/Parser/pgen/token.py
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+ numbers = itertools.count(0)
+ for line in tokens:
+ line = line.strip()
+
+ if not line:
+ continue
+ if line.strip().startswith('#'):
+ continue
+
+ name = line.split()[0]
+ yield (name, next(numbers))
+
+ yield ('N_TOKENS', next(numbers))
+ yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+ for line in tokens:
+ line = line.strip()
+
+ if not line:
+ continue
+ if line.strip().startswith('#'):
+ continue
+
+ pieces = line.split()
+
+ if len(pieces) != 2:
+ continue
+
+ name, op = pieces
+ yield (op.strip("'"), name)
+
+ # Yield independently <>. This is needed so it does not collide
+ # with the token generation in "generate_tokens" because if this
+ # symbol is included in Grammar/Tokens, it will collide with !=
+ # as it has the same name (NOTEQUAL).
+ yield ('<>', 'NOTEQUAL')
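
token.py derives both the token numbering and the operator-to-name mapping from the same token definition file (Grammar/Tokens in the CPython tree). A quick sketch of what the two generators yield for a small invented excerpt of such a file, assuming the interpreter is started from the top of a CPython checkout so the package is importable:

    from Parser.pgen import token

    lines = [
        "ENDMARKER\n",
        "NAME\n",
        "NOTEQUAL                '!='\n",
        "# blank lines and comments are skipped\n",
    ]

    print(dict(token.generate_tokens(lines)))
    # {'ENDMARKER': 0, 'NAME': 1, 'NOTEQUAL': 2, 'N_TOKENS': 3, 'NT_OFFSET': 256}

    print(dict(token.generate_opmap(lines)))
    # {'!=': 'NOTEQUAL', '<>': 'NOTEQUAL'}

Note that '<>' is injected only into the operator map (and again, defensively, in ParserGenerator.__init__) so that it shares the NOTEQUAL token with '!=' without appearing in the token numbering.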