Diffstat (limited to 'Parser/pgen')
-rw-r--r-- | Parser/pgen/__init__.py |   0
-rw-r--r-- | Parser/pgen/__main__.py |  34
-rw-r--r-- | Parser/pgen/grammar.py  | 160
-rw-r--r-- | Parser/pgen/pgen.py     | 408
-rw-r--r-- | Parser/pgen/token.py    |  40
5 files changed, 642 insertions, 0 deletions
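
Before the diff body, a quick orientation: the new package is driven with a grammar file, a token definition file, and two output paths, exactly as __main__.py below does. A minimal sketch of that flow, assuming the checkout root is on sys.path; the Grammar/Grammar and Grammar/Tokens input paths and the local output names are illustrative:

from Parser.pgen.pgen import ParserGenerator

# Build the parse tables from the EBNF grammar and the token definitions.
p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=False)
grammar = p.make_grammar()

# Emit the generated non-terminal #defines and the initialized grammar data,
# as __main__.py does with its graminit_h / graminit_c file arguments.
with open("graminit.h", "w") as h, open("graminit.c", "w") as c:
    grammar.produce_graminit_h(h.write)
    grammar.produce_graminit_c(c.write)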
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Parser/pgen/__init__.py diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py new file mode 100644 index 0000000..965b08f --- /dev/null +++ b/Parser/pgen/__main__.py @@ -0,0 +1,34 @@ +import argparse + +from .pgen import ParserGenerator + +def main(): + parser = argparse.ArgumentParser(description="Parser generator main program.") + parser.add_argument( + "grammar", type=str, help="The file with the grammar definition in EBNF format" + ) + parser.add_argument( + "tokens", type=str, help="The file with the token definitions" + ) + parser.add_argument( + "graminit_h", + type=argparse.FileType('w'), + help="The path to write the grammar's non-terminals as #defines", + ) + parser.add_argument( + "graminit_c", + type=argparse.FileType('w'), + help="The path to write the grammar as initialized data", + ) + + parser.add_argument("--verbose", "-v", action="count") + args = parser.parse_args() + + p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose) + grammar = p.make_grammar() + grammar.produce_graminit_h(args.graminit_h.write) + grammar.produce_graminit_c(args.graminit_c.write) + + +if __name__ == "__main__": + main() diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py new file mode 100644 index 0000000..bd405e6 --- /dev/null +++ b/Parser/pgen/grammar.py @@ -0,0 +1,160 @@ +import collections + +class Grammar: + """Pgen parsing tables conversion class. + + Once initialized, this class supplies the grammar tables for the + parsing engine implemented by parse.py. The parsing engine + accesses the instance variables directly. The class here does not + provide initialization of the tables; several subclasses exist to + do this (see the conv and pgen modules). + + The load() method reads the tables from a pickle file, which is + much faster than the other ways offered by subclasses. The pickle + file is written by calling dump() (after loading the grammar + tables using a subclass). The report() method prints a readable + representation of the tables to stdout, for debugging. + + The instance variables are as follows: + + symbol2number -- a dict mapping symbol names to numbers. Symbol + numbers are always 256 or higher, to distinguish + them from token numbers, which are between 0 and + 255 (inclusive). + + number2symbol -- a dict mapping numbers to symbol names; + these two are each other's inverse. + + states -- a list of DFAs, where each DFA is a list of + states, each state is a list of arcs, and each + arc is a (i, j) pair where i is a label and j is + a state number. The DFA number is the index into + this list. (This name is slightly confusing.) + Final states are represented by a special arc of + the form (0, j) where j is its own state number. + + dfas -- a dict mapping symbol numbers to (DFA, first) + pairs, where DFA is an item from the states list + above, and first is a set of tokens that can + begin this grammar rule (represented by a dict + whose values are always 1). + + labels -- a list of (x, y) pairs where x is either a token + number or a symbol number, and y is either None + or a string; the strings are keywords. The label + number is the index in this list; label numbers + are used to mark state transitions (arcs) in the + DFAs. + + start -- the number of the grammar's start symbol. + + keywords -- a dict mapping keyword strings to arc labels. + + tokens -- a dict mapping token numbers to arc labels. 
+ + """ + + def __init__(self): + self.symbol2number = collections.OrderedDict() + self.number2symbol = collections.OrderedDict() + self.states = [] + self.dfas = collections.OrderedDict() + self.labels = [(0, "EMPTY")] + self.keywords = collections.OrderedDict() + self.tokens = collections.OrderedDict() + self.symbol2label = collections.OrderedDict() + self.start = 256 + + def produce_graminit_h(self, writer): + writer("/* Generated by Parser/pgen */\n\n") + for number, symbol in self.number2symbol.items(): + writer("#define {} {}\n".format(symbol, number)) + + def produce_graminit_c(self, writer): + writer("/* Generated by Parser/pgen */\n\n") + + writer('#include "pgenheaders.h"\n') + writer('#include "grammar.h"\n') + writer("grammar _PyParser_Grammar;\n") + + self.print_dfas(writer) + self.print_labels(writer) + + writer("grammar _PyParser_Grammar = {\n") + writer(" {n_dfas},\n".format(n_dfas=len(self.dfas))) + writer(" dfas,\n") + writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels))) + writer(" {start_number}\n".format(start_number=self.start)) + writer("};\n") + + def print_labels(self, writer): + writer( + "static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels)) + ) + for label, name in self.labels: + if name is None: + writer(" {{{label}, 0}},\n".format(label=label)) + else: + writer( + ' {{{label}, "{label_name}"}},\n'.format( + label=label, label_name=name + ) + ) + writer("};\n") + + def print_dfas(self, writer): + self.print_states(writer) + writer("static dfa dfas[{}] = {{\n".format(len(self.dfas))) + for dfaindex, dfa_elem in enumerate(self.dfas.items()): + symbol, (dfa, first_sets) = dfa_elem + writer( + ' {{{dfa_symbol}, "{symbol_name}", '.format( + dfa_symbol=symbol, symbol_name=self.number2symbol[symbol] + ) + + "0, {n_states}, states_{dfa_index},\n".format( + n_states=len(dfa), dfa_index=dfaindex + ) + ) + writer(' "') + + k = [name for label, name in self.labels if label in first_sets] + bitset = bytearray((len(self.labels) >> 3) + 1) + for token in first_sets: + bitset[token >> 3] |= 1 << (token & 7) + for byte in bitset: + writer("\\%03o" % (byte & 0xFF)) + writer('"},\n') + writer("};\n") + + def print_states(self, write): + for dfaindex, dfa in enumerate(self.states): + self.print_arcs(write, dfaindex, dfa) + write( + "static state states_{dfa_index}[{n_states}] = {{\n".format( + dfa_index=dfaindex, n_states=len(dfa) + ) + ) + for stateindex, state in enumerate(dfa): + narcs = len(state) + write( + " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format( + n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex + ) + ) + write("};\n") + + def print_arcs(self, write, dfaindex, states): + for stateindex, state in enumerate(states): + narcs = len(state) + write( + "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format( + dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs + ) + ) + for a, b in state: + write( + " {{{from_label}, {to_state}}},\n".format( + from_label=a, to_state=b + ) + ) + write("};\n") diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py new file mode 100644 index 0000000..c878919 --- /dev/null +++ b/Parser/pgen/pgen.py @@ -0,0 +1,408 @@ +import collections +import tokenize # from stdlib + +from . 
import grammar, token + +class ParserGenerator(object): + + def __init__(self, grammar_file, token_file, stream=None, verbose=False): + close_stream = None + if stream is None: + stream = open(grammar_file) + close_stream = stream.close + with open(token_file) as tok_file: + token_lines = tok_file.readlines() + self.tokens = dict(token.generate_tokens(token_lines)) + self.opmap = dict(token.generate_opmap(token_lines)) + # Manually add <> so it does not collide with != + self.opmap['<>'] = "NOTEQUAL" + self.verbose = verbose + self.filename = grammar_file + self.stream = stream + self.generator = tokenize.generate_tokens(stream.readline) + self.gettoken() # Initialize lookahead + self.dfas, self.startsymbol = self.parse() + if close_stream is not None: + close_stream() + self.first = {} # map from symbol name to set of tokens + self.addfirstsets() + + def make_grammar(self): + c = grammar.Grammar() + names = list(self.dfas.keys()) + names.remove(self.startsymbol) + names.insert(0, self.startsymbol) + for name in names: + i = 256 + len(c.symbol2number) + c.symbol2number[name] = i + c.number2symbol[i] = name + for name in names: + self.make_label(c, name) + dfa = self.dfas[name] + states = [] + for state in dfa: + arcs = [] + for label, next in sorted(state.arcs.items()): + arcs.append((self.make_label(c, label), dfa.index(next))) + if state.isfinal: + arcs.append((0, dfa.index(state))) + states.append(arcs) + c.states.append(states) + c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) + c.start = c.symbol2number[self.startsymbol] + + if self.verbose: + print("") + print("Grammar summary") + print("===============") + + print("- {n_labels} labels".format(n_labels=len(c.labels))) + print("- {n_dfas} dfas".format(n_dfas=len(c.dfas))) + print("- {n_tokens} tokens".format(n_tokens=len(c.tokens))) + print("- {n_keywords} keywords".format(n_keywords=len(c.keywords))) + print( + "- Start symbol: {start_symbol}".format( + start_symbol=c.number2symbol[c.start] + ) + ) + return c + + def make_first(self, c, name): + rawfirst = self.first[name] + first = set() + for label in sorted(rawfirst): + ilabel = self.make_label(c, label) + ##assert ilabel not in first # XXX failed on <> ... != + first.add(ilabel) + return first + + def make_label(self, c, label): + # XXX Maybe this should be a method on a subclass of converter? 
+ ilabel = len(c.labels) + if label[0].isalpha(): + # Either a symbol name or a named token + if label in c.symbol2number: + # A symbol name (a non-terminal) + if label in c.symbol2label: + return c.symbol2label[label] + else: + c.labels.append((c.symbol2number[label], None)) + c.symbol2label[label] = ilabel + return ilabel + else: + # A named token (NAME, NUMBER, STRING) + itoken = self.tokens.get(label, None) + assert isinstance(itoken, int), label + assert itoken in self.tokens.values(), label + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + value = eval(label) + if value[0].isalpha(): + # A keyword + if value in c.keywords: + return c.keywords[value] + else: + c.labels.append((self.tokens["NAME"], value)) + c.keywords[value] = ilabel + return ilabel + else: + # An operator (any non-numeric token) + tok_name = self.opmap[value] # Fails if unknown token + itoken = self.tokens[tok_name] + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + + def addfirstsets(self): + names = list(self.dfas.keys()) + for name in names: + if name not in self.first: + self.calcfirst(name) + + if self.verbose: + print("First set for {dfa_name}".format(dfa_name=name)) + for item in self.first[name]: + print(" - {terminal}".format(terminal=item)) + + def calcfirst(self, name): + dfa = self.dfas[name] + self.first[name] = None # dummy to detect left recursion + state = dfa[0] + totalset = set() + overlapcheck = {} + for label, next in state.arcs.items(): + if label in self.dfas: + if label in self.first: + fset = self.first[label] + if fset is None: + raise ValueError("recursion for rule %r" % name) + else: + self.calcfirst(label) + fset = self.first[label] + totalset.update(fset) + overlapcheck[label] = fset + else: + totalset.add(label) + overlapcheck[label] = {label} + inverse = {} + for label, itsfirst in overlapcheck.items(): + for symbol in itsfirst: + if symbol in inverse: + raise ValueError("rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" % + (name, symbol, label, inverse[symbol])) + inverse[symbol] = label + self.first[name] = totalset + + def parse(self): + dfas = collections.OrderedDict() + startsymbol = None + # MSTART: (NEWLINE | RULE)* ENDMARKER + while self.type != tokenize.ENDMARKER: + while self.type == tokenize.NEWLINE: + self.gettoken() + # RULE: NAME ':' RHS NEWLINE + name = self.expect(tokenize.NAME) + if self.verbose: + print("Processing rule {dfa_name}".format(dfa_name=name)) + self.expect(tokenize.OP, ":") + a, z = self.parse_rhs() + self.expect(tokenize.NEWLINE) + if self.verbose: + self.dump_nfa(name, a, z) + dfa = self.make_dfa(a, z) + if self.verbose: + self.dump_dfa(name, dfa) + oldlen = len(dfa) + self.simplify_dfa(dfa) + newlen = len(dfa) + dfas[name] = dfa + #print name, oldlen, newlen + if startsymbol is None: + startsymbol = name + return dfas, startsymbol + + def make_dfa(self, start, finish): + # To turn an NFA into a DFA, we define the states of the DFA + # to correspond to *sets* of states of the NFA. Then do some + # state reduction. Let's represent sets as dicts with 1 for + # values. 
+ assert isinstance(start, NFAState) + assert isinstance(finish, NFAState) + def closure(state): + base = set() + addclosure(state, base) + return base + def addclosure(state, base): + assert isinstance(state, NFAState) + if state in base: + return + base.add(state) + for label, next in state.arcs: + if label is None: + addclosure(next, base) + states = [DFAState(closure(start), finish)] + for state in states: # NB states grows while we're iterating + arcs = {} + for nfastate in state.nfaset: + for label, next in nfastate.arcs: + if label is not None: + addclosure(next, arcs.setdefault(label, set())) + for label, nfaset in sorted(arcs.items()): + for st in states: + if st.nfaset == nfaset: + break + else: + st = DFAState(nfaset, finish) + states.append(st) + state.addarc(st, label) + return states # List of DFAState instances; first one is start + + def dump_nfa(self, name, start, finish): + print("Dump of NFA for", name) + todo = [start] + for i, state in enumerate(todo): + print(" State", i, state is finish and "(final)" or "") + for label, next in state.arcs: + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + print(" -> %d" % j) + else: + print(" %s -> %d" % (label, j)) + + def dump_dfa(self, name, dfa): + print("Dump of DFA for", name) + for i, state in enumerate(dfa): + print(" State", i, state.isfinal and "(final)" or "") + for label, next in sorted(state.arcs.items()): + print(" %s -> %d" % (label, dfa.index(next))) + + def simplify_dfa(self, dfa): + # This is not theoretically optimal, but works well enough. + # Algorithm: repeatedly look for two states that have the same + # set of arcs (same labels pointing to the same nodes) and + # unify them, until things stop changing. + + # dfa is a list of DFAState instances + changes = True + while changes: + changes = False + for i, state_i in enumerate(dfa): + for j in range(i+1, len(dfa)): + state_j = dfa[j] + if state_i == state_j: + #print " unify", i, j + del dfa[j] + for state in dfa: + state.unifystate(state_j, state_i) + changes = True + break + + def parse_rhs(self): + # RHS: ALT ('|' ALT)* + a, z = self.parse_alt() + if self.value != "|": + return a, z + else: + aa = NFAState() + zz = NFAState() + aa.addarc(a) + z.addarc(zz) + while self.value == "|": + self.gettoken() + a, z = self.parse_alt() + aa.addarc(a) + z.addarc(zz) + return aa, zz + + def parse_alt(self): + # ALT: ITEM+ + a, b = self.parse_item() + while (self.value in ("(", "[") or + self.type in (tokenize.NAME, tokenize.STRING)): + c, d = self.parse_item() + b.addarc(c) + b = d + return a, b + + def parse_item(self): + # ITEM: '[' RHS ']' | ATOM ['+' | '*'] + if self.value == "[": + self.gettoken() + a, z = self.parse_rhs() + self.expect(tokenize.OP, "]") + a.addarc(z) + return a, z + else: + a, z = self.parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self.gettoken() + z.addarc(a) + if value == "+": + return a, z + else: + return a, a + + def parse_atom(self): + # ATOM: '(' RHS ')' | NAME | STRING + if self.value == "(": + self.gettoken() + a, z = self.parse_rhs() + self.expect(tokenize.OP, ")") + return a, z + elif self.type in (tokenize.NAME, tokenize.STRING): + a = NFAState() + z = NFAState() + a.addarc(z, self.value) + self.gettoken() + return a, z + else: + self.raise_error("expected (...) 
or NAME or STRING, got %s/%s", + self.type, self.value) + + def expect(self, type, value=None): + if self.type != type or (value is not None and self.value != value): + self.raise_error("expected %s/%s, got %s/%s", + type, value, self.type, self.value) + value = self.value + self.gettoken() + return value + + def gettoken(self): + tup = next(self.generator) + while tup[0] in (tokenize.COMMENT, tokenize.NL): + tup = next(self.generator) + self.type, self.value, self.begin, self.end, self.line = tup + # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value)) + + def raise_error(self, msg, *args): + if args: + try: + msg = msg % args + except: + msg = " ".join([msg] + list(map(str, args))) + raise SyntaxError(msg, (self.filename, self.end[0], + self.end[1], self.line)) + +class NFAState(object): + + def __init__(self): + self.arcs = [] # list of (label, NFAState) pairs + + def addarc(self, next, label=None): + assert label is None or isinstance(label, str) + assert isinstance(next, NFAState) + self.arcs.append((label, next)) + +class DFAState(object): + + def __init__(self, nfaset, final): + assert isinstance(nfaset, set) + assert isinstance(next(iter(nfaset)), NFAState) + assert isinstance(final, NFAState) + self.nfaset = nfaset + self.isfinal = final in nfaset + self.arcs = {} # map from label to DFAState + + def addarc(self, next, label): + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(next, DFAState) + self.arcs[label] = next + + def unifystate(self, old, new): + for label, next in self.arcs.items(): + if next is old: + self.arcs[label] = new + + def __eq__(self, other): + # Equality test -- ignore the nfaset instance variable + assert isinstance(other, DFAState) + if self.isfinal != other.isfinal: + return False + # Can't just return self.arcs == other.arcs, because that + # would invoke this method recursively, with cycles... + if len(self.arcs) != len(other.arcs): + return False + for label, next in self.arcs.items(): + if next is not other.arcs.get(label): + return False + return True + + __hash__ = None # For Py3 compatibility. diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py new file mode 100644 index 0000000..f9d45c4 --- /dev/null +++ b/Parser/pgen/token.py @@ -0,0 +1,40 @@ +import itertools + +def generate_tokens(tokens): + numbers = itertools.count(0) + for line in tokens: + line = line.strip() + + if not line: + continue + if line.strip().startswith('#'): + continue + + name = line.split()[0] + yield (name, next(numbers)) + + yield ('N_TOKENS', next(numbers)) + yield ('NT_OFFSET', 256) + +def generate_opmap(tokens): + for line in tokens: + line = line.strip() + + if not line: + continue + if line.strip().startswith('#'): + continue + + pieces = line.split() + + if len(pieces) != 2: + continue + + name, op = pieces + yield (op.strip("'"), name) + + # Yield independently <>. This is needed so it does not collide + # with the token generation in "generate_tokens" because if this + # symbol is included in Grammar/Tokens, it will collide with != + # as it has the same name (NOTEQUAL). + yield ('<>', 'NOTEQUAL') |
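
A note on the token definition format consumed above: token.py treats the file as one token name per line, optionally followed by the operator text in quotes; blank lines and '#' comments are skipped. A small sketch of what the two generators yield for a made-up input (the real input is Grammar/Tokens; these lines are only an example):

from Parser.pgen import token

lines = [
    "ENDMARKER\n",
    "NAME\n",
    "LPAR '('\n",                    # a name plus its quoted operator text
    "# comment lines are ignored\n",
]

print(dict(token.generate_tokens(lines)))
# {'ENDMARKER': 0, 'NAME': 1, 'LPAR': 2, 'N_TOKENS': 3, 'NT_OFFSET': 256}

print(dict(token.generate_opmap(lines)))
# {'(': 'LPAR', '<>': 'NOTEQUAL'}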
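
make_dfa in pgen.py is a standard subset construction: each DFA state corresponds to the epsilon-closure of a set of NFA states, and new DFA states are discovered while walking a growing work list. A stripped-down restatement of the idea on plain dictionaries (labels are strings, None marks an epsilon arc; this is a sketch of the technique, not the code from the diff):

def closure(state, arcs):
    # 'state' plus everything reachable through None-labelled (epsilon) arcs.
    seen, stack = set(), [state]
    while stack:
        s = stack.pop()
        if s in seen:
            continue
        seen.add(s)
        stack.extend(t for label, t in arcs[s] if label is None)
    return frozenset(seen)

def subset_construction(start, arcs):
    # arcs: NFA state -> list of (label, target). Returns (state sets, transition rows).
    states = [closure(start, arcs)]
    rows = []
    for nfaset in states:                    # NB: 'states' grows while we iterate
        moves = {}
        for s in nfaset:
            for label, t in arcs[s]:
                if label is not None:
                    moves.setdefault(label, set()).update(closure(t, arcs))
        row = {}
        for label, target in sorted(moves.items()):
            target = frozenset(target)
            if target not in states:
                states.append(target)
            row[label] = states.index(target)
        rows.append(row)
    return states, rows

# Example: a tiny NFA for 'a b?' -- state 0 --a--> 1, 1 --epsilon--> 2, 1 --b--> 2.
arcs = {0: [("a", 1)], 1: [(None, 2), ("b", 2)], 2: []}
states, rows = subset_construction(0, arcs)
print(rows)   # [{'a': 1}, {'b': 2}, {}]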
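
One more detail worth spelling out: Grammar.print_dfas serializes each rule's first set as a bitmap, setting bit (label & 7) of byte (label >> 3) for every label in the set and emitting the bytes as octal escapes inside a C string literal. A small sketch of that encoding, with an invented label count and first set (both are illustrative; the real values come from the generated tables):

n_labels = 16                     # illustrative; actually len(Grammar.labels)
first_set = {0, 3, 9}             # illustrative label indices

bitset = bytearray((n_labels >> 3) + 1)
for label in first_set:
    bitset[label >> 3] |= 1 << (label & 7)

print("".join("\\%03o" % byte for byte in bitset))   # prints \011\002\000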