author    | Pablo Galindo <Pablogsal@gmail.com> | 2019-03-01 23:34:44 (GMT)
committer | GitHub <noreply@github.com> | 2019-03-01 23:34:44 (GMT)
commit    | 1f24a719e7be5e49b876a5dc7daf21d01ee69faa (patch)
tree      | 8f8f56cab78ef671a8cb7f54b8ec2495d9a435e6 /Parser/pgen
parent    | 7eebbbd5b3907447eddadf5cb7cb1cc9230d15b2 (diff)
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)
Pgen is the oldest piece of technology in the CPython repository: building it requires various #if[n]def PGEN hacks in other parts of the code, and it depends more and more on CPython internals. This commit removes the old pgen C code and replaces it with a new version implemented in pure Python, a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatible with the current parser.
This commit also eliminates all the #ifdefs and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN), which can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI setup, which uses Python 3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so it is always kept in sync and duplicate token definitions no longer need to be maintained.
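For reference, the new generator can also be driven directly from Python, mirroring what the Parser/pgen/__main__.py added below does (the build normally goes through the Makefile's regen-grammar step, which runs the package as python -m Parser.pgen <grammar> <tokens> <graminit.h> <graminit.c>). A minimal sketch, assuming it runs from the CPython source root; the input and output paths here are illustrative, not mandated by this commit:

    # Sketch only: drive ParserGenerator the same way __main__.py does.
    # "Grammar/Grammar", "Grammar/Tokens" and the output paths are illustrative.
    from Parser.pgen.pgen import ParserGenerator

    p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
    g = p.make_grammar()
    with open("graminit.h", "w") as h, open("graminit.c", "w") as c:
        g.produce_graminit_h(h.write)  # one #define per non-terminal
        g.produce_graminit_c(c.write)  # DFAs, labels and the _PyParser_Grammar struct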
Diffstat (limited to 'Parser/pgen')
-rw-r--r-- | Parser/pgen/__init__.py |   0
-rw-r--r-- | Parser/pgen/__main__.py |  34
-rw-r--r-- | Parser/pgen/grammar.py  | 160
-rw-r--r-- | Parser/pgen/pgen.py     | 408
-rw-r--r-- | Parser/pgen/token.py    |  40
5 files changed, 642 insertions, 0 deletions
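Of the new files, grammar.py is the output stage: produce_graminit_h emits one #define per non-terminal and produce_graminit_c emits the DFAs, labels and the _PyParser_Grammar struct as C data. A small sketch of the header side, using a hand-built toy Grammar instance (the symbol name and number are illustrative; real entries are filled in by ParserGenerator.make_grammar, and the import assumes the CPython source root as the working directory):

    # Illustrative only: shows the shape of produce_graminit_h output.
    import io

    from Parser.pgen.grammar import Grammar

    g = Grammar()
    g.symbol2number["single_input"] = 256  # toy entry; non-terminal numbers start at 256
    g.number2symbol[256] = "single_input"
    out = io.StringIO()
    g.produce_graminit_h(out.write)
    print(out.getvalue())
    # /* Generated by Parser/pgen */
    #
    # #define single_input 256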
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/Parser/pgen/__init__.py
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
new file mode 100644
index 0000000..965b08f
--- /dev/null
+++ b/Parser/pgen/__main__.py
@@ -0,0 +1,34 @@
+import argparse
+
+from .pgen import ParserGenerator
+
+def main():
+    parser = argparse.ArgumentParser(description="Parser generator main program.")
+    parser.add_argument(
+        "grammar", type=str, help="The file with the grammar definition in EBNF format"
+    )
+    parser.add_argument(
+        "tokens", type=str, help="The file with the token definitions"
+    )
+    parser.add_argument(
+        "graminit_h",
+        type=argparse.FileType('w'),
+        help="The path to write the grammar's non-terminals as #defines",
+    )
+    parser.add_argument(
+        "graminit_c",
+        type=argparse.FileType('w'),
+        help="The path to write the grammar as initialized data",
+    )
+
+    parser.add_argument("--verbose", "-v", action="count")
+    args = parser.parse_args()
+
+    p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
+    grammar = p.make_grammar()
+    grammar.produce_graminit_h(args.graminit_h.write)
+    grammar.produce_graminit_c(args.graminit_c.write)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
new file mode 100644
index 0000000..bd405e6
--- /dev/null
+++ b/Parser/pgen/grammar.py
@@ -0,0 +1,160 @@
+import collections
+
+class Grammar:
+    """Pgen parsing tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py.  The parsing engine
+    accesses the instance variables directly.  The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses.  The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass).  The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers.  Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states        -- a list of DFAs, where each DFA is a list of
+                     states, each state is a list of arcs, and each
+                     arc is a (i, j) pair where i is a label and j is
+                     a state number.  The DFA number is the index into
+                     this list.  (This name is slightly confusing.)
+                     Final states are represented by a special arc of
+                     the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of tokens that can
+                     begin this grammar rule (represented by a dict
+                     whose values are always 1).
+
+    labels        -- a list of (x, y) pairs where x is either a token
+                     number or a symbol number, and y is either None
+                     or a string; the strings are keywords.  The label
+                     number is the index in this list; label numbers
+                     are used to mark state transitions (arcs) in the
+                     DFAs.
+
+    start         -- the number of the grammar's start symbol.
+
+    keywords      -- a dict mapping keyword strings to arc labels.
+
+    tokens        -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = collections.OrderedDict()
+        self.number2symbol = collections.OrderedDict()
+        self.states = []
+        self.dfas = collections.OrderedDict()
+        self.labels = [(0, "EMPTY")]
+        self.keywords = collections.OrderedDict()
+        self.tokens = collections.OrderedDict()
+        self.symbol2label = collections.OrderedDict()
+        self.start = 256
+
+    def produce_graminit_h(self, writer):
+        writer("/* Generated by Parser/pgen */\n\n")
+        for number, symbol in self.number2symbol.items():
+            writer("#define {} {}\n".format(symbol, number))
+
+    def produce_graminit_c(self, writer):
+        writer("/* Generated by Parser/pgen */\n\n")
+
+        writer('#include "pgenheaders.h"\n')
+        writer('#include "grammar.h"\n')
+        writer("grammar _PyParser_Grammar;\n")
+
+        self.print_dfas(writer)
+        self.print_labels(writer)
+
+        writer("grammar _PyParser_Grammar = {\n")
+        writer("    {n_dfas},\n".format(n_dfas=len(self.dfas)))
+        writer("    dfas,\n")
+        writer("    {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
+        writer("    {start_number}\n".format(start_number=self.start))
+        writer("};\n")
+
+    def print_labels(self, writer):
+        writer(
+            "static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
+        )
+        for label, name in self.labels:
+            if name is None:
+                writer("    {{{label}, 0}},\n".format(label=label))
+            else:
+                writer(
+                    '    {{{label}, "{label_name}"}},\n'.format(
+                        label=label, label_name=name
+                    )
+                )
+        writer("};\n")
+
+    def print_dfas(self, writer):
+        self.print_states(writer)
+        writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
+        for dfaindex, dfa_elem in enumerate(self.dfas.items()):
+            symbol, (dfa, first_sets) = dfa_elem
+            writer(
+                '    {{{dfa_symbol}, "{symbol_name}", '.format(
+                    dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
+                )
+                + "0, {n_states}, states_{dfa_index},\n".format(
+                    n_states=len(dfa), dfa_index=dfaindex
+                )
+            )
+            writer('     "')
+
+            k = [name for label, name in self.labels if label in first_sets]
+            bitset = bytearray((len(self.labels) >> 3) + 1)
+            for token in first_sets:
+                bitset[token >> 3] |= 1 << (token & 7)
+            for byte in bitset:
+                writer("\\%03o" % (byte & 0xFF))
+            writer('"},\n')
+        writer("};\n")
+
+    def print_states(self, write):
+        for dfaindex, dfa in enumerate(self.states):
+            self.print_arcs(write, dfaindex, dfa)
+            write(
+                "static state states_{dfa_index}[{n_states}] = {{\n".format(
+                    dfa_index=dfaindex, n_states=len(dfa)
+                )
+            )
+            for stateindex, state in enumerate(dfa):
+                narcs = len(state)
+                write(
+                    "    {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
+                        n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
+                    )
+                )
+            write("};\n")
+
+    def print_arcs(self, write, dfaindex, states):
+        for stateindex, state in enumerate(states):
+            narcs = len(state)
+            write(
+                "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
+                    dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
+                )
+            )
+            for a, b in state:
+                write(
+                    "    {{{from_label}, {to_state}}},\n".format(
+                        from_label=a, to_state=b
+                    )
+                )
+            write("};\n")
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
new file mode 100644
index 0000000..c878919
--- /dev/null
+++ b/Parser/pgen/pgen.py
@@ -0,0 +1,408 @@
+import collections
+import tokenize  # from stdlib
+
+from . import grammar, token
+
+class ParserGenerator(object):
+
+    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
+        close_stream = None
+        if stream is None:
+            stream = open(grammar_file)
+            close_stream = stream.close
+        with open(token_file) as tok_file:
+            token_lines = tok_file.readlines()
+        self.tokens = dict(token.generate_tokens(token_lines))
+        self.opmap = dict(token.generate_opmap(token_lines))
+        # Manually add <> so it does not collide with !=
+        self.opmap['<>'] = "NOTEQUAL"
+        self.verbose = verbose
+        self.filename = grammar_file
+        self.stream = stream
+        self.generator = tokenize.generate_tokens(stream.readline)
+        self.gettoken()  # Initialize lookahead
+        self.dfas, self.startsymbol = self.parse()
+        if close_stream is not None:
+            close_stream()
+        self.first = {}  # map from symbol name to set of tokens
+        self.addfirstsets()
+
+    def make_grammar(self):
+        c = grammar.Grammar()
+        names = list(self.dfas.keys())
+        names.remove(self.startsymbol)
+        names.insert(0, self.startsymbol)
+        for name in names:
+            i = 256 + len(c.symbol2number)
+            c.symbol2number[name] = i
+            c.number2symbol[i] = name
+        for name in names:
+            self.make_label(c, name)
+            dfa = self.dfas[name]
+            states = []
+            for state in dfa:
+                arcs = []
+                for label, next in sorted(state.arcs.items()):
+                    arcs.append((self.make_label(c, label), dfa.index(next)))
+                if state.isfinal:
+                    arcs.append((0, dfa.index(state)))
+                states.append(arcs)
+            c.states.append(states)
+            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+        c.start = c.symbol2number[self.startsymbol]
+
+        if self.verbose:
+            print("")
+            print("Grammar summary")
+            print("===============")
+
+            print("- {n_labels} labels".format(n_labels=len(c.labels)))
+            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
+            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
+            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
+            print(
+                "- Start symbol: {start_symbol}".format(
+                    start_symbol=c.number2symbol[c.start]
+                )
+            )
+        return c
+
+    def make_first(self, c, name):
+        rawfirst = self.first[name]
+        first = set()
+        for label in sorted(rawfirst):
+            ilabel = self.make_label(c, label)
+            ##assert ilabel not in first # XXX failed on <> ... !=
+            first.add(ilabel)
+        return first
+
+    def make_label(self, c, label):
+        # XXX Maybe this should be a method on a subclass of converter?
+        ilabel = len(c.labels)
+        if label[0].isalpha():
+            # Either a symbol name or a named token
+            if label in c.symbol2number:
+                # A symbol name (a non-terminal)
+                if label in c.symbol2label:
+                    return c.symbol2label[label]
+                else:
+                    c.labels.append((c.symbol2number[label], None))
+                    c.symbol2label[label] = ilabel
+                    return ilabel
+            else:
+                # A named token (NAME, NUMBER, STRING)
+                itoken = self.tokens.get(label, None)
+                assert isinstance(itoken, int), label
+                assert itoken in self.tokens.values(), label
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+        else:
+            # Either a keyword or an operator
+            assert label[0] in ('"', "'"), label
+            value = eval(label)
+            if value[0].isalpha():
+                # A keyword
+                if value in c.keywords:
+                    return c.keywords[value]
+                else:
+                    c.labels.append((self.tokens["NAME"], value))
+                    c.keywords[value] = ilabel
+                    return ilabel
+            else:
+                # An operator (any non-numeric token)
+                tok_name = self.opmap[value]  # Fails if unknown token
+                itoken = self.tokens[tok_name]
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+
+    def addfirstsets(self):
+        names = list(self.dfas.keys())
+        for name in names:
+            if name not in self.first:
+                self.calcfirst(name)
+
+            if self.verbose:
+                print("First set for {dfa_name}".format(dfa_name=name))
+                for item in self.first[name]:
+                    print("    - {terminal}".format(terminal=item))
+
+    def calcfirst(self, name):
+        dfa = self.dfas[name]
+        self.first[name] = None  # dummy to detect left recursion
+        state = dfa[0]
+        totalset = set()
+        overlapcheck = {}
+        for label, next in state.arcs.items():
+            if label in self.dfas:
+                if label in self.first:
+                    fset = self.first[label]
+                    if fset is None:
+                        raise ValueError("recursion for rule %r" % name)
+                else:
+                    self.calcfirst(label)
+                    fset = self.first[label]
+                totalset.update(fset)
+                overlapcheck[label] = fset
+            else:
+                totalset.add(label)
+                overlapcheck[label] = {label}
+        inverse = {}
+        for label, itsfirst in overlapcheck.items():
+            for symbol in itsfirst:
+                if symbol in inverse:
+                    raise ValueError("rule %s is ambiguous; %s is in the"
+                                     " first sets of %s as well as %s" %
+                                     (name, symbol, label, inverse[symbol]))
+                inverse[symbol] = label
+        self.first[name] = totalset
+
+    def parse(self):
+        dfas = collections.OrderedDict()
+        startsymbol = None
+        # MSTART: (NEWLINE | RULE)* ENDMARKER
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
+                self.gettoken()
+            # RULE: NAME ':' RHS NEWLINE
+            name = self.expect(tokenize.NAME)
+            if self.verbose:
+                print("Processing rule {dfa_name}".format(dfa_name=name))
+            self.expect(tokenize.OP, ":")
+            a, z = self.parse_rhs()
+            self.expect(tokenize.NEWLINE)
+            if self.verbose:
+                self.dump_nfa(name, a, z)
+            dfa = self.make_dfa(a, z)
+            if self.verbose:
+                self.dump_dfa(name, dfa)
+            oldlen = len(dfa)
+            self.simplify_dfa(dfa)
+            newlen = len(dfa)
+            dfas[name] = dfa
+            #print name, oldlen, newlen
+            if startsymbol is None:
+                startsymbol = name
+        return dfas, startsymbol
+
+    def make_dfa(self, start, finish):
+        # To turn an NFA into a DFA, we define the states of the DFA
+        # to correspond to *sets* of states of the NFA.  Then do some
+        # state reduction.  Let's represent sets as dicts with 1 for
+        # values.
+        assert isinstance(start, NFAState)
+        assert isinstance(finish, NFAState)
+        def closure(state):
+            base = set()
+            addclosure(state, base)
+            return base
+        def addclosure(state, base):
+            assert isinstance(state, NFAState)
+            if state in base:
+                return
+            base.add(state)
+            for label, next in state.arcs:
+                if label is None:
+                    addclosure(next, base)
+        states = [DFAState(closure(start), finish)]
+        for state in states:  # NB states grows while we're iterating
+            arcs = {}
+            for nfastate in state.nfaset:
+                for label, next in nfastate.arcs:
+                    if label is not None:
+                        addclosure(next, arcs.setdefault(label, set()))
+            for label, nfaset in sorted(arcs.items()):
+                for st in states:
+                    if st.nfaset == nfaset:
+                        break
+                else:
+                    st = DFAState(nfaset, finish)
+                    states.append(st)
+                state.addarc(st, label)
+        return states  # List of DFAState instances; first one is start
+
+    def dump_nfa(self, name, start, finish):
+        print("Dump of NFA for", name)
+        todo = [start]
+        for i, state in enumerate(todo):
+            print("  State", i, state is finish and "(final)" or "")
+            for label, next in state.arcs:
+                if next in todo:
+                    j = todo.index(next)
+                else:
+                    j = len(todo)
+                    todo.append(next)
+                if label is None:
+                    print("    -> %d" % j)
+                else:
+                    print("    %s -> %d" % (label, j))
+
+    def dump_dfa(self, name, dfa):
+        print("Dump of DFA for", name)
+        for i, state in enumerate(dfa):
+            print("  State", i, state.isfinal and "(final)" or "")
+            for label, next in sorted(state.arcs.items()):
+                print("    %s -> %d" % (label, dfa.index(next)))
+
+    def simplify_dfa(self, dfa):
+        # This is not theoretically optimal, but works well enough.
+        # Algorithm: repeatedly look for two states that have the same
+        # set of arcs (same labels pointing to the same nodes) and
+        # unify them, until things stop changing.
+
+        # dfa is a list of DFAState instances
+        changes = True
+        while changes:
+            changes = False
+            for i, state_i in enumerate(dfa):
+                for j in range(i+1, len(dfa)):
+                    state_j = dfa[j]
+                    if state_i == state_j:
+                        #print "  unify", i, j
+                        del dfa[j]
+                        for state in dfa:
+                            state.unifystate(state_j, state_i)
+                        changes = True
+                        break
+
+    def parse_rhs(self):
+        # RHS: ALT ('|' ALT)*
+        a, z = self.parse_alt()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState()
+            zz = NFAState()
+            aa.addarc(a)
+            z.addarc(zz)
+            while self.value == "|":
+                self.gettoken()
+                a, z = self.parse_alt()
+                aa.addarc(a)
+                z.addarc(zz)
+            return aa, zz
+
+    def parse_alt(self):
+        # ALT: ITEM+
+        a, b = self.parse_item()
+        while (self.value in ("(", "[") or
+               self.type in (tokenize.NAME, tokenize.STRING)):
+            c, d = self.parse_item()
+            b.addarc(c)
+            b = d
+        return a, b
+
+    def parse_item(self):
+        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
+        if self.value == "[":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(tokenize.OP, "]")
+            a.addarc(z)
+            return a, z
+        else:
+            a, z = self.parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self.gettoken()
+            z.addarc(a)
+            if value == "+":
+                return a, z
+            else:
+                return a, a
+
+    def parse_atom(self):
+        # ATOM: '(' RHS ')' | NAME | STRING
+        if self.value == "(":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(tokenize.OP, ")")
+            return a, z
+        elif self.type in (tokenize.NAME, tokenize.STRING):
+            a = NFAState()
+            z = NFAState()
+            a.addarc(z, self.value)
+            self.gettoken()
+            return a, z
+        else:
+            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
+                             self.type, self.value)
+
+    def expect(self, type, value=None):
+        if self.type != type or (value is not None and self.value != value):
+            self.raise_error("expected %s/%s, got %s/%s",
+                             type, value, self.type, self.value)
+        value = self.value
+        self.gettoken()
+        return value
+
+    def gettoken(self):
+        tup = next(self.generator)
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = next(self.generator)
+        self.type, self.value, self.begin, self.end, self.line = tup
+        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
+
+    def raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg % args
+            except:
+                msg = " ".join([msg] + list(map(str, args)))
+        raise SyntaxError(msg, (self.filename, self.end[0],
+                                self.end[1], self.line))
+
+class NFAState(object):
+
+    def __init__(self):
+        self.arcs = []  # list of (label, NFAState) pairs
+
+    def addarc(self, next, label=None):
+        assert label is None or isinstance(label, str)
+        assert isinstance(next, NFAState)
+        self.arcs.append((label, next))
+
+class DFAState(object):
+
+    def __init__(self, nfaset, final):
+        assert isinstance(nfaset, set)
+        assert isinstance(next(iter(nfaset)), NFAState)
+        assert isinstance(final, NFAState)
+        self.nfaset = nfaset
+        self.isfinal = final in nfaset
+        self.arcs = {}  # map from label to DFAState
+
+    def addarc(self, next, label):
+        assert isinstance(label, str)
+        assert label not in self.arcs
+        assert isinstance(next, DFAState)
+        self.arcs[label] = next
+
+    def unifystate(self, old, new):
+        for label, next in self.arcs.items():
+            if next is old:
+                self.arcs[label] = new
+
+    def __eq__(self, other):
+        # Equality test -- ignore the nfaset instance variable
+        assert isinstance(other, DFAState)
+        if self.isfinal != other.isfinal:
+            return False
+        # Can't just return self.arcs == other.arcs, because that
+        # would invoke this method recursively, with cycles...
+        if len(self.arcs) != len(other.arcs):
+            return False
+        for label, next in self.arcs.items():
+            if next is not other.arcs.get(label):
+                return False
+        return True
+
+    __hash__ = None  # For Py3 compatibility.
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
new file mode 100644
index 0000000..f9d45c4
--- /dev/null
+++ b/Parser/pgen/token.py
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+    numbers = itertools.count(0)
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        name = line.split()[0]
+        yield (name, next(numbers))
+
+    yield ('N_TOKENS', next(numbers))
+    yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        pieces = line.split()
+
+        if len(pieces) != 2:
+            continue
+
+        name, op = pieces
+        yield (op.strip("'"), name)
+
+    # Yield independently <>. This is needed so it does not collide
+    # with the token generation in "generate_tokens" because if this
+    # symbol is included in Grammar/Tokens, it will collide with !=
+    # as it has the same name (NOTEQUAL).
+    yield ('<>', 'NOTEQUAL')
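The token helpers above read Grammar/Tokens line by line: generate_tokens numbers every token name from 0 and appends N_TOKENS and NT_OFFSET, while generate_opmap maps the literal operator text back to its token name (plus the special-cased <>). A quick sketch with a few sample lines in the style of Grammar/Tokens (the sample is illustrative, not the full file, and the import assumes the CPython source root as the working directory):

    # Illustrative only: feed a few Tokens-style lines to the new helpers.
    from Parser.pgen import token

    sample = [
        "ENDMARKER",
        "NAME",
        "LPAR '('",
        "NOTEQUAL '!='",
    ]
    print(dict(token.generate_tokens(sample)))
    # {'ENDMARKER': 0, 'NAME': 1, 'LPAR': 2, 'NOTEQUAL': 3, 'N_TOKENS': 4, 'NT_OFFSET': 256}
    print(dict(token.generate_opmap(sample)))
    # {'(': 'LPAR', '!=': 'NOTEQUAL', '<>': 'NOTEQUAL'}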