import collections


class Grammar:
    """Container for pgen parsing tables, with emitters for the C sources.

    Instance attributes:

    symbol2number -- dict from symbol name to symbol number.  Symbol
                     numbers are always 256 or higher so they cannot
                     collide with token numbers, which lie in 0..255
                     (inclusive).

    number2symbol -- dict from symbol number to symbol name; the inverse
                     of symbol2number.

    states        -- list of DFAs.  A DFA is a list of states, a state is
                     a list of arcs, and an arc is an (i, j) pair with i
                     a label and j a state number.  The DFA number is the
                     index into this list (the attribute name is slightly
                     confusing).  A final state carries a special arc
                     (0, j) where j is the state's own number.

    dfas          -- dict from symbol number to a (DFA, first) pair,
                     where DFA is an item of the states list and first is
                     the set of tokens that can begin the grammar rule.

    labels        -- list of (x, y) pairs; x is a token number or a
                     symbol number and y is None or a keyword string.
                     The label number is the index into this list and is
                     what arcs use to mark state transitions.

    start         -- number of the grammar's start symbol.

    keywords      -- dict from keyword string to arc label.

    tokens        -- dict from token number to arc label.

    symbol2label  -- dict, created empty and not filled in by this class.
                     NOTE(review): presumably maps symbol names to label
                     numbers -- confirm against the code that populates it.
    """

    def __init__(self):
        """Create an empty grammar holding only the EMPTY label."""
        new_table = collections.OrderedDict
        self.symbol2number = new_table()
        self.number2symbol = new_table()
        self.states = []
        self.dfas = new_table()
        self.labels = [(0, "EMPTY")]
        self.keywords = new_table()
        self.tokens = new_table()
        self.symbol2label = new_table()
        self.start = 256

    def produce_graminit_h(self, writer):
        """Emit the header text: one ``#define`` per known symbol.

        writer is called once per chunk of text, in output order.
        """
        writer("/* Generated by Parser/pgen */\n\n")
        for symbol_number, symbol_name in self.number2symbol.items():
            writer("#define {} {}\n".format(symbol_name, symbol_number))

    def produce_graminit_c(self, writer):
        """Emit the C source text defining ``_PyParser_Grammar``.

        The output is the include prologue, then the arc/state/dfa tables,
        then the label table, and finally the grammar initialiser itself.
        writer is called once per chunk of text, in output order.
        """
        prologue = (
            "/* Generated by Parser/pgen */\n\n",
            '#include "pgenheaders.h"\n',
            '#include "grammar.h"\n',
            "grammar _PyParser_Grammar;\n",
        )
        for chunk in prologue:
            writer(chunk)

        self.print_dfas(writer)
        self.print_labels(writer)

        epilogue = (
            "grammar _PyParser_Grammar = {\n",
            " {n_dfas},\n".format(n_dfas=len(self.dfas)),
            " dfas,\n",
            " {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)),
            " {start_number}\n".format(start_number=self.start),
            "};\n",
        )
        for chunk in epilogue:
            writer(chunk)

    def print_labels(self, writer):
        """Emit the ``labels`` array, one initialiser per entry of self.labels."""
        label_count = len(self.labels)
        writer("static label labels[{n_labels}] = {{\n".format(n_labels=label_count))
        for label, name in self.labels:
            if name is None:
                # A label without a keyword string is emitted as a literal 0.
                label_name = 0
            else:
                label_name = '"{}"'.format(name)
            writer(
                ' {{{label}, {label_name}}},\n'.format(
                    label=label, label_name=label_name
                )
            )
        writer("};\n")

    def print_dfas(self, writer):
        """Emit the arc and state tables, then the ``dfas`` array.

        Each ``dfas`` entry holds the symbol number, the symbol name, the
        number of states, a reference to ``states_<index>`` and the first
        set rendered as a string of octal escapes (one per bitset byte).

        NOTE(review): ``<index>`` is the entry's position in self.dfas, so
        this relies on self.dfas being ordered like self.states -- confirm
        against the code that fills both.
        """
        self.print_states(writer)
        writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))

        # One bit per label number, rounded up with one spare byte.
        bitset_size = (len(self.labels) >> 3) + 1

        for position, (symbol, (dfa, first_sets)) in enumerate(self.dfas.items()):
            entry_head = ' {{{dfa_symbol}, "{symbol_name}", '.format(
                dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
            )
            entry_states = "{n_states}, states_{dfa_index},\n".format(
                n_states=len(dfa), dfa_index=position
            )
            writer(entry_head + entry_states + ' "')

            bitset = bytearray(bitset_size)
            for token in first_sets:
                # Bit (token % 8) of byte (token // 8) marks the token.
                byte_pos, bit_pos = divmod(token, 8)
                bitset[byte_pos] |= 1 << bit_pos
            for octet in bitset:
                # bytearray items are already within 0..255.
                writer("\\%03o" % octet)
            writer('"},\n')
        writer("};\n")

    def print_states(self, write):
        """Emit, for every DFA in self.states, its arc tables and state table."""
        for dfa_number, dfa in enumerate(self.states):
            # The arc arrays must come first: the state table refers to them.
            self.print_arcs(write, dfa_number, dfa)
            write(
                "static state states_{dfa_index}[{n_states}] = {{\n".format(
                    dfa_index=dfa_number, n_states=len(dfa)
                )
            )
            for state_number, arcs in enumerate(dfa):
                write(
                    " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
                        n_arcs=len(arcs),
                        dfa_index=dfa_number,
                        state_index=state_number,
                    )
                )
            write("};\n")

    def print_arcs(self, write, dfaindex, states):
        """Emit one ``arcs_<dfaindex>_<state>`` array per state in states.

        Each state is an iterable of (label, target state) pairs.
        """
        for state_number, arcs in enumerate(states):
            write(
                "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
                    dfa_index=dfaindex, state_index=state_number, n_arcs=len(arcs)
                )
            )
            for label, target in arcs:
                write(
                    " {{{from_label}, {to_state}}},\n".format(
                        from_label=label, to_state=target
                    )
                )
            write("};\n")