Diffstat (limited to 'Parser/pgen')
-rw-r--r-- | Parser/pgen/__init__.py   |   0
-rw-r--r-- | Parser/pgen/__main__.py   |  43
-rw-r--r-- | Parser/pgen/automata.py   | 400
-rw-r--r-- | Parser/pgen/grammar.py    | 147
-rw-r--r-- | Parser/pgen/keywordgen.py |  59
-rw-r--r-- | Parser/pgen/metaparser.py | 152
-rw-r--r-- | Parser/pgen/pgen.py       | 310
-rw-r--r-- | Parser/pgen/token.py      |  38
8 files changed, 0 insertions, 1149 deletions
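For orientation before the raw diff: the removed package was a pure-Python parser generator whose entry point, Parser/pgen/__main__.py (deleted below), read Grammar/Grammar and Grammar/Tokens and wrote the LL(1) parser's tables (graminit.h / graminit.c). A minimal sketch of that flow, assuming a checkout from before this commit run from the source root; the output file names here are illustrative placeholders, not the build's real targets:

    # Roughly what Parser/pgen/__main__.py did (see its diff below).
    # Assumes a pre-removal checkout, run from the CPython source root.
    from Parser.pgen.pgen import ParserGenerator

    p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=False)
    grammar = p.make_grammar()

    # Illustrative output paths; __main__.py took these as command-line arguments.
    with open("graminit.h", "w") as graminit_h, open("graminit.c", "w") as graminit_c:
        grammar.produce_graminit_h(graminit_h.write)  # the grammar's non-terminals as #defines
        grammar.produce_graminit_c(graminit_c.write)  # DFAs, labels and first sets as C data

The same ParserGenerator also backed Parser/pgen/keywordgen.py, which regenerated Lib/keyword.py from grammar.keywords.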
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/Parser/pgen/__init__.py +++ /dev/null diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py deleted file mode 100644 index d3780a7..0000000 --- a/Parser/pgen/__main__.py +++ /dev/null @@ -1,43 +0,0 @@ -import argparse - -from .pgen import ParserGenerator - - -def main(): - parser = argparse.ArgumentParser(description="Parser generator main program.") - parser.add_argument( - "grammar", type=str, help="The file with the grammar definition in EBNF format" - ) - parser.add_argument("tokens", type=str, help="The file with the token definitions") - parser.add_argument( - "graminit_h", - type=argparse.FileType("w"), - help="The path to write the grammar's non-terminals as #defines", - ) - parser.add_argument( - "graminit_c", - type=argparse.FileType("w"), - help="The path to write the grammar as initialized data", - ) - - parser.add_argument("--verbose", "-v", action="count") - parser.add_argument( - "--graph", - type=argparse.FileType("w"), - action="store", - metavar="GRAPH_OUTPUT_FILE", - help="Dumps a DOT representation of the generated automata in a file", - ) - - args = parser.parse_args() - - p = ParserGenerator( - args.grammar, args.tokens, verbose=args.verbose, graph_file=args.graph - ) - grammar = p.make_grammar() - grammar.produce_graminit_h(args.graminit_h.write) - grammar.produce_graminit_c(args.graminit_c.write) - - -if __name__ == "__main__": - main() diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py deleted file mode 100644 index f2ed221..0000000 --- a/Parser/pgen/automata.py +++ /dev/null @@ -1,400 +0,0 @@ -"""Classes representing state-machine concepts""" - -class NFA: - """A non deterministic finite automata - - A non deterministic automata is a form of a finite state - machine. An NFA's rules are less restrictive than a DFA. - The NFA rules are: - - * A transition can be non-deterministic and can result in - nothing, one, or two or more states. - - * An epsilon transition consuming empty input is valid. - Transitions consuming labeled symbols are also permitted. - - This class assumes that there is only one starting state and one - accepting (ending) state. - - Attributes: - name (str): The name of the rule the NFA is representing. - start (NFAState): The starting state. 
- end (NFAState): The ending state - """ - - def __init__(self, start, end): - self.name = start.rule_name - self.start = start - self.end = end - - def __repr__(self): - return "NFA(start={}, end={})".format(self.start, self.end) - - def dump(self, writer=print): - """Dump a graphical representation of the NFA""" - todo = [self.start] - for i, state in enumerate(todo): - writer(" State", i, state is self.end and "(final)" or "") - for arc in state.arcs: - label = arc.label - next = arc.target - if next in todo: - j = todo.index(next) - else: - j = len(todo) - todo.append(next) - if label is None: - writer(" -> %d" % j) - else: - writer(" %s -> %d" % (label, j)) - - def dump_graph(self, writer): - """Dump a DOT representation of the NFA""" - writer('digraph %s_nfa {\n' % self.name) - todo = [self.start] - for i, state in enumerate(todo): - writer(' %d [label="State %d %s"];\n' % (i, i, state is self.end and "(final)" or "")) - for arc in state.arcs: - label = arc.label - next = arc.target - if next in todo: - j = todo.index(next) - else: - j = len(todo) - todo.append(next) - if label is None: - writer(" %d -> %d [style=dotted label=ε];\n" % (i, j)) - else: - writer(" %d -> %d [label=%s];\n" % (i, j, label.replace("'", '"'))) - writer('}\n') - - -class NFAArc: - """An arc representing a transition between two NFA states. - - NFA states can be connected via two ways: - - * A label transition: An input equal to the label must - be consumed to perform the transition. - * An epsilon transition: The transition can be taken without - consuming any input symbol. - - Attributes: - target (NFAState): The ending state of the transition arc. - label (Optional[str]): The label that must be consumed to make - the transition. An epsilon transition is represented - using `None`. - """ - - def __init__(self, target, label): - self.target = target - self.label = label - - def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self.label) - - -class NFAState: - """A state of a NFA, non deterministic finite automata. - - Attributes: - target (rule_name): The name of the rule used to represent the NFA's - ending state after a transition. - arcs (Dict[Optional[str], NFAState]): A mapping representing transitions - between the current NFA state and another NFA state via following - a label. - """ - - def __init__(self, rule_name): - self.rule_name = rule_name - self.arcs = [] - - def add_arc(self, target, label=None): - """Add a new arc to connect the state to a target state within the NFA - - The method adds a new arc to the list of arcs available as transitions - from the present state. An optional label indicates a named transition - that consumes an input while the absence of a label represents an epsilon - transition. - - Attributes: - target (NFAState): The end of the transition that the arc represents. - label (Optional[str]): The label that must be consumed for making - the transition. If the label is not provided the transition is assumed - to be an epsilon-transition. 
- """ - assert label is None or isinstance(label, str) - assert isinstance(target, NFAState) - self.arcs.append(NFAArc(target, label)) - - def __repr__(self): - return "<%s: from %s>" % (self.__class__.__name__, self.rule_name) - - -class DFA: - """A deterministic finite automata - - A deterministic finite automata is a form of a finite state machine - that obeys the following rules: - - * Each of the transitions is uniquely determined by - the source state and input symbol - * Reading an input symbol is required for each state - transition (no epsilon transitions). - - The finite-state machine will accept or reject a string of symbols - and only produces a unique computation of the automaton for each input - string. The DFA must have a unique starting state (represented as the first - element in the list of states) but can have multiple final states. - - Attributes: - name (str): The name of the rule the DFA is representing. - states (List[DFAState]): A collection of DFA states. - """ - - def __init__(self, name, states): - self.name = name - self.states = states - - @classmethod - def from_nfa(cls, nfa): - """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm. - - To simulate the operation of a DFA on a given input string, it's - necessary to keep track of a single state at any time, or more precisely, - the state that the automaton will reach after seeing a prefix of the - input. In contrast, to simulate an NFA, it's necessary to keep track of - a set of states: all of the states that the automaton could reach after - seeing the same prefix of the input, according to the nondeterministic - choices made by the automaton. There are two possible sources of - non-determinism: - - 1) Multiple (one or more) transitions with the same label - - 'A' +-------+ - +----------->+ State +----------->+ - | | 2 | - +-------+ +-------+ - | State | - | 1 | +-------+ - +-------+ | State | - +----------->+ 3 +----------->+ - 'A' +-------+ - - 2) Epsilon transitions (transitions that can be taken without consuming any input) - - +-------+ +-------+ - | State | ε | State | - | 1 +----------->+ 2 +----------->+ - +-------+ +-------+ - - Looking at the first case above, we can't determine which transition should be - followed when given an input A. We could choose whether or not to follow the - transition while in the second case the problem is that we can choose both to - follow the transition or not doing it. To solve this problem we can imagine that - we follow all possibilities at the same time and we construct new states from the - set of all possible reachable states. For every case in the previous example: - - - 1) For multiple transitions with the same label we colapse all of the - final states under the same one - - +-------+ +-------+ - | State | 'A' | State | - | 1 +----------->+ 2-3 +----------->+ - +-------+ +-------+ - - 2) For epsilon transitions we collapse all epsilon-reachable states - into the same one - - +-------+ - | State | - | 1-2 +-----------> - +-------+ - - Because the DFA states consist of sets of NFA states, an n-state NFA - may be converted to a DFA with at most 2**n states. Notice that the - constructed DFA is not minimal and can be simplified or reduced - afterwards. - - Parameters: - name (NFA): The NFA to transform to DFA. 
- """ - assert isinstance(nfa, NFA) - - def add_closure(nfa_state, base_nfa_set): - """Calculate the epsilon-closure of a given state - - Add to the *base_nfa_set* all the states that are - reachable from *nfa_state* via epsilon-transitions. - """ - assert isinstance(nfa_state, NFAState) - if nfa_state in base_nfa_set: - return - base_nfa_set.add(nfa_state) - for nfa_arc in nfa_state.arcs: - if nfa_arc.label is None: - add_closure(nfa_arc.target, base_nfa_set) - - # Calculate the epsilon-closure of the starting state - base_nfa_set = set() - add_closure(nfa.start, base_nfa_set) - - # Start by visiting the NFA starting state (there is only one). - states = [DFAState(nfa.name, base_nfa_set, nfa.end)] - - for state in states: # NB states grow while we're iterating - - # Find transitions from the current state to other reachable states - # and store them in mapping that correlates the label to all the - # possible reachable states that can be obtained by consuming a - # token equal to the label. Each set of all the states that can - # be reached after following a label will be the a DFA state. - arcs = {} - for nfa_state in state.nfa_set: - for nfa_arc in nfa_state.arcs: - if nfa_arc.label is not None: - nfa_set = arcs.setdefault(nfa_arc.label, set()) - # All states that can be reached by epsilon-transitions - # are also included in the set of reachable states. - add_closure(nfa_arc.target, nfa_set) - - # Now create new DFAs by visiting all posible transitions between - # the current DFA state and the new power-set states (each nfa_set) - # via the different labels. As the nodes are appended to *states* this - # is performing a breadth-first search traversal over the power-set of - # the states of the original NFA. - for label, nfa_set in sorted(arcs.items()): - for exisisting_state in states: - if exisisting_state.nfa_set == nfa_set: - # The DFA state already exists for this rule. - next_state = exisisting_state - break - else: - next_state = DFAState(nfa.name, nfa_set, nfa.end) - states.append(next_state) - - # Add a transition between the current DFA state and the new - # DFA state (the power-set state) via the current label. - state.add_arc(next_state, label) - - return cls(nfa.name, states) - - def __iter__(self): - return iter(self.states) - - def simplify(self): - """Attempt to reduce the number of states of the DFA - - Transform the DFA into an equivalent DFA that has fewer states. Two - classes of states can be removed or merged from the original DFA without - affecting the language it accepts to minimize it: - - * Unreachable states can not be reached from the initial - state of the DFA, for any input string. - * Nondistinguishable states are those that cannot be distinguished - from one another for any input string. - - This algorithm does not achieve the optimal fully-reduced solution, but it - works well enough for the particularities of the Python grammar. The - algorithm repeatedly looks for two states that have the same set of - arcs (same labels pointing to the same nodes) and unifies them, until - things stop changing. 
- """ - changes = True - while changes: - changes = False - for i, state_i in enumerate(self.states): - for j in range(i + 1, len(self.states)): - state_j = self.states[j] - if state_i == state_j: - del self.states[j] - for state in self.states: - state.unifystate(state_j, state_i) - changes = True - break - - def dump(self, writer=print): - """Dump a graphical representation of the DFA""" - for i, state in enumerate(self.states): - writer(" State", i, state.is_final and "(final)" or "") - for label, next in sorted(state.arcs.items()): - writer(" %s -> %d" % (label, self.states.index(next))) - - def dump_graph(self, writer): - """Dump a DOT representation of the DFA""" - writer('digraph %s_dfa {\n' % self.name) - for i, state in enumerate(self.states): - writer(' %d [label="State %d %s"];\n' % (i, i, state.is_final and "(final)" or "")) - for label, next in sorted(state.arcs.items()): - writer(" %d -> %d [label=%s];\n" % (i, self.states.index(next), label.replace("'", '"'))) - writer('}\n') - - -class DFAState(object): - """A state of a DFA - - Attributes: - rule_name (rule_name): The name of the DFA rule containing the represented state. - nfa_set (Set[NFAState]): The set of NFA states used to create this state. - final (bool): True if the state represents an accepting state of the DFA - containing this state. - arcs (Dict[label, DFAState]): A mapping representing transitions between - the current DFA state and another DFA state via following a label. - """ - - def __init__(self, rule_name, nfa_set, final): - assert isinstance(nfa_set, set) - assert isinstance(next(iter(nfa_set)), NFAState) - assert isinstance(final, NFAState) - self.rule_name = rule_name - self.nfa_set = nfa_set - self.arcs = {} # map from terminals/nonterminals to DFAState - self.is_final = final in nfa_set - - def add_arc(self, target, label): - """Add a new arc to the current state. - - Parameters: - target (DFAState): The DFA state at the end of the arc. - label (str): The label respresenting the token that must be consumed - to perform this transition. - """ - assert isinstance(label, str) - assert label not in self.arcs - assert isinstance(target, DFAState) - self.arcs[label] = target - - def unifystate(self, old, new): - """Replace all arcs from the current node to *old* with *new*. - - Parameters: - old (DFAState): The DFA state to remove from all existing arcs. - new (DFAState): The DFA state to replace in all existing arcs. - """ - for label, next_ in self.arcs.items(): - if next_ is old: - self.arcs[label] = new - - def __eq__(self, other): - # The nfa_set does not matter for equality - assert isinstance(other, DFAState) - if self.is_final != other.is_final: - return False - # We cannot just return self.arcs == other.arcs because that - # would invoke this method recursively if there are any cycles. - if len(self.arcs) != len(other.arcs): - return False - for label, next_ in self.arcs.items(): - if next_ is not other.arcs.get(label): - return False - return True - - __hash__ = None # For Py3 compatibility. - - def __repr__(self): - return "<%s: %s is_final=%s>" % ( - self.__class__.__name__, - self.rule_name, - self.is_final, - ) diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py deleted file mode 100644 index ce40e16..0000000 --- a/Parser/pgen/grammar.py +++ /dev/null @@ -1,147 +0,0 @@ -import collections - - -class Grammar: - """Pgen parsing tables class. - - The instance variables are as follows: - - symbol2number -- a dict mapping symbol names to numbers. 
Symbol - numbers are always 256 or higher, to distinguish - them from token numbers, which are between 0 and - 255 (inclusive). - - number2symbol -- a dict mapping numbers to symbol names; - these two are each other's inverse. - - states -- a list of DFAs, where each DFA is a list of - states, each state is a list of arcs, and each - arc is a (i, j) pair where i is a label and j is - a state number. The DFA number is the index into - this list. (This name is slightly confusing.) - Final states are represented by a special arc of - the form (0, j) where j is its own state number. - - dfas -- a dict mapping symbol numbers to (DFA, first) - pairs, where DFA is an item from the states list - above, and first is a set of tokens that can - begin this grammar rule. - - labels -- a list of (x, y) pairs where x is either a token - number or a symbol number, and y is either None - or a string; the strings are keywords. The label - number is the index in this list; label numbers - are used to mark state transitions (arcs) in the - DFAs. - - start -- the number of the grammar's start symbol. - - keywords -- a dict mapping keyword strings to arc labels. - - tokens -- a dict mapping token numbers to arc labels. - - """ - - def __init__(self): - self.symbol2number = collections.OrderedDict() - self.number2symbol = collections.OrderedDict() - self.states = [] - self.dfas = collections.OrderedDict() - self.labels = [(0, "EMPTY")] - self.keywords = collections.OrderedDict() - self.tokens = collections.OrderedDict() - self.symbol2label = collections.OrderedDict() - self.start = 256 - - def produce_graminit_h(self, writer): - writer("/* Generated by Parser/pgen */\n\n") - for number, symbol in self.number2symbol.items(): - writer("#define {} {}\n".format(symbol, number)) - - def produce_graminit_c(self, writer): - writer("/* Generated by Parser/pgen */\n\n") - - writer('#include "exports.h"\n') - writer('#include "grammar.h"\n') - writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar;\n") - - self.print_dfas(writer) - self.print_labels(writer) - - writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar = {\n") - writer(" {n_dfas},\n".format(n_dfas=len(self.dfas))) - writer(" dfas,\n") - writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels))) - writer(" {start_number}\n".format(start_number=self.start)) - writer("};\n") - - def print_labels(self, writer): - writer( - "static const label labels[{n_labels}] = {{\n".format( - n_labels=len(self.labels) - ) - ) - for label, name in self.labels: - label_name = '"{}"'.format(name) if name is not None else 0 - writer( - " {{{label}, {label_name}}},\n".format( - label=label, label_name=label_name - ) - ) - writer("};\n") - - def print_dfas(self, writer): - self.print_states(writer) - writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas))) - for dfaindex, dfa_elem in enumerate(self.dfas.items()): - symbol, (dfa, first_sets) = dfa_elem - writer( - ' {{{dfa_symbol}, "{symbol_name}", '.format( - dfa_symbol=symbol, symbol_name=self.number2symbol[symbol] - ) - + "{n_states}, states_{dfa_index},\n".format( - n_states=len(dfa), dfa_index=dfaindex - ) - + ' "' - ) - - bitset = bytearray((len(self.labels) >> 3) + 1) - for token in first_sets: - bitset[token >> 3] |= 1 << (token & 7) - for byte in bitset: - writer("\\%03o" % (byte & 0xFF)) - writer('"},\n') - writer("};\n") - - def print_states(self, write): - for dfaindex, dfa in enumerate(self.states): - self.print_arcs(write, dfaindex, dfa) - write( - "static state states_{dfa_index}[{n_states}] = 
{{\n".format( - dfa_index=dfaindex, n_states=len(dfa) - ) - ) - for stateindex, state in enumerate(dfa): - narcs = len(state) - write( - " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format( - n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex - ) - ) - write("};\n") - - def print_arcs(self, write, dfaindex, states): - for stateindex, state in enumerate(states): - narcs = len(state) - write( - "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format( - dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs - ) - ) - for a, b in state: - write( - " {{{from_label}, {to_state}}},\n".format( - from_label=a, to_state=b - ) - ) - write("};\n") diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py deleted file mode 100644 index f0234a8..0000000 --- a/Parser/pgen/keywordgen.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen""" - -import argparse - -from .pgen import ParserGenerator - -TEMPLATE = r''' -"""Keywords (from "Grammar/Grammar") - -This file is automatically generated; please don't muck it up! - -To update the symbols in this file, 'cd' to the top directory of -the python source tree and run: - - python3 -m Parser.pgen.keywordgen Grammar/Grammar \ - Grammar/Tokens \ - Lib/keyword.py - -Alternatively, you can run 'make regen-keyword'. -""" - -__all__ = ["iskeyword", "kwlist"] - -kwlist = [ - {keywords} -] - -iskeyword = frozenset(kwlist).__contains__ -'''.lstrip() - -EXTRA_KEYWORDS = ["async", "await"] - - -def main(): - parser = argparse.ArgumentParser( - description="Generate the Lib/keywords.py " "file from the grammar." - ) - parser.add_argument( - "grammar", type=str, help="The file with the grammar definition in EBNF format" - ) - parser.add_argument("tokens", type=str, help="The file with the token definitions") - parser.add_argument( - "keyword_file", - type=argparse.FileType("w"), - help="The path to write the keyword definitions", - ) - args = parser.parse_args() - p = ParserGenerator(args.grammar, args.tokens) - grammar = p.make_grammar() - - with args.keyword_file as thefile: - all_keywords = sorted(list(grammar.keywords) + EXTRA_KEYWORDS) - - keywords = ",\n ".join(map(repr, all_keywords)) - thefile.write(TEMPLATE.format(keywords=keywords)) - - -if __name__ == "__main__": - main() diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py deleted file mode 100644 index 074a083..0000000 --- a/Parser/pgen/metaparser.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Parser for the Python metagrammar""" - -import io -import tokenize # from stdlib - -from .automata import NFA, NFAState - - -class GrammarParser: - """Parser for Python grammar files.""" - - _translation_table = { - tokenize.NAME: "NAME", - tokenize.STRING: "STRING", - tokenize.NEWLINE: "NEWLINE", - tokenize.NL: "NL", - tokenize.OP: "OP", - tokenize.ENDMARKER: "ENDMARKER", - tokenize.COMMENT: "COMMENT", - } - - def __init__(self, grammar): - self.grammar = grammar - grammar_adaptor = io.StringIO(grammar) - self.generator = tokenize.generate_tokens(grammar_adaptor.readline) - self._gettoken() # Initialize lookahead - self._current_rule_name = None - - def parse(self): - """Turn the grammar into a collection of NFAs""" - # grammar: (NEWLINE | rule)* ENDMARKER - while self.type != tokenize.ENDMARKER: - while self.type == tokenize.NEWLINE: - self._gettoken() - # rule: NAME ':' rhs NEWLINE - self._current_rule_name = self._expect(tokenize.NAME) - self._expect(tokenize.OP, ":") - a, z = self._parse_rhs() - 
self._expect(tokenize.NEWLINE) - - yield NFA(a, z) - - def _parse_rhs(self): - # rhs: items ('|' items)* - a, z = self._parse_items() - if self.value != "|": - return a, z - else: - aa = NFAState(self._current_rule_name) - zz = NFAState(self._current_rule_name) - while True: - # Allow to transit directly to the previous state and connect the end of the - # previous state to the end of the current one, effectively allowing to skip - # the current state. - aa.add_arc(a) - z.add_arc(zz) - if self.value != "|": - break - - self._gettoken() - a, z = self._parse_items() - return aa, zz - - def _parse_items(self): - # items: item+ - a, b = self._parse_item() - while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["): - c, d = self._parse_item() - # Allow a transition between the end of the previous state - # and the beginning of the new one, connecting all the items - # together. In this way we can only reach the end if we visit - # all the items. - b.add_arc(c) - b = d - return a, b - - def _parse_item(self): - # item: '[' rhs ']' | atom ['+' | '*'] - if self.value == "[": - self._gettoken() - a, z = self._parse_rhs() - self._expect(tokenize.OP, "]") - # Make a transition from the beginning to the end so it is possible to - # advance for free to the next state of this item # without consuming - # anything from the rhs. - a.add_arc(z) - return a, z - else: - a, z = self._parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self._gettoken() - z.add_arc(a) - if value == "+": - # Create a cycle to the beginning so we go back to the old state in this - # item and repeat. - return a, z - else: - # The end state is the same as the beginning, so we can cycle arbitrarily - # and end in the beginning if necessary. - return a, a - - def _parse_atom(self): - # atom: '(' rhs ')' | NAME | STRING - if self.value == "(": - self._gettoken() - a, z = self._parse_rhs() - self._expect(tokenize.OP, ")") - return a, z - elif self.type in (tokenize.NAME, tokenize.STRING): - a = NFAState(self._current_rule_name) - z = NFAState(self._current_rule_name) - # We can transit to the next state only if we consume the value. - a.add_arc(z, self.value) - self._gettoken() - return a, z - else: - self._raise_error( - "expected (...) 
or NAME or STRING, got {} ({})", - self._translation_table.get(self.type, self.type), - self.value, - ) - - def _expect(self, type_, value=None): - if self.type != type_: - self._raise_error( - "expected {}, got {} ({})", - self._translation_table.get(type_, type_), - self._translation_table.get(self.type, self.type), - self.value, - ) - if value is not None and self.value != value: - self._raise_error("expected {}, got {}", value, self.value) - value = self.value - self._gettoken() - return value - - def _gettoken(self): - tup = next(self.generator) - while tup[0] in (tokenize.COMMENT, tokenize.NL): - tup = next(self.generator) - self.type, self.value, self.begin, self.end, self.line = tup - - def _raise_error(self, msg, *args): - if args: - try: - msg = msg.format(*args) - except Exception: - msg = " ".join([msg] + list(map(str, args))) - line = self.grammar.splitlines()[self.begin[0] - 1] - raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line)) diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py deleted file mode 100644 index 03032d4..0000000 --- a/Parser/pgen/pgen.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Python parser generator - - -This parser generator transforms a Python grammar file into parsing tables -that can be consumed by Python's LL(1) parser written in C. - -Concepts --------- - -* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a - top-down parser for a subset of context-free languages. It parses the input - from Left to right, performing Leftmost derivation of the sentence, and can - only use 1 token of lookahead when parsing a sentence. - -* A parsing table is a collection of data that a generic implementation of the - LL(1) parser consumes to know how to parse a given context-free grammar. In - this case the collection of data involves Deterministic Finite Automatons, - calculated first sets, keywords and transition labels. - -* A grammar is defined by production rules (or just 'productions') that specify - which symbols may replace which other symbols; these rules may be used to - generate strings, or to parse them. Each such rule has a head, or left-hand - side, which consists of the string that may be replaced, and a body, or - right-hand side, which consists of a string that may replace it. In the - Python grammar, rules are written in the form - - rule_name: rule_description; - - meaning the rule 'a: b' specifies that a can be replaced by b. A context-free - grammar is a grammar in which the left-hand side of each production rule - consists of only a single nonterminal symbol. Context-free grammars can - always be recognized by a Non-Deterministic Automatons. - -* Terminal symbols are literal symbols which may appear in the outputs of the - production rules of the grammar and which cannot be changed using the rules - of the grammar. Applying the rules recursively to a source string of symbols - will usually terminate in a final output string consisting only of terminal - symbols. - -* Nonterminal symbols are those symbols which can be replaced. The grammar - includes a start symbol a designated member of the set of nonterminals from - which all the strings in the language may be derived by successive - applications of the production rules. - -* The language defined by the grammar is defined as the set of terminal strings - that can be derived using the production rules. 
- -* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals - that can appear in the first position of any string derived from the rule. - This is useful for LL(1) parsers as the parser is only allowed to look at the - next token in the input to know which rule needs to parse. For example, given - this grammar: - - start: '(' A | B ')' - A: 'a' '<' - B: 'b' '<' - - and the input '(b<)' the parser can only look at 'b' to know if it needs - to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows - that needs to continue parsing rule B because only that rule can start - with 'b'. - -Description ------------ - -The input for the parser generator is a grammar in extended BNF form (using * -for repetition, + for at-least-once repetition, [] for optional parts, | for -alternatives and () for grouping). - -Each rule in the grammar file is considered as a regular expression in its -own right. It is turned into a Non-deterministic Finite Automaton (NFA), -which is then turned into a Deterministic Finite Automaton (DFA), which is -then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, -or similar compiler books (this technique is more often used for lexical -analyzers). - -The DFA's are used by the parser as parsing tables in a special way that's -probably unique. Before they are usable, the FIRST sets of all non-terminals -are computed so the LL(1) parser consuming the parsing tables can distinguish -between different transitions. -Reference ---------- - -[Aho&Ullman 77] - Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 - (first edition) -""" - -from ast import literal_eval -import collections - -from . import grammar, token -from .automata import DFA -from .metaparser import GrammarParser - -import enum - - -class LabelType(enum.Enum): - NONTERMINAL = 0 - NAMED_TOKEN = 1 - KEYWORD = 2 - OPERATOR = 3 - NONE = 4 - - -class Label(str): - def __init__(self, value): - self.type = self._get_type() - - def _get_type(self): - if self[0].isalpha(): - if self.upper() == self: - # NAMED tokens (ASYNC, NAME...) are all uppercase by convention - return LabelType.NAMED_TOKEN - else: - # If is not uppercase it must be a non terminal. 
- return LabelType.NONTERMINAL - else: - # Keywords and operators are wrapped in quotes - assert self[0] == self[-1] in ('"', "'"), self - value = literal_eval(self) - if value[0].isalpha(): - return LabelType.KEYWORD - else: - return LabelType.OPERATOR - - def __repr__(self): - return "{}({})".format(self.type, super().__repr__()) - - -class ParserGenerator(object): - def __init__(self, grammar_file, token_file, verbose=False, graph_file=None): - with open(grammar_file) as f: - self.grammar = f.read() - with open(token_file) as tok_file: - token_lines = tok_file.readlines() - self.tokens = dict(token.generate_tokens(token_lines)) - self.opmap = dict(token.generate_opmap(token_lines)) - # Manually add <> so it does not collide with != - self.opmap["<>"] = "NOTEQUAL" - self.verbose = verbose - self.filename = grammar_file - self.graph_file = graph_file - self.dfas, self.startsymbol = self.create_dfas() - self.first = {} # map from symbol name to set of tokens - self.calculate_first_sets() - - def create_dfas(self): - rule_to_dfas = collections.OrderedDict() - start_nonterminal = None - for nfa in GrammarParser(self.grammar).parse(): - if self.verbose: - print("Dump of NFA for", nfa.name) - nfa.dump() - if self.graph_file is not None: - nfa.dump_graph(self.graph_file.write) - dfa = DFA.from_nfa(nfa) - if self.verbose: - print("Dump of DFA for", dfa.name) - dfa.dump() - dfa.simplify() - if self.graph_file is not None: - dfa.dump_graph(self.graph_file.write) - rule_to_dfas[dfa.name] = dfa - - if start_nonterminal is None: - start_nonterminal = dfa.name - - return rule_to_dfas, start_nonterminal - - def make_grammar(self): - c = grammar.Grammar() - c.all_labels = set() - names = list(self.dfas.keys()) - names.remove(self.startsymbol) - names.insert(0, self.startsymbol) - for name in names: - i = 256 + len(c.symbol2number) - c.symbol2number[Label(name)] = i - c.number2symbol[i] = Label(name) - c.all_labels.add(name) - for name in names: - self.make_label(c, name) - dfa = self.dfas[name] - states = [] - for state in dfa: - arcs = [] - for label, next in sorted(state.arcs.items()): - c.all_labels.add(label) - arcs.append((self.make_label(c, label), dfa.states.index(next))) - if state.is_final: - arcs.append((0, dfa.states.index(state))) - states.append(arcs) - c.states.append(states) - c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name)) - c.start = c.symbol2number[self.startsymbol] - - if self.verbose: - print("") - print("Grammar summary") - print("===============") - - print("- {n_labels} labels".format(n_labels=len(c.labels))) - print("- {n_dfas} dfas".format(n_dfas=len(c.dfas))) - print("- {n_tokens} tokens".format(n_tokens=len(c.tokens))) - print("- {n_keywords} keywords".format(n_keywords=len(c.keywords))) - print( - "- Start symbol: {start_symbol}".format( - start_symbol=c.number2symbol[c.start] - ) - ) - return c - - def make_first_sets(self, c, name): - rawfirst = self.first[name] - first = set() - for label in sorted(rawfirst): - ilabel = self.make_label(c, label) - ##assert ilabel not in first # XXX failed on <> ... 
!= - first.add(ilabel) - return first - - def make_label(self, c, label): - label = Label(label) - ilabel = len(c.labels) - - if label.type == LabelType.NONTERMINAL: - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - return ilabel - elif label.type == LabelType.NAMED_TOKEN: - # A named token (NAME, NUMBER, STRING) - itoken = self.tokens.get(label, None) - assert isinstance(itoken, int), label - assert itoken in self.tokens.values(), label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - elif label.type == LabelType.KEYWORD: - # A keyword - value = literal_eval(label) - if value in c.keywords: - return c.keywords[value] - else: - c.labels.append((self.tokens["NAME"], value)) - c.keywords[value] = ilabel - return ilabel - elif label.type == LabelType.OPERATOR: - # An operator (any non-numeric token) - value = literal_eval(label) - tok_name = self.opmap[value] # Fails if unknown token - itoken = self.tokens[tok_name] - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - raise ValueError("Cannot categorize label {}".format(label)) - - def calculate_first_sets(self): - names = list(self.dfas.keys()) - for name in names: - if name not in self.first: - self.calculate_first_sets_for_rule(name) - - if self.verbose: - print("First set for {dfa_name}".format(dfa_name=name)) - for item in self.first[name]: - print(" - {terminal}".format(terminal=item)) - - def calculate_first_sets_for_rule(self, name): - dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa.states[0] - totalset = set() - overlapcheck = {} - for label, next in state.arcs.items(): - if label in self.dfas: - if label in self.first: - fset = self.first[label] - if fset is None: - raise ValueError("recursion for rule %r" % name) - else: - self.calculate_first_sets_for_rule(label) - fset = self.first[label] - totalset.update(fset) - overlapcheck[label] = fset - else: - totalset.add(label) - overlapcheck[label] = {label} - inverse = {} - for label, itsfirst in overlapcheck.items(): - for symbol in itsfirst: - if symbol in inverse: - raise ValueError( - "rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" - % (name, symbol, label, inverse[symbol]) - ) - inverse[symbol] = label - self.first[name] = totalset diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py deleted file mode 100644 index 2cff62c..0000000 --- a/Parser/pgen/token.py +++ /dev/null @@ -1,38 +0,0 @@ -import itertools - - -def generate_tokens(tokens): - numbers = itertools.count(0) - for line in tokens: - line = line.strip() - - if not line or line.startswith("#"): - continue - - name = line.split()[0] - yield (name, next(numbers)) - - yield ("N_TOKENS", next(numbers)) - yield ("NT_OFFSET", 256) - - -def generate_opmap(tokens): - for line in tokens: - line = line.strip() - - if not line or line.startswith("#"): - continue - - pieces = line.split() - - if len(pieces) != 2: - continue - - name, op = pieces - yield (op.strip("'"), name) - - # Yield independently <>. This is needed so it does not collide - # with the token generation in "generate_tokens" because if this - # symbol is included in Grammar/Tokens, it will collide with != - # as it has the same name (NOTEQUAL). - yield ("<>", "NOTEQUAL") |
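The docstrings in the removed Parser/pgen/automata.py describe the pipeline each grammar rule goes through: the metaparser builds an NFA, DFA.from_nfa applies the Rabin–Scott subset construction, and simplify() merges states with identical arcs. A minimal end-to-end sketch on a toy rule, assuming a checkout from before this commit run from the source root; the rule text is an invented example, not taken from Grammar/Grammar:

    # Sketch: rule text -> NFA -> simplified DFA, using the modules removed above.
    # Assumes a pre-removal checkout, run from the CPython source root.
    from Parser.pgen.metaparser import GrammarParser
    from Parser.pgen.automata import DFA

    GRAMMAR = "example: 'a' 'b'* NEWLINE\n"  # toy rule, for illustration only

    for nfa in GrammarParser(GRAMMAR).parse():
        nfa.dump()                # raw NFA: states plus labeled and epsilon arcs
        dfa = DFA.from_nfa(nfa)   # subset construction collapses epsilon-reachable state sets
        dfa.simplify()            # merge states whose outgoing arcs are identical
        dfa.dump()                # the reduced DFA that make_grammar() would turn into tables

ParserGenerator.create_dfas() performed exactly these steps for every rule in Grammar/Grammar before make_grammar() emitted the C tables.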