From 71876fa438f706b211360d8c205cb985906212ee Mon Sep 17 00:00:00 2001
From: Pablo Galindo <Pablogsal@gmail.com>
Date: Thu, 22 Aug 2019 02:38:39 +0100
Subject: Refactor Parser/pgen and add documentation and explanations
 (GH-15373)

* Refactor Parser/pgen and add documentation and explanations

To improve the readability and maintainability of the parser
generator perform the following transformations:

    * Separate the metagrammar parser in its own class to simplify
      the parser generator logic.

    * Create separate classes for DFAs and NFAs and move methods that
      act exclusively on them from the parser generator to these
      classes.

    * Add docstrings and comment documenting the process to go from
      the grammar file into NFAs and then DFAs. Detail some of the
      algorithms and give some background explanations of some concepts
      that will helps readers not familiar with the parser generation
      process.

    * Select more descriptive names for some variables and variables.

    * PEP8 formatting and quote-style homogenization.

The output of the parser generator remains the same (Include/graminit.h
and Python/graminit.c remain untouched by running the new parser generator).
---
 Parser/pgen/__main__.py   |   8 +-
 Parser/pgen/automata.py   | 371 ++++++++++++++++++++++++++++++++
 Parser/pgen/grammar.py    |   6 +-
 Parser/pgen/keywordgen.py |  11 +-
 Parser/pgen/metaparser.py | 152 +++++++++++++
 Parser/pgen/pgen.py       | 527 +++++++++++++++++++---------------------------
 Parser/pgen/token.py      |  10 +-
 7 files changed, 753 insertions(+), 332 deletions(-)
 create mode 100644 Parser/pgen/automata.py
 create mode 100644 Parser/pgen/metaparser.py

diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
index eea5261..bb96e75 100644
--- a/Parser/pgen/__main__.py
+++ b/Parser/pgen/__main__.py
@@ -8,17 +8,15 @@ def main():
     parser.add_argument(
         "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
-    parser.add_argument(
-        "tokens", type=str, help="The file with the token definitions"
-    )
+    parser.add_argument("tokens", type=str, help="The file with the token definitions")
     parser.add_argument(
         "graminit_h",
-        type=argparse.FileType('w'),
+        type=argparse.FileType("w"),
         help="The path to write the grammar's non-terminals as #defines",
     )
     parser.add_argument(
         "graminit_c",
-        type=argparse.FileType('w'),
+        type=argparse.FileType("w"),
         help="The path to write the grammar as initialized data",
     )
 
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
new file mode 100644
index 0000000..3147d86
--- /dev/null
+++ b/Parser/pgen/automata.py
@@ -0,0 +1,371 @@
+"""Classes representing state-machine concepts"""
+
+class NFA:
+    """A non deterministic finite automata
+
+    A non deterministic automata is a form of a finite state
+    machine. An NFA's rules are less restrictive than a DFA.
+    The NFA rules are:
+
+      * A transition can be non-deterministic and can result in
+        nothing, one, or two or more states.
+
+      * An epsilon transition consuming empty input is valid.
+        Transitions consuming labeled symbols are also permitted.
+
+    This class assumes that there is only one starting state and one
+    accepting (ending) state.
+
+    Attributes:
+        name (str): The name of the rule the NFA is representing.
+        start (NFAState): The starting state.
+        end (NFAState): The ending state
+    """
+
+    def __init__(self, start, end):
+        self.name = start.rule_name
+        self.start = start
+        self.end = end
+
+    def __repr__(self):
+        return "NFA(start={}, end={})".format(self.start, self.end)
+
+    def dump(self, writer=print):
+        """Dump a graphical representation of the NFA"""
+        todo = [self.start]
+        for i, state in enumerate(todo):
+            writer("  State", i, state is self.end and "(final)" or "")
+            for arc in state.arcs:
+                label = arc.label
+                next = arc.target
+                if next in todo:
+                    j = todo.index(next)
+                else:
+                    j = len(todo)
+                    todo.append(next)
+                if label is None:
+                    writer("    -> %d" % j)
+                else:
+                    writer("    %s -> %d" % (label, j))
+
+
+class NFAArc:
+    """An arc representing a transition between two NFA states.
+
+    NFA states can be connected via two ways:
+
+        * A label transition: An input equal to the label must
+          be consumed to perform the transition.
+        * An epsilon transition: The transition can be taken without
+          consuming any input symbol.
+
+        Attributes:
+            target (NFAState): The ending state of the transition arc.
+            label (Optional[str]): The label that must be consumed to make
+                the transition. An epsilon transition is represented
+                using `None`.
+    """
+
+    def __init__(self, target, label):
+        self.target = target
+        self.label = label
+
+    def __repr__(self):
+        return "<%s: %s>" % (self.__class__.__name__, self.label)
+
+
+class NFAState:
+    """A state of a NFA, non deterministic finite automata.
+
+    Attributes:
+        target (rule_name): The name of the rule used to represent the NFA's
+            ending state after a transition.
+        arcs (Dict[Optional[str], NFAState]): A mapping representing transitions
+            between the current NFA state and another NFA state via following
+            a label.
+    """
+
+    def __init__(self, rule_name):
+        self.rule_name = rule_name
+        self.arcs = []
+
+    def add_arc(self, target, label=None):
+        """Add a new arc to connect the state to a target state within the NFA
+
+        The method adds a new arc to the list of arcs available as transitions
+        from the present state. An optional label indicates a named transition
+        that consumes an input while the absence of a label represents an epsilon
+        transition.
+
+        Attributes:
+            target (NFAState): The end of the transition that the arc represents.
+            label (Optional[str]): The label that must be consumed for making
+                the transition. If the label is not provided the transition is assumed
+                to be an epsilon-transition.
+        """
+        assert label is None or isinstance(label, str)
+        assert isinstance(target, NFAState)
+        self.arcs.append(NFAArc(target, label))
+
+    def __repr__(self):
+        return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
+
+
+class DFA:
+    """A deterministic finite automata
+
+    A deterministic finite automata is a form of a finite state machine
+    that obeys the following rules:
+
+       * Each of the transitions is uniquely determined by
+         the source state and input symbol
+       * Reading an input symbol is required for each state
+         transition (no epsilon transitions).
+
+    The finite-state machine will accept or reject a string of symbols
+    and only produces a unique computation of the automaton for each input
+    string. The DFA must have a unique starting state (represented as the first
+    element in the list of states) but can have multiple final states.
+
+    Attributes:
+        name (str): The name of the rule the DFA is representing.
+        states (List[DFAState]): A collection of DFA states.
+    """
+
+    def __init__(self, name, states):
+        self.name = name
+        self.states = states
+
+    @classmethod
+    def from_nfa(cls, nfa):
+        """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm.
+
+        To simulate the operation of a DFA on a given input string, it's
+        necessary to keep track of a single state at any time, or more precisely,
+        the state that the automaton will reach after seeing a prefix of the
+        input. In contrast, to simulate an NFA, it's necessary to keep track of
+        a set of states: all of the states that the automaton could reach after
+        seeing the same prefix of the input, according to the nondeterministic
+        choices made by the automaton. There are two possible sources of
+        non-determinism:
+
+        1) Multiple (one or more) transitions with the same label
+
+                         'A'     +-------+
+                    +----------->+ State +----------->+
+                    |            |   2   |
+            +-------+            +-------+
+            | State |
+            |   1   |            +-------+
+            +-------+            | State |
+                    +----------->+   3   +----------->+
+                         'A'     +-------+
+
+        2) Epsilon transitions (transitions that can be taken without consuming any input)
+
+            +-------+            +-------+
+            | State |     ε      | State |
+            |   1   +----------->+   2   +----------->+
+            +-------+            +-------+
+
+        Looking at the first case above, we can't determine which transition should be
+        followed when given an input A. We could choose whether or not to follow the
+        transition while in the second case the problem is that we can choose both to
+        follow the transition or not doing it. To solve this problem we can imagine that
+        we follow all possibilities at the same time and we construct new states from the
+        set of all possible reachable states. For every case in the previous example:
+
+
+        1) For multiple transitions with the same label we colapse all of the
+           final states under the same one
+
+            +-------+            +-------+
+            | State |     'A'    | State |
+            |   1   +----------->+  2-3  +----------->+
+            +-------+            +-------+
+
+        2) For epsilon transitions we collapse all epsilon-reachable states
+           into the same one
+
+            +-------+
+            | State |
+            |  1-2  +----------->
+            +-------+
+
+        Because the DFA states consist of sets of NFA states, an n-state NFA
+        may be converted to a DFA with at most 2**n states. Notice that the
+        constructed DFA is not minimal and can be simplified or reduced
+        afterwards.
+
+        Parameters:
+            name (NFA): The NFA to transform to DFA.
+        """
+        assert isinstance(nfa, NFA)
+
+        def add_closure(nfa_state, base_nfa_set):
+            """Calculate the epsilon-closure of a given state
+
+            Add to the *base_nfa_set* all the states that are
+            reachable from *nfa_state* via epsilon-transitions.
+            """
+            assert isinstance(nfa_state, NFAState)
+            if nfa_state in base_nfa_set:
+                return
+            base_nfa_set.add(nfa_state)
+            for nfa_arc in nfa_state.arcs:
+                if nfa_arc.label is None:
+                    add_closure(nfa_arc.target, base_nfa_set)
+
+        # Calculte the epsilon-closure of the starting state
+        base_nfa_set = set()
+        add_closure(nfa.start, base_nfa_set)
+
+        # Start by visiting the NFA starting state (there is only one).
+        states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
+
+        for state in states:  # NB states grow while we're iterating
+
+            # Find transitions from the current state to other reachable states
+            # and store them in mapping that correlates the label to all the
+            # possible reachable states that can be obtained by consuming a
+            # token equal to the label. Each set of all the states that can
+            # be reached after following a label will be the a DFA state.
+            arcs = {}
+            for nfa_state in state.nfa_set:
+                for nfa_arc in nfa_state.arcs:
+                    if nfa_arc.label is not None:
+                        nfa_set = arcs.setdefault(nfa_arc.label, set())
+                        # All states that can be reached by epsilon-transitions
+                        # are also included in the set of reachable states.
+                        add_closure(nfa_arc.target, nfa_set)
+
+            # Now create new DFAs by visiting all posible transitions between
+            # the current DFA state and the new power-set states (each nfa_set)
+            # via the different labels. As the nodes are appended to *states* this
+            # is performing a deep-first search traversal over the power-set of
+            # the states of the original NFA.
+            for label, nfa_set in sorted(arcs.items()):
+                for exisisting_state in states:
+                    if exisisting_state.nfa_set == nfa_set:
+                        # The DFA state already exists for this rule.
+                        next_state = exisisting_state
+                        break
+                else:
+                    next_state = DFAState(nfa.name, nfa_set, nfa.end)
+                    states.append(next_state)
+
+                # Add a transition between the current DFA state and the new
+                # DFA state (the power-set state) via the current label.
+                state.add_arc(next_state, label)
+
+        return cls(nfa.name, states)
+
+    def __iter__(self):
+        return iter(self.states)
+
+    def simplify(self):
+        """Attempt to reduce the number of states of the DFA
+
+        Transform the DFA into an equivalent DFA that has fewer states. Two
+        classes of states can be removed or merged from the original DFA without
+        affecting the language it accepts to minimize it:
+
+            * Unreachable states can not be reached from the initial
+              state of the DFA, for any input string.
+            * Nondistinguishable states are those that cannot be distinguished
+              from one another for any input string.
+
+        This algorithm does not achieve the optimal fully-reduced solution, but it
+        works well enough for the particularities of the Python grammar. The
+        algorithm repeatedly looks for two states that have the same set of
+        arcs (same labels pointing to the same nodes) and unifies them, until
+        things stop changing.
+        """
+        changes = True
+        while changes:
+            changes = False
+            for i, state_i in enumerate(self.states):
+                for j in range(i + 1, len(self.states)):
+                    state_j = self.states[j]
+                    if state_i == state_j:
+                        del self.states[j]
+                        for state in self.states:
+                            state.unifystate(state_j, state_i)
+                        changes = True
+                        break
+
+    def dump(self, writer=print):
+        """Dump a graphical representation of the DFA"""
+        for i, state in enumerate(self.states):
+            writer("  State", i, state.is_final and "(final)" or "")
+            for label, next in sorted(state.arcs.items()):
+                writer("    %s -> %d" % (label, self.states.index(next)))
+
+
+class DFAState(object):
+    """A state of a DFA
+
+    Attributes:
+        rule_name (rule_name): The name of the DFA rule containing the represented state.
+        nfa_set (Set[NFAState]): The set of NFA states used to create this state.
+        final (bool): True if the state represents an accepting state of the DFA
+            containing this state.
+        arcs (Dict[label, DFAState]): A mapping representing transitions between
+            the current DFA state and another DFA state via following a label.
+    """
+
+    def __init__(self, rule_name, nfa_set, final):
+        assert isinstance(nfa_set, set)
+        assert isinstance(next(iter(nfa_set)), NFAState)
+        assert isinstance(final, NFAState)
+        self.rule_name = rule_name
+        self.nfa_set = nfa_set
+        self.arcs = {}  # map from terminals/nonterminals to DFAState
+        self.is_final = final in nfa_set
+
+    def add_arc(self, target, label):
+        """Add a new arc to the current state.
+
+        Parameters:
+            target (DFAState): The DFA state at the end of the arc.
+            label (str): The label respresenting the token that must be consumed
+                to perform this transition.
+        """
+        assert isinstance(label, str)
+        assert label not in self.arcs
+        assert isinstance(target, DFAState)
+        self.arcs[label] = target
+
+    def unifystate(self, old, new):
+        """Replace all arcs from the current node to *old* with *new*.
+
+        Parameters:
+            old (DFAState): The  DFA state to remove from all existing arcs.
+            new (DFAState): The DFA state to replace in all existing arcs.
+        """
+        for label, next_ in self.arcs.items():
+            if next_ is old:
+                self.arcs[label] = new
+
+    def __eq__(self, other):
+        # The nfa_set does not matter for  equality
+        assert isinstance(other, DFAState)
+        if self.is_final != other.is_final:
+            return False
+        # We cannot just return self.arcs == other.arcs because that
+        # would invoke this method recursively if there are any cycles.
+        if len(self.arcs) != len(other.arcs):
+            return False
+        for label, next_ in self.arcs.items():
+            if next_ is not other.arcs.get(label):
+                return False
+        return True
+
+    __hash__ = None  # For Py3 compatibility.
+
+    def __repr__(self):
+        return "<%s: %s is_final=%s>" % (
+            self.__class__.__name__,
+            self.rule_name,
+            self.is_final,
+        )
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
index 5cd6524..56188db 100644
--- a/Parser/pgen/grammar.py
+++ b/Parser/pgen/grammar.py
@@ -76,12 +76,14 @@ class Grammar:
 
     def print_labels(self, writer):
         writer(
-            "static const label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
+            "static const label labels[{n_labels}] = {{\n".format(
+                n_labels=len(self.labels)
+            )
         )
         for label, name in self.labels:
             label_name = '"{}"'.format(name) if name is not None else 0
             writer(
-                '    {{{label}, {label_name}}},\n'.format(
+                "    {{{label}, {label_name}}},\n".format(
                     label=label, label_name=label_name
                 )
             )
diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py
index eeb3ef7..f0234a8 100644
--- a/Parser/pgen/keywordgen.py
+++ b/Parser/pgen/keywordgen.py
@@ -32,17 +32,16 @@ EXTRA_KEYWORDS = ["async", "await"]
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Generate the Lib/keywords.py "
-                                                 "file from the grammar.")
-    parser.add_argument(
-        "grammar", type=str, help="The file with the grammar definition in EBNF format"
+    parser = argparse.ArgumentParser(
+        description="Generate the Lib/keywords.py " "file from the grammar."
     )
     parser.add_argument(
-        "tokens", type=str, help="The file with the token definitions"
+        "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
+    parser.add_argument("tokens", type=str, help="The file with the token definitions")
     parser.add_argument(
         "keyword_file",
-        type=argparse.FileType('w'),
+        type=argparse.FileType("w"),
         help="The path to write the keyword definitions",
     )
     args = parser.parse_args()
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
new file mode 100644
index 0000000..074a083
--- /dev/null
+++ b/Parser/pgen/metaparser.py
@@ -0,0 +1,152 @@
+"""Parser for the Python metagrammar"""
+
+import io
+import tokenize  # from stdlib
+
+from .automata import NFA, NFAState
+
+
+class GrammarParser:
+    """Parser for Python grammar files."""
+
+    _translation_table = {
+        tokenize.NAME: "NAME",
+        tokenize.STRING: "STRING",
+        tokenize.NEWLINE: "NEWLINE",
+        tokenize.NL: "NL",
+        tokenize.OP: "OP",
+        tokenize.ENDMARKER: "ENDMARKER",
+        tokenize.COMMENT: "COMMENT",
+    }
+
+    def __init__(self, grammar):
+        self.grammar = grammar
+        grammar_adaptor = io.StringIO(grammar)
+        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
+        self._gettoken()  # Initialize lookahead
+        self._current_rule_name = None
+
+    def parse(self):
+        """Turn the grammar into a collection of NFAs"""
+        # grammar: (NEWLINE | rule)* ENDMARKER
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
+                self._gettoken()
+            # rule: NAME ':' rhs NEWLINE
+            self._current_rule_name = self._expect(tokenize.NAME)
+            self._expect(tokenize.OP, ":")
+            a, z = self._parse_rhs()
+            self._expect(tokenize.NEWLINE)
+
+            yield NFA(a, z)
+
+    def _parse_rhs(self):
+        # rhs: items ('|' items)*
+        a, z = self._parse_items()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState(self._current_rule_name)
+            zz = NFAState(self._current_rule_name)
+            while True:
+                # Allow to transit directly to the previous state and connect the end of the
+                # previous state to the end of the current one, effectively allowing to skip
+                # the current state.
+                aa.add_arc(a)
+                z.add_arc(zz)
+                if self.value != "|":
+                    break
+
+                self._gettoken()
+                a, z = self._parse_items()
+            return aa, zz
+
+    def _parse_items(self):
+        # items: item+
+        a, b = self._parse_item()
+        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
+            c, d = self._parse_item()
+            # Allow a transition between the end of the previous state
+            # and the beginning of the new one, connecting all the items
+            # together. In this way we can only reach the end if we visit
+            # all the items.
+            b.add_arc(c)
+            b = d
+        return a, b
+
+    def _parse_item(self):
+        # item: '[' rhs ']' | atom ['+' | '*']
+        if self.value == "[":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, "]")
+            # Make a transition from the beginning to the end so it is possible to
+            # advance for free to the next state of this item # without consuming
+            # anything from the rhs.
+            a.add_arc(z)
+            return a, z
+        else:
+            a, z = self._parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self._gettoken()
+            z.add_arc(a)
+            if value == "+":
+                # Create a cycle to the beginning so we go back to the old state in this
+                # item and repeat.
+                return a, z
+            else:
+                # The end state is the same as the beginning, so we can cycle arbitrarily
+                # and end in the beginning if necessary.
+                return a, a
+
+    def _parse_atom(self):
+        # atom: '(' rhs ')' | NAME | STRING
+        if self.value == "(":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, ")")
+            return a, z
+        elif self.type in (tokenize.NAME, tokenize.STRING):
+            a = NFAState(self._current_rule_name)
+            z = NFAState(self._current_rule_name)
+            # We can transit to the next state only if we consume the value.
+            a.add_arc(z, self.value)
+            self._gettoken()
+            return a, z
+        else:
+            self._raise_error(
+                "expected (...) or NAME or STRING, got {} ({})",
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+
+    def _expect(self, type_, value=None):
+        if self.type != type_:
+            self._raise_error(
+                "expected {}, got {} ({})",
+                self._translation_table.get(type_, type_),
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+        if value is not None and self.value != value:
+            self._raise_error("expected {}, got {}", value, self.value)
+        value = self.value
+        self._gettoken()
+        return value
+
+    def _gettoken(self):
+        tup = next(self.generator)
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = next(self.generator)
+        self.type, self.value, self.begin, self.end, self.line = tup
+
+    def _raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg.format(*args)
+            except Exception:
+                msg = " ".join([msg] + list(map(str, args)))
+        line = self.grammar.splitlines()[self.begin[0] - 1]
+        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
index d52d58f..d7dcb76 100644
--- a/Parser/pgen/pgen.py
+++ b/Parser/pgen/pgen.py
@@ -1,42 +1,180 @@
+"""Python parser generator
+
+
+This parser generator transforms a Python grammar file into parsing tables
+that can be consumed by Python's LL(1) parser written in C.
+
+Concepts
+--------
+
+* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
+  top-down parser for a subset of context-free languages. It parses the input
+  from Left to right, performing Leftmost derivation of the sentence, and can
+  only use 1 tokens of lookahead when parsing a sentence.
+
+* A parsing table is a collection of data that a generic implementation of the
+  LL(1) parser consumes to know how to parse a given context-free grammar. In
+  this case the collection of thata involves Deterministic Finite Automatons,
+  calculated first sets, keywords and transition labels.
+
+* A grammar is defined by production rules (or just 'productions') that specify
+  which symbols may replace which other symbols; these rules may be used to
+  generate strings, or to parse them. Each such rule has a head, or left-hand
+  side, which consists of the string that may be replaced, and a body, or
+  right-hand side, which consists of a string that may replace it. In the
+  Python grammar, rules are written in the form
+
+  rule_name: rule_description;
+
+  meaning the rule 'a: b' specifies that a can be replaced by b. A Context-free
+  grammars is a grammars in which the left-hand side of each production rule
+  consists of only a single nonterminal symbol. Context free grammars can
+  always be recognized by a Non-Deterministic Automatons.
+
+* Terminal symbols are literal symbols which may appear in the outputs of the
+  production rules of the grammar and which cannot be changed using the rules
+  of the grammar. Applying the rules recursively to a source string of symbols
+  will usually terminate in a final output string consisting only of terminal
+  symbols.
+
+* Nonterminal symbols are those symbols which can be replaced. The grammar
+  includes a start symbol a designated member of the set of nonterminals from
+  which all the strings in the language may be derived by successive
+  applications of the production rules.
+
+* The language defined by the grammar is defined as the set of terminal strings
+  that can be derived using the production rules.
+
+* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
+  that can appear in the first position of any string derived from the rule.
+  This is useful for LL(1) parsers as the parser is only allow to look at the
+  next token in the input to know which rule needs to parse. For example given
+  this grammar:
+
+  start: '(' A | B ')'
+  A: 'a' '<'
+  B: 'b' '<'
+
+  and the input '(b<)' the parser can only look at 'b' to know if it needs
+  to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
+  that needs to continue parsing rule B because only that rule can start
+  with 'b'.
+
+Description
+-----------
+
+The input for the parser generator is a grammar in extended BNF form (using *
+for repetition, + for at-least-once repetition, [] for optional parts, | for
+alternatives and () for grouping).
+
+Each rule in the grammar file is considered as a regular expression in its
+own right. It is turned into a Non-deterministic Finite Automaton (NFA),
+which is then turned into a Deterministic Finite Automaton (DFA), which is
+then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
+or similar compiler books (this technique is more often used for lexical
+analyzers).
+
+The DFA's are used by the parser as parsing tables in a special way that's
+probably unique. Before they are usable, the FIRST sets of all non-terminals
+are computed so the LL(1) parser consuming the parsing tables can distinguish
+between different transitions.
+Reference
+---------
+
+[Aho&Ullman 77]
+    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
+    (first edition)
+"""
+
+from ast import literal_eval
 import collections
-import tokenize  # from stdlib
 
 from . import grammar, token
+from .automata import DFA
+from .metaparser import GrammarParser
 
+import enum
 
-class ParserGenerator(object):
 
-    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
-        close_stream = None
-        if stream is None:
-            stream = open(grammar_file)
-            close_stream = stream.close
+class LabelType(enum.Enum):
+    NONTERMINAL = 0
+    NAMED_TOKEN = 1
+    KEYWORD = 2
+    OPERATOR = 3
+    NONE = 4
+
+
+class Label(str):
+    def __init__(self, value):
+        self.type = self._get_type()
+
+    def _get_type(self):
+        if self[0].isalpha():
+            if self.upper() == self:
+                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
+                return LabelType.NAMED_TOKEN
+            else:
+                # If is not uppercase it must be a non terminal.
+                return LabelType.NONTERMINAL
+        else:
+            # Keywords and operators are wrapped in quotes
+            assert self[0] == self[-1] in ('"', "'"), self
+            value = literal_eval(self)
+            if value[0].isalpha():
+                return LabelType.KEYWORD
+            else:
+                return LabelType.OPERATOR
+
+    def __repr__(self):
+        return "{}({})".format(self.type, super().__repr__())
+
+
+class ParserGenerator(object):
+    def __init__(self, grammar_file, token_file, verbose=False):
+        with open(grammar_file) as f:
+            self.grammar = f.read()
         with open(token_file) as tok_file:
             token_lines = tok_file.readlines()
         self.tokens = dict(token.generate_tokens(token_lines))
         self.opmap = dict(token.generate_opmap(token_lines))
         # Manually add <> so it does not collide with !=
-        self.opmap['<>'] = "NOTEQUAL"
+        self.opmap["<>"] = "NOTEQUAL"
         self.verbose = verbose
         self.filename = grammar_file
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
-        self.gettoken() # Initialize lookahead
-        self.dfas, self.startsymbol = self.parse()
-        if close_stream is not None:
-            close_stream()
-        self.first = {} # map from symbol name to set of tokens
-        self.addfirstsets()
+        self.dfas, self.startsymbol = self.create_dfas()
+        self.first = {}  # map from symbol name to set of tokens
+        self.calculate_first_sets()
+
+    def create_dfas(self):
+        rule_to_dfas = collections.OrderedDict()
+        start_nonterminal = None
+        for nfa in GrammarParser(self.grammar).parse():
+            if self.verbose:
+                print("Dump of NFA for", nfa.name)
+                nfa.dump()
+            dfa = DFA.from_nfa(nfa)
+            if self.verbose:
+                print("Dump of DFA for", dfa.name)
+                dfa.dump()
+            dfa.simplify()
+            rule_to_dfas[dfa.name] = dfa
+
+            if start_nonterminal is None:
+                start_nonterminal = dfa.name
+
+        return rule_to_dfas, start_nonterminal
 
     def make_grammar(self):
         c = grammar.Grammar()
+        c.all_labels = set()
         names = list(self.dfas.keys())
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
             i = 256 + len(c.symbol2number)
-            c.symbol2number[name] = i
-            c.number2symbol[i] = name
+            c.symbol2number[Label(name)] = i
+            c.number2symbol[i] = Label(name)
+            c.all_labels.add(name)
         for name in names:
             self.make_label(c, name)
             dfa = self.dfas[name]
@@ -44,12 +182,13 @@ class ParserGenerator(object):
             for state in dfa:
                 arcs = []
                 for label, next in sorted(state.arcs.items()):
-                    arcs.append((self.make_label(c, label), dfa.index(next)))
-                if state.isfinal:
-                    arcs.append((0, dfa.index(state)))
+                    c.all_labels.add(label)
+                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
+                if state.is_final:
+                    arcs.append((0, dfa.states.index(state)))
                 states.append(arcs)
             c.states.append(states)
-            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
         c.start = c.symbol2number[self.startsymbol]
 
         if self.verbose:
@@ -68,7 +207,7 @@ class ParserGenerator(object):
             )
         return c
 
-    def make_first(self, c, name):
+    def make_first_sets(self, c, name):
         rawfirst = self.first[name]
         first = set()
         for label in sorted(rawfirst):
@@ -78,67 +217,65 @@ class ParserGenerator(object):
         return first
 
     def make_label(self, c, label):
-        # XXX Maybe this should be a method on a subclass of converter?
+        label = Label(label)
         ilabel = len(c.labels)
-        if label[0].isalpha():
-            # Either a symbol name or a named token
-            if label in c.symbol2number:
-                # A symbol name (a non-terminal)
-                if label in c.symbol2label:
-                    return c.symbol2label[label]
-                else:
-                    c.labels.append((c.symbol2number[label], None))
-                    c.symbol2label[label] = ilabel
-                    return ilabel
+
+        if label.type == LabelType.NONTERMINAL:
+            if label in c.symbol2label:
+                return c.symbol2label[label]
             else:
-                # A named token (NAME, NUMBER, STRING)
-                itoken = self.tokens.get(label, None)
-                assert isinstance(itoken, int), label
-                assert itoken in self.tokens.values(), label
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
-                else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
-                    return ilabel
-        else:
-            # Either a keyword or an operator
-            assert label[0] in ('"', "'"), label
-            value = eval(label)
-            if value[0].isalpha():
-                # A keyword
-                if value in c.keywords:
-                    return c.keywords[value]
-                else:
-                    c.labels.append((self.tokens["NAME"], value))
-                    c.keywords[value] = ilabel
-                    return ilabel
+                c.labels.append((c.symbol2number[label], None))
+                c.symbol2label[label] = ilabel
+                return ilabel
+        elif label.type == LabelType.NAMED_TOKEN:
+            # A named token (NAME, NUMBER, STRING)
+            itoken = self.tokens.get(label, None)
+            assert isinstance(itoken, int), label
+            assert itoken in self.tokens.values(), label
+            if itoken in c.tokens:
+                return c.tokens[itoken]
             else:
-                # An operator (any non-numeric token)
-                tok_name = self.opmap[value] # Fails if unknown token
-                itoken = self.tokens[tok_name]
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
-                else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
-                    return ilabel
+                c.labels.append((itoken, None))
+                c.tokens[itoken] = ilabel
+                return ilabel
+        elif label.type == LabelType.KEYWORD:
+            # A keyword
+            value = literal_eval(label)
+            if value in c.keywords:
+                return c.keywords[value]
+            else:
+                c.labels.append((self.tokens["NAME"], value))
+                c.keywords[value] = ilabel
+                return ilabel
+        elif label.type == LabelType.OPERATOR:
+            # An operator (any non-numeric token)
+            value = literal_eval(label)
+            tok_name = self.opmap[value]  # Fails if unknown token
+            itoken = self.tokens[tok_name]
+            if itoken in c.tokens:
+                return c.tokens[itoken]
+            else:
+                c.labels.append((itoken, None))
+                c.tokens[itoken] = ilabel
+                return ilabel
+        else:
+            raise ValueError("Cannot categorize label {}".format(label))
 
-    def addfirstsets(self):
+    def calculate_first_sets(self):
         names = list(self.dfas.keys())
         for name in names:
             if name not in self.first:
-                self.calcfirst(name)
+                self.calculate_first_sets_for_rule(name)
 
             if self.verbose:
                 print("First set for {dfa_name}".format(dfa_name=name))
                 for item in self.first[name]:
                     print("    - {terminal}".format(terminal=item))
 
-    def calcfirst(self, name):
+    def calculate_first_sets_for_rule(self, name):
         dfa = self.dfas[name]
-        self.first[name] = None # dummy to detect left recursion
-        state = dfa[0]
+        self.first[name] = None  # dummy to detect left recursion
+        state = dfa.states[0]
         totalset = set()
         overlapcheck = {}
         for label, next in state.arcs.items():
@@ -148,7 +285,7 @@ class ParserGenerator(object):
                     if fset is None:
                         raise ValueError("recursion for rule %r" % name)
                 else:
-                    self.calcfirst(label)
+                    self.calculate_first_sets_for_rule(label)
                     fset = self.first[label]
                 totalset.update(fset)
                 overlapcheck[label] = fset
@@ -159,248 +296,10 @@ class ParserGenerator(object):
         for label, itsfirst in overlapcheck.items():
             for symbol in itsfirst:
                 if symbol in inverse:
-                    raise ValueError("rule %s is ambiguous; %s is in the"
-                                     " first sets of %s as well as %s" %
-                                     (name, symbol, label, inverse[symbol]))
+                    raise ValueError(
+                        "rule %s is ambiguous; %s is in the"
+                        " first sets of %s as well as %s"
+                        % (name, symbol, label, inverse[symbol])
+                    )
                 inverse[symbol] = label
         self.first[name] = totalset
-
-    def parse(self):
-        dfas = collections.OrderedDict()
-        startsymbol = None
-        # MSTART: (NEWLINE | RULE)* ENDMARKER
-        while self.type != tokenize.ENDMARKER:
-            while self.type == tokenize.NEWLINE:
-                self.gettoken()
-            # RULE: NAME ':' RHS NEWLINE
-            name = self.expect(tokenize.NAME)
-            if self.verbose:
-                print("Processing rule {dfa_name}".format(dfa_name=name))
-            self.expect(tokenize.OP, ":")
-            a, z = self.parse_rhs()
-            self.expect(tokenize.NEWLINE)
-            if self.verbose:
-                self.dump_nfa(name, a, z)
-            dfa = self.make_dfa(a, z)
-            if self.verbose:
-                self.dump_dfa(name, dfa)
-            self.simplify_dfa(dfa)
-            dfas[name] = dfa
-            if startsymbol is None:
-                startsymbol = name
-        return dfas, startsymbol
-
-    def make_dfa(self, start, finish):
-        # To turn an NFA into a DFA, we define the states of the DFA
-        # to correspond to *sets* of states of the NFA.  Then do some
-        # state reduction.  Let's represent sets as dicts with 1 for
-        # values.
-        assert isinstance(start, NFAState)
-        assert isinstance(finish, NFAState)
-        def closure(state):
-            base = set()
-            addclosure(state, base)
-            return base
-        def addclosure(state, base):
-            assert isinstance(state, NFAState)
-            if state in base:
-                return
-            base.add(state)
-            for label, next in state.arcs:
-                if label is None:
-                    addclosure(next, base)
-        states = [DFAState(closure(start), finish)]
-        for state in states: # NB states grows while we're iterating
-            arcs = {}
-            for nfastate in state.nfaset:
-                for label, next in nfastate.arcs:
-                    if label is not None:
-                        addclosure(next, arcs.setdefault(label, set()))
-            for label, nfaset in sorted(arcs.items()):
-                for st in states:
-                    if st.nfaset == nfaset:
-                        break
-                else:
-                    st = DFAState(nfaset, finish)
-                    states.append(st)
-                state.addarc(st, label)
-        return states # List of DFAState instances; first one is start
-
-    def dump_nfa(self, name, start, finish):
-        print("Dump of NFA for", name)
-        todo = [start]
-        for i, state in enumerate(todo):
-            print("  State", i, state is finish and "(final)" or "")
-            for label, next in state.arcs:
-                if next in todo:
-                    j = todo.index(next)
-                else:
-                    j = len(todo)
-                    todo.append(next)
-                if label is None:
-                    print("    -> %d" % j)
-                else:
-                    print("    %s -> %d" % (label, j))
-
-    def dump_dfa(self, name, dfa):
-        print("Dump of DFA for", name)
-        for i, state in enumerate(dfa):
-            print("  State", i, state.isfinal and "(final)" or "")
-            for label, next in sorted(state.arcs.items()):
-                print("    %s -> %d" % (label, dfa.index(next)))
-
-    def simplify_dfa(self, dfa):
-        # This is not theoretically optimal, but works well enough.
-        # Algorithm: repeatedly look for two states that have the same
-        # set of arcs (same labels pointing to the same nodes) and
-        # unify them, until things stop changing.
-
-        # dfa is a list of DFAState instances
-        changes = True
-        while changes:
-            changes = False
-            for i, state_i in enumerate(dfa):
-                for j in range(i+1, len(dfa)):
-                    state_j = dfa[j]
-                    if state_i == state_j:
-                        #print "  unify", i, j
-                        del dfa[j]
-                        for state in dfa:
-                            state.unifystate(state_j, state_i)
-                        changes = True
-                        break
-
-    def parse_rhs(self):
-        # RHS: ALT ('|' ALT)*
-        a, z = self.parse_alt()
-        if self.value != "|":
-            return a, z
-        else:
-            aa = NFAState()
-            zz = NFAState()
-            aa.addarc(a)
-            z.addarc(zz)
-            while self.value == "|":
-                self.gettoken()
-                a, z = self.parse_alt()
-                aa.addarc(a)
-                z.addarc(zz)
-            return aa, zz
-
-    def parse_alt(self):
-        # ALT: ITEM+
-        a, b = self.parse_item()
-        while (self.value in ("(", "[") or
-               self.type in (tokenize.NAME, tokenize.STRING)):
-            c, d = self.parse_item()
-            b.addarc(c)
-            b = d
-        return a, b
-
-    def parse_item(self):
-        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
-        if self.value == "[":
-            self.gettoken()
-            a, z = self.parse_rhs()
-            self.expect(tokenize.OP, "]")
-            a.addarc(z)
-            return a, z
-        else:
-            a, z = self.parse_atom()
-            value = self.value
-            if value not in ("+", "*"):
-                return a, z
-            self.gettoken()
-            z.addarc(a)
-            if value == "+":
-                return a, z
-            else:
-                return a, a
-
-    def parse_atom(self):
-        # ATOM: '(' RHS ')' | NAME | STRING
-        if self.value == "(":
-            self.gettoken()
-            a, z = self.parse_rhs()
-            self.expect(tokenize.OP, ")")
-            return a, z
-        elif self.type in (tokenize.NAME, tokenize.STRING):
-            a = NFAState()
-            z = NFAState()
-            a.addarc(z, self.value)
-            self.gettoken()
-            return a, z
-        else:
-            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
-                             self.type, self.value)
-
-    def expect(self, type, value=None):
-        if self.type != type or (value is not None and self.value != value):
-            self.raise_error("expected %s/%s, got %s/%s",
-                             type, value, self.type, self.value)
-        value = self.value
-        self.gettoken()
-        return value
-
-    def gettoken(self):
-        tup = next(self.generator)
-        while tup[0] in (tokenize.COMMENT, tokenize.NL):
-            tup = next(self.generator)
-        self.type, self.value, self.begin, self.end, self.line = tup
-        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
-
-    def raise_error(self, msg, *args):
-        if args:
-            try:
-                msg = msg % args
-            except Exception:
-                msg = " ".join([msg] + list(map(str, args)))
-        raise SyntaxError(msg, (self.filename, self.end[0],
-                                self.end[1], self.line))
-
-class NFAState(object):
-
-    def __init__(self):
-        self.arcs = [] # list of (label, NFAState) pairs
-
-    def addarc(self, next, label=None):
-        assert label is None or isinstance(label, str)
-        assert isinstance(next, NFAState)
-        self.arcs.append((label, next))
-
-class DFAState(object):
-
-    def __init__(self, nfaset, final):
-        assert isinstance(nfaset, set)
-        assert isinstance(next(iter(nfaset)), NFAState)
-        assert isinstance(final, NFAState)
-        self.nfaset = nfaset
-        self.isfinal = final in nfaset
-        self.arcs = {} # map from label to DFAState
-
-    def addarc(self, next, label):
-        assert isinstance(label, str)
-        assert label not in self.arcs
-        assert isinstance(next, DFAState)
-        self.arcs[label] = next
-
-    def unifystate(self, old, new):
-        for label, next in self.arcs.items():
-            if next is old:
-                self.arcs[label] = new
-
-    def __eq__(self, other):
-        # Equality test -- ignore the nfaset instance variable
-        assert isinstance(other, DFAState)
-        if self.isfinal != other.isfinal:
-            return False
-        # Can't just return self.arcs == other.arcs, because that
-        # would invoke this method recursively, with cycles...
-        if len(self.arcs) != len(other.arcs):
-            return False
-        for label, next in self.arcs.items():
-            if next is not other.arcs.get(label):
-                return False
-        return True
-
-    __hash__ = None # For Py3 compatibility.
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
index e7e8f3f..2cff62c 100644
--- a/Parser/pgen/token.py
+++ b/Parser/pgen/token.py
@@ -6,21 +6,21 @@ def generate_tokens(tokens):
     for line in tokens:
         line = line.strip()
 
-        if not line or line.startswith('#'):
+        if not line or line.startswith("#"):
             continue
 
         name = line.split()[0]
         yield (name, next(numbers))
 
-    yield ('N_TOKENS', next(numbers))
-    yield ('NT_OFFSET', 256)
+    yield ("N_TOKENS", next(numbers))
+    yield ("NT_OFFSET", 256)
 
 
 def generate_opmap(tokens):
     for line in tokens:
         line = line.strip()
 
-        if not line or line.startswith('#'):
+        if not line or line.startswith("#"):
             continue
 
         pieces = line.split()
@@ -35,4 +35,4 @@ def generate_opmap(tokens):
     # with the token generation in "generate_tokens" because if this
     # symbol is included in Grammar/Tokens, it will collide with !=
     # as it has the same name (NOTEQUAL).
-    yield ('<>', 'NOTEQUAL')
+    yield ("<>", "NOTEQUAL")
-- 
cgit v0.12