Diffstat (limited to 'Parser/pgen/metaparser.py')

 -rw-r--r--   Parser/pgen/metaparser.py | 152
 1 file changed, 152 insertions(+), 0 deletions(-)
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
new file mode 100644
index 0000000..074a083
--- /dev/null
+++ b/Parser/pgen/metaparser.py
@@ -0,0 +1,152 @@
+"""Parser for the Python metagrammar"""
+
+import io
+import tokenize  # from stdlib
+
+from .automata import NFA, NFAState
+
+
+class GrammarParser:
+    """Parser for Python grammar files."""
+
+    _translation_table = {
+        tokenize.NAME: "NAME",
+        tokenize.STRING: "STRING",
+        tokenize.NEWLINE: "NEWLINE",
+        tokenize.NL: "NL",
+        tokenize.OP: "OP",
+        tokenize.ENDMARKER: "ENDMARKER",
+        tokenize.COMMENT: "COMMENT",
+    }
+
+    def __init__(self, grammar):
+        self.grammar = grammar
+        grammar_adaptor = io.StringIO(grammar)
+        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
+        self._gettoken()  # Initialize lookahead
+        self._current_rule_name = None
+
+    def parse(self):
+        """Turn the grammar into a collection of NFAs"""
+        # grammar: (NEWLINE | rule)* ENDMARKER
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
+                self._gettoken()
+            # rule: NAME ':' rhs NEWLINE
+            self._current_rule_name = self._expect(tokenize.NAME)
+            self._expect(tokenize.OP, ":")
+            a, z = self._parse_rhs()
+            self._expect(tokenize.NEWLINE)
+
+            yield NFA(a, z)
+
+    def _parse_rhs(self):
+        # rhs: items ('|' items)*
+        a, z = self._parse_items()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState(self._current_rule_name)
+            zz = NFAState(self._current_rule_name)
+            while True:
+                # Add an arc from the new start state to this alternative and
+                # from the end of this alternative to the new end state, so
+                # that any of the alternatives can be taken.
+                aa.add_arc(a)
+                z.add_arc(zz)
+                if self.value != "|":
+                    break
+
+                self._gettoken()
+                a, z = self._parse_items()
+            return aa, zz
+
+    def _parse_items(self):
+        # items: item+
+        a, b = self._parse_item()
+        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
+            c, d = self._parse_item()
+            # Allow a transition between the end of the previous item
+            # and the beginning of the new one, connecting all the items
+            # together. In this way we can only reach the end if we visit
+            # all the items.
+            b.add_arc(c)
+            b = d
+        return a, b
+
+    def _parse_item(self):
+        # item: '[' rhs ']' | atom ['+' | '*']
+        if self.value == "[":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, "]")
+            # Make a transition from the beginning to the end so it is
+            # possible to advance for free to the next state of this item
+            # without consuming anything from the rhs.
+            a.add_arc(z)
+            return a, z
+        else:
+            a, z = self._parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self._gettoken()
+            z.add_arc(a)
+            if value == "+":
+                # Create a cycle to the beginning so we go back to the old
+                # state in this item and repeat.
+                return a, z
+            else:
+                # The end state is the same as the beginning, so we can cycle
+                # arbitrarily and end in the beginning if necessary.
+                return a, a
+
+    def _parse_atom(self):
+        # atom: '(' rhs ')' | NAME | STRING
+        if self.value == "(":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, ")")
+            return a, z
+        elif self.type in (tokenize.NAME, tokenize.STRING):
+            a = NFAState(self._current_rule_name)
+            z = NFAState(self._current_rule_name)
+            # We can only transition to the next state by consuming the value.
+            a.add_arc(z, self.value)
+            self._gettoken()
+            return a, z
+        else:
+            self._raise_error(
+                "expected (...) or NAME or STRING, got {} ({})",
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+
+    def _expect(self, type_, value=None):
+        if self.type != type_:
+            self._raise_error(
+                "expected {}, got {} ({})",
+                self._translation_table.get(type_, type_),
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+        if value is not None and self.value != value:
+            self._raise_error("expected {}, got {}", value, self.value)
+        value = self.value
+        self._gettoken()
+        return value
+
+    def _gettoken(self):
+        tup = next(self.generator)
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = next(self.generator)
+        self.type, self.value, self.begin, self.end, self.line = tup
+
+    def _raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg.format(*args)
+            except Exception:
+                msg = " ".join([msg] + list(map(str, args)))
+        line = self.grammar.splitlines()[self.begin[0] - 1]
+        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
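
For context, here is a minimal usage sketch of the class this patch adds. It assumes the containing pgen package (with the sibling automata module imported above) is importable under that name; the toy grammar and the exact SyntaxError text shown in the comments are illustrative, not part of this change.

import textwrap

from pgen.metaparser import GrammarParser  # assumed import path

# A toy grammar with three rules; dedent it so that tokenize sees no
# leading indentation.
grammar = textwrap.dedent("""\
    start: expr NEWLINE
    expr: term (('+' | '-') term)*
    term: NAME | '(' expr ')'
""")

# parse() is a generator that yields one NFA per rule.
nfas = list(GrammarParser(grammar).parse())
print(len(nfas))  # -> 3

# Malformed input surfaces as a SyntaxError pointing into the grammar,
# raised while the generator is being consumed.
try:
    list(GrammarParser("rule: )\n").parse())
except SyntaxError as exc:
    print(exc)  # e.g. expected (...) or NAME or STRING, got OP ())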