Diffstat (limited to 'Parser/pgen/metaparser.py')

 -rw-r--r--   Parser/pgen/metaparser.py | 152
 1 file changed, 152 insertions(+), 0 deletions(-)
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
new file mode 100644
index 0000000..074a083
--- /dev/null
+++ b/Parser/pgen/metaparser.py
@@ -0,0 +1,152 @@
+"""Parser for the Python metagrammar"""
+
+import io
+import tokenize  # from stdlib
+
+from .automata import NFA, NFAState
+
+
+class GrammarParser:
+    """Parser for Python grammar files."""
+
+    _translation_table = {
+        tokenize.NAME: "NAME",
+        tokenize.STRING: "STRING",
+        tokenize.NEWLINE: "NEWLINE",
+        tokenize.NL: "NL",
+        tokenize.OP: "OP",
+        tokenize.ENDMARKER: "ENDMARKER",
+        tokenize.COMMENT: "COMMENT",
+    }
+
+    def __init__(self, grammar):
+        self.grammar = grammar
+        grammar_adaptor = io.StringIO(grammar)
+        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
+        self._gettoken()  # Initialize lookahead
+        self._current_rule_name = None
+
+    def parse(self):
+        """Turn the grammar into a collection of NFAs"""
+        # grammar: (NEWLINE | rule)* ENDMARKER
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
+                self._gettoken()
+            # rule: NAME ':' rhs NEWLINE
+            self._current_rule_name = self._expect(tokenize.NAME)
+            self._expect(tokenize.OP, ":")
+            a, z = self._parse_rhs()
+            self._expect(tokenize.NEWLINE)
+
+            yield NFA(a, z)
+
+    def _parse_rhs(self):
+        # rhs: items ('|' items)*
+        a, z = self._parse_items()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState(self._current_rule_name)
+            zz = NFAState(self._current_rule_name)
+            while True:
+                # Add an arc from the new start state to this alternative and
+                # from the end of this alternative to the new end state, so
+                # that any of the alternatives can be taken.
+                aa.add_arc(a)
+                z.add_arc(zz)
+                if self.value != "|":
+                    break
+
+                self._gettoken()
+                a, z = self._parse_items()
+            return aa, zz
+
+    def _parse_items(self):
+        # items: item+
+        a, b = self._parse_item()
+        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
+            c, d = self._parse_item()
+            # Allow a transition between the end of the previous item
+            # and the beginning of the new one, connecting all the items
+            # together. In this way we can only reach the end if we visit
+            # all the items.
+            b.add_arc(c)
+            b = d
+        return a, b
+
+    def _parse_item(self):
+        # item: '[' rhs ']' | atom ['+' | '*']
+        if self.value == "[":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, "]")
+            # Make a transition from the beginning to the end so it is
+            # possible to advance for free to the next state of this item
+            # without consuming anything from the rhs.
+            a.add_arc(z)
+            return a, z
+        else:
+            a, z = self._parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self._gettoken()
+            z.add_arc(a)
+            if value == "+":
+                # Create a cycle to the beginning so we go back to the old
+                # state in this item and repeat.
+                return a, z
+            else:
+                # The end state is the same as the beginning, so we can cycle
+                # arbitrarily and end in the beginning if necessary.
+                return a, a
+
+    def _parse_atom(self):
+        # atom: '(' rhs ')' | NAME | STRING
+        if self.value == "(":
+            self._gettoken()
+            a, z = self._parse_rhs()
+            self._expect(tokenize.OP, ")")
+            return a, z
+        elif self.type in (tokenize.NAME, tokenize.STRING):
+            a = NFAState(self._current_rule_name)
+            z = NFAState(self._current_rule_name)
+            # We can only transition to the next state by consuming the value.
+            a.add_arc(z, self.value)
+            self._gettoken()
+            return a, z
+        else:
+            self._raise_error(
+                "expected (...) or NAME or STRING, got {} ({})",
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+
+    def _expect(self, type_, value=None):
+        if self.type != type_:
+            self._raise_error(
+                "expected {}, got {} ({})",
+                self._translation_table.get(type_, type_),
+                self._translation_table.get(self.type, self.type),
+                self.value,
+            )
+        if value is not None and self.value != value:
+            self._raise_error("expected {}, got {}", value, self.value)
+        value = self.value
+        self._gettoken()
+        return value
+
+    def _gettoken(self):
+        tup = next(self.generator)
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = next(self.generator)
+        self.type, self.value, self.begin, self.end, self.line = tup
+
+    def _raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg.format(*args)
+            except Exception:
+                msg = " ".join([msg] + list(map(str, args)))
+        line = self.grammar.splitlines()[self.begin[0] - 1]
+        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
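
For context, here is a minimal usage sketch of the class this patch adds. It assumes the containing pgen package (with the sibling automata module imported above) is importable under that name; the toy grammar and the exact SyntaxError text shown in the comments are illustrative, not part of this change.

import textwrap

from pgen.metaparser import GrammarParser  # assumed import path

# A toy grammar with three rules; dedent it so that tokenize sees no
# leading indentation.
grammar = textwrap.dedent("""\
    start: expr NEWLINE
    expr: term (('+' | '-') term)*
    term: NAME | '(' expr ')'
""")

# parse() is a generator that yields one NFA per rule.
nfas = list(GrammarParser(grammar).parse())
print(len(nfas))  # -> 3

# Malformed input surfaces as a SyntaxError pointing into the grammar,
# raised while the generator is being consumed.
try:
    list(GrammarParser("rule: )\n").parse())
except SyntaxError as exc:
    print(exc)  # e.g. expected (...) or NAME or STRING, got OP ())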