Diffstat (limited to 'Parser/pgen/pgen.py')
-rw-r--r-- | Parser/pgen/pgen.py | 305
1 file changed, 0 insertions, 305 deletions
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
deleted file mode 100644
index 2f444eb..0000000
--- a/Parser/pgen/pgen.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""Python parser generator
-
-
-This parser generator transforms a Python grammar file into parsing tables
-that can be consumed by Python's LL(1) parser written in C.
-
-Concepts
---------
-
-* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
-  top-down parser for a subset of context-free languages. It parses the input
-  from Left to right, performing Leftmost derivation of the sentence, and can
-  only use 1 token of lookahead when parsing a sentence.
-
-* A parsing table is a collection of data that a generic implementation of the
-  LL(1) parser consumes to know how to parse a given context-free grammar. In
-  this case the collection of data involves Deterministic Finite Automata,
-  calculated first sets, keywords and transition labels.
-
-* A grammar is defined by production rules (or just 'productions') that specify
-  which symbols may replace which other symbols; these rules may be used to
-  generate strings, or to parse them. Each such rule has a head, or left-hand
-  side, which consists of the string that may be replaced, and a body, or
-  right-hand side, which consists of a string that may replace it. In the
-  Python grammar, rules are written in the form
-
-      rule_name: rule_description;
-
-  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
-  grammar is a grammar in which the left-hand side of each production rule
-  consists of only a single nonterminal symbol. Context-free grammars can
-  always be recognized by a non-deterministic automaton.
-
-* Terminal symbols are literal symbols which may appear in the outputs of the
-  production rules of the grammar and which cannot be changed using the rules
-  of the grammar. Applying the rules recursively to a source string of symbols
-  will usually terminate in a final output string consisting only of terminal
-  symbols.
-
-* Nonterminal symbols are those symbols which can be replaced. The grammar
-  includes a start symbol, a designated member of the set of nonterminals from
-  which all the strings in the language may be derived by successive
-  applications of the production rules.
-
-* The language defined by the grammar is the set of terminal strings that can
-  be derived using the production rules.
-
-* The first set of a rule (FIRST(rule)) is defined to be the set of terminals
-  that can appear in the first position of any string derived from the rule.
-  This is useful for LL(1) parsers as the parser is only allowed to look at the
-  next token in the input to know which rule it needs to parse. For example,
-  given this grammar:
-
-      start: '(' A | B ')'
-      A: 'a' '<'
-      B: 'b' '<'
-
-  and the input '(b<)', the parser can only look at 'b' to know if it needs
-  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'}, it knows
-  that it needs to continue parsing rule B because only that rule can start
-  with 'b'.
-
-Description
------------
-
-The input for the parser generator is a grammar in extended BNF form (using *
-for repetition, + for at-least-once repetition, [] for optional parts, | for
-alternatives and () for grouping).
-
-Each rule in the grammar file is considered as a regular expression in its
-own right. It is turned into a Non-deterministic Finite Automaton (NFA),
-which is then turned into a Deterministic Finite Automaton (DFA), which is
-then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
-
-The DFAs are used by the parser as parsing tables in a special way that's
-probably unique. Before they are usable, the FIRST sets of all nonterminals
-are computed so the LL(1) parser consuming the parsing tables can distinguish
-between different transitions.
-
-Reference
----------
-
-[Aho&Ullman 77]
-    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
-    (first edition)
-"""
-
-from ast import literal_eval
-import collections
-
-from . import grammar, token
-from .automata import DFA
-from .metaparser import GrammarParser
-
-import enum
-
-
-class LabelType(enum.Enum):
-    NONTERMINAL = 0
-    NAMED_TOKEN = 1
-    KEYWORD = 2
-    OPERATOR = 3
-    NONE = 4
-
-
-class Label(str):
-    def __init__(self, value):
-        self.type = self._get_type()
-
-    def _get_type(self):
-        if self[0].isalpha():
-            if self.upper() == self:
-                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
-                return LabelType.NAMED_TOKEN
-            else:
-                # If it is not uppercase, it must be a nonterminal.
-                return LabelType.NONTERMINAL
-        else:
-            # Keywords and operators are wrapped in quotes
-            assert self[0] == self[-1] in ('"', "'"), self
-            value = literal_eval(self)
-            if value[0].isalpha():
-                return LabelType.KEYWORD
-            else:
-                return LabelType.OPERATOR
-
-    def __repr__(self):
-        return "{}({})".format(self.type, super().__repr__())
-
-
-class ParserGenerator(object):
-    def __init__(self, grammar_file, token_file, verbose=False):
-        with open(grammar_file) as f:
-            self.grammar = f.read()
-        with open(token_file) as tok_file:
-            token_lines = tok_file.readlines()
-        self.tokens = dict(token.generate_tokens(token_lines))
-        self.opmap = dict(token.generate_opmap(token_lines))
-        # Manually add <> so it does not collide with !=
-        self.opmap["<>"] = "NOTEQUAL"
-        self.verbose = verbose
-        self.filename = grammar_file
-        self.dfas, self.startsymbol = self.create_dfas()
-        self.first = {}  # map from symbol name to set of tokens
-        self.calculate_first_sets()
-
-    def create_dfas(self):
-        rule_to_dfas = collections.OrderedDict()
-        start_nonterminal = None
-        for nfa in GrammarParser(self.grammar).parse():
-            if self.verbose:
-                print("Dump of NFA for", nfa.name)
-                nfa.dump()
-            dfa = DFA.from_nfa(nfa)
-            if self.verbose:
-                print("Dump of DFA for", dfa.name)
-                dfa.dump()
-            dfa.simplify()
-            rule_to_dfas[dfa.name] = dfa
-
-            if start_nonterminal is None:
-                start_nonterminal = dfa.name
-
-        return rule_to_dfas, start_nonterminal
-
-    def make_grammar(self):
-        c = grammar.Grammar()
-        c.all_labels = set()
-        names = list(self.dfas.keys())
-        names.remove(self.startsymbol)
-        names.insert(0, self.startsymbol)
-        for name in names:
-            i = 256 + len(c.symbol2number)
-            c.symbol2number[Label(name)] = i
-            c.number2symbol[i] = Label(name)
-            c.all_labels.add(name)
-        for name in names:
-            self.make_label(c, name)
-            dfa = self.dfas[name]
-            states = []
-            for state in dfa:
-                arcs = []
-                for label, next in sorted(state.arcs.items()):
-                    c.all_labels.add(label)
-                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
-                if state.is_final:
-                    arcs.append((0, dfa.states.index(state)))
-                states.append(arcs)
-            c.states.append(states)
-            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
-        c.start = c.symbol2number[self.startsymbol]
-
-        if self.verbose:
-            print("")
-            print("Grammar summary")
-            print("===============")
-
-            print("- {n_labels} labels".format(n_labels=len(c.labels)))
-            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens))) - print("- {n_keywords} keywords".format(n_keywords=len(c.keywords))) - print( - "- Start symbol: {start_symbol}".format( - start_symbol=c.number2symbol[c.start] - ) - ) - return c - - def make_first_sets(self, c, name): - rawfirst = self.first[name] - first = set() - for label in sorted(rawfirst): - ilabel = self.make_label(c, label) - ##assert ilabel not in first # XXX failed on <> ... != - first.add(ilabel) - return first - - def make_label(self, c, label): - label = Label(label) - ilabel = len(c.labels) - - if label.type == LabelType.NONTERMINAL: - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - return ilabel - elif label.type == LabelType.NAMED_TOKEN: - # A named token (NAME, NUMBER, STRING) - itoken = self.tokens.get(label, None) - assert isinstance(itoken, int), label - assert itoken in self.tokens.values(), label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - elif label.type == LabelType.KEYWORD: - # A keyword - value = literal_eval(label) - if value in c.keywords: - return c.keywords[value] - else: - c.labels.append((self.tokens["NAME"], value)) - c.keywords[value] = ilabel - return ilabel - elif label.type == LabelType.OPERATOR: - # An operator (any non-numeric token) - value = literal_eval(label) - tok_name = self.opmap[value] # Fails if unknown token - itoken = self.tokens[tok_name] - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - raise ValueError("Cannot categorize label {}".format(label)) - - def calculate_first_sets(self): - names = list(self.dfas.keys()) - for name in names: - if name not in self.first: - self.calculate_first_sets_for_rule(name) - - if self.verbose: - print("First set for {dfa_name}".format(dfa_name=name)) - for item in self.first[name]: - print(" - {terminal}".format(terminal=item)) - - def calculate_first_sets_for_rule(self, name): - dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa.states[0] - totalset = set() - overlapcheck = {} - for label, next in state.arcs.items(): - if label in self.dfas: - if label in self.first: - fset = self.first[label] - if fset is None: - raise ValueError("recursion for rule %r" % name) - else: - self.calculate_first_sets_for_rule(label) - fset = self.first[label] - totalset.update(fset) - overlapcheck[label] = fset - else: - totalset.add(label) - overlapcheck[label] = {label} - inverse = {} - for label, itsfirst in overlapcheck.items(): - for symbol in itsfirst: - if symbol in inverse: - raise ValueError( - "rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" - % (name, symbol, label, inverse[symbol]) - ) - inverse[symbol] = label - self.first[name] = totalset |