Diffstat (limited to 'Parser/pgen/pgen.py')
-rw-r--r--  Parser/pgen/pgen.py  305
1 file changed, 0 insertions(+), 305 deletions(-)
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
deleted file mode 100644
index 2f444eb..0000000
--- a/Parser/pgen/pgen.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""Python parser generator
-
-
-This parser generator transforms a Python grammar file into parsing tables
-that can be consumed by Python's LL(1) parser written in C.
-
-Concepts
---------
-
-* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
- top-down parser for a subset of context-free languages. It parses the input
- from Left to right, performing Leftmost derivation of the sentence, and can
- only use 1 token of lookahead when parsing a sentence.
-
-* A parsing table is a collection of data that a generic implementation of the
- LL(1) parser consumes to know how to parse a given context-free grammar. In
-  this case the collection of data involves Deterministic Finite Automata,
- calculated first sets, keywords and transition labels.
-
-* A grammar is defined by production rules (or just 'productions') that specify
- which symbols may replace which other symbols; these rules may be used to
- generate strings, or to parse them. Each such rule has a head, or left-hand
- side, which consists of the string that may be replaced, and a body, or
- right-hand side, which consists of a string that may replace it. In the
- Python grammar, rules are written in the form
-
- rule_name: rule_description;
-
- meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
- grammar is a grammar in which the left-hand side of each production rule
-  consists of only a single nonterminal symbol. Context-free grammars can
-  always be recognized by a non-deterministic pushdown automaton.
-
-* Terminal symbols are literal symbols which may appear in the outputs of the
- production rules of the grammar and which cannot be changed using the rules
- of the grammar. Applying the rules recursively to a source string of symbols
- will usually terminate in a final output string consisting only of terminal
- symbols.
-
-* Nonterminal symbols are those symbols which can be replaced. The grammar
-  includes a start symbol, a designated member of the set of nonterminals,
-  from which all the strings in the language may be derived by successive
-  applications of the production rules.
-
-* The language defined by the grammar is the set of terminal strings that can
-  be derived using the production rules.
-
-* The FIRST set of a rule (FIRST(rule)) is defined to be the set of terminals
- that can appear in the first position of any string derived from the rule.
-  This is useful for LL(1) parsers as the parser is only allowed to look at the
-  next token in the input to know which rule it needs to parse. For example, given
- this grammar:
-
- start: '(' A | B ')'
- A: 'a' '<'
- B: 'b' '<'
-
-  and the input '(b<)' the parser can only look at 'b' to know if it needs
-  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
-  that it needs to continue parsing rule B because only that rule can start
-  with 'b'.
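-
-  As an illustration (a sketch added here, not part of the module itself),
-  FIRST sets for a toy grammar like the one above can be computed with a
-  short recursive walk; 'rules' is a hypothetical mapping from rule names
-  to alternatives, each alternative being a list of symbols:
-
-      def first(symbol, rules):
-          if symbol not in rules:
-              # Terminals are their own FIRST set.
-              return {symbol}
-          result = set()
-          for alternative in rules[symbol]:
-              # Only the leading symbol matters (no empty alternatives here).
-              result |= first(alternative[0], rules)
-          return result
-
-      rules = {"start": [["(", "A"], ["B", ")"]],
-               "A": [["a", "<"]],
-               "B": [["b", "<"]]}
-      assert first("A", rules) == {"a"}
-      assert first("start", rules) == {"(", "b"}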
-
-Description
------------
-
-The input for the parser generator is a grammar in extended BNF form (using *
-for repetition, + for at-least-once repetition, [] for optional parts, | for
-alternatives and () for grouping).
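-
-For example, Python's grammar expresses repetition with these constructs in
-rules such as:
-
-    arith_expr: term (('+'|'-') term)*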
-
-Each rule in the grammar file is considered as a regular expression in its
-own right. It is turned into a Non-deterministic Finite Automaton (NFA),
-which is then turned into a Deterministic Finite Automaton (DFA), which is
-then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
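-
-As a minimal sketch of that NFA-to-DFA step (illustrative only; it assumes
-every NFA state has an entry in 'arcs' and that there are no epsilon arcs,
-which the real implementation in .automata does handle):
-
-    def nfa_to_dfa(start, arcs):
-        # arcs: dict mapping each NFA state -> {label: set of successors}.
-        # Each DFA state is the frozenset of NFA states it represents.
-        dfa = {}
-        todo = [frozenset({start})]
-        while todo:
-            current = todo.pop()
-            if current in dfa:
-                continue
-            moves = {}
-            for nfa_state in current:
-                for label, targets in arcs[nfa_state].items():
-                    moves.setdefault(label, set()).update(targets)
-            dfa[current] = {label: frozenset(t) for label, t in moves.items()}
-            todo.extend(dfa[current].values())
-        return dfa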
-
-The DFAs are used by the parser as parsing tables in a special way that's
-probably unique. Before they are usable, the FIRST sets of all nonterminals
-are computed so the LL(1) parser consuming the parsing tables can distinguish
-between different transitions.
-
-Reference
----------
-
-[Aho&Ullman 77]
- Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
- (first edition)
-"""
-
-from ast import literal_eval
-import collections
-import enum
-
-from . import grammar, token
-from .automata import DFA
-from .metaparser import GrammarParser
-
-
-class LabelType(enum.Enum):
- NONTERMINAL = 0
- NAMED_TOKEN = 1
- KEYWORD = 2
- OPERATOR = 3
- NONE = 4
-
-
-class Label(str):
-    def __init__(self, value):
-        # str.__new__ has already stored the text; only classify it here.
-        self.type = self._get_type()
-
- def _get_type(self):
- if self[0].isalpha():
- if self.upper() == self:
- # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
- return LabelType.NAMED_TOKEN
- else:
-                # If it is not uppercase it must be a nonterminal.
- return LabelType.NONTERMINAL
- else:
- # Keywords and operators are wrapped in quotes
- assert self[0] == self[-1] in ('"', "'"), self
- value = literal_eval(self)
- if value[0].isalpha():
- return LabelType.KEYWORD
- else:
- return LabelType.OPERATOR
-
- def __repr__(self):
- return "{}({})".format(self.type, super().__repr__())
-
-
-class ParserGenerator(object):
- def __init__(self, grammar_file, token_file, verbose=False):
- with open(grammar_file) as f:
- self.grammar = f.read()
- with open(token_file) as tok_file:
- token_lines = tok_file.readlines()
- self.tokens = dict(token.generate_tokens(token_lines))
- self.opmap = dict(token.generate_opmap(token_lines))
- # Manually add <> so it does not collide with !=
- self.opmap["<>"] = "NOTEQUAL"
- self.verbose = verbose
- self.filename = grammar_file
- self.dfas, self.startsymbol = self.create_dfas()
- self.first = {} # map from symbol name to set of tokens
- self.calculate_first_sets()
-
- def create_dfas(self):
- rule_to_dfas = collections.OrderedDict()
- start_nonterminal = None
- for nfa in GrammarParser(self.grammar).parse():
- if self.verbose:
- print("Dump of NFA for", nfa.name)
- nfa.dump()
- dfa = DFA.from_nfa(nfa)
- if self.verbose:
- print("Dump of DFA for", dfa.name)
- dfa.dump()
- dfa.simplify()
- rule_to_dfas[dfa.name] = dfa
-
- if start_nonterminal is None:
- start_nonterminal = dfa.name
-
- return rule_to_dfas, start_nonterminal
-
- def make_grammar(self):
- c = grammar.Grammar()
- c.all_labels = set()
- names = list(self.dfas.keys())
- names.remove(self.startsymbol)
- names.insert(0, self.startsymbol)
- for name in names:
- i = 256 + len(c.symbol2number)
- c.symbol2number[Label(name)] = i
- c.number2symbol[i] = Label(name)
- c.all_labels.add(name)
- for name in names:
- self.make_label(c, name)
- dfa = self.dfas[name]
- states = []
- for state in dfa:
- arcs = []
- for label, next in sorted(state.arcs.items()):
- c.all_labels.add(label)
- arcs.append((self.make_label(c, label), dfa.states.index(next)))
- if state.is_final:
- arcs.append((0, dfa.states.index(state)))
- states.append(arcs)
- c.states.append(states)
- c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
- c.start = c.symbol2number[self.startsymbol]
-
- if self.verbose:
- print("")
- print("Grammar summary")
- print("===============")
-
- print("- {n_labels} labels".format(n_labels=len(c.labels)))
- print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
- print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
- print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
- print(
- "- Start symbol: {start_symbol}".format(
- start_symbol=c.number2symbol[c.start]
- )
- )
- return c
-
- def make_first_sets(self, c, name):
- rawfirst = self.first[name]
- first = set()
- for label in sorted(rawfirst):
- ilabel = self.make_label(c, label)
- ##assert ilabel not in first # XXX failed on <> ... !=
- first.add(ilabel)
- return first
-
- def make_label(self, c, label):
- label = Label(label)
- ilabel = len(c.labels)
-
- if label.type == LabelType.NONTERMINAL:
- if label in c.symbol2label:
- return c.symbol2label[label]
- else:
- c.labels.append((c.symbol2number[label], None))
- c.symbol2label[label] = ilabel
- return ilabel
- elif label.type == LabelType.NAMED_TOKEN:
- # A named token (NAME, NUMBER, STRING)
- itoken = self.tokens.get(label, None)
- assert isinstance(itoken, int), label
- assert itoken in self.tokens.values(), label
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- elif label.type == LabelType.KEYWORD:
- # A keyword
- value = literal_eval(label)
- if value in c.keywords:
- return c.keywords[value]
- else:
- c.labels.append((self.tokens["NAME"], value))
- c.keywords[value] = ilabel
- return ilabel
- elif label.type == LabelType.OPERATOR:
- # An operator (any non-numeric token)
- value = literal_eval(label)
- tok_name = self.opmap[value] # Fails if unknown token
- itoken = self.tokens[tok_name]
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- else:
- raise ValueError("Cannot categorize label {}".format(label))
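-
-    # Summary of the encodings produced by make_label: a nonterminal becomes
-    # (symbol_number, None), a named token or operator becomes
-    # (token_number, None), and a keyword becomes (NAME_token_number, value),
-    # which lets the parser tell keywords apart from ordinary NAME tokens.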
-
- def calculate_first_sets(self):
- names = list(self.dfas.keys())
- for name in names:
- if name not in self.first:
- self.calculate_first_sets_for_rule(name)
-
- if self.verbose:
- print("First set for {dfa_name}".format(dfa_name=name))
- for item in self.first[name]:
- print(" - {terminal}".format(terminal=item))
-
- def calculate_first_sets_for_rule(self, name):
- dfa = self.dfas[name]
- self.first[name] = None # dummy to detect left recursion
- state = dfa.states[0]
- totalset = set()
- overlapcheck = {}
- for label, next in state.arcs.items():
- if label in self.dfas:
- if label in self.first:
- fset = self.first[label]
- if fset is None:
- raise ValueError("recursion for rule %r" % name)
- else:
- self.calculate_first_sets_for_rule(label)
- fset = self.first[label]
- totalset.update(fset)
- overlapcheck[label] = fset
- else:
- totalset.add(label)
- overlapcheck[label] = {label}
- inverse = {}
- for label, itsfirst in overlapcheck.items():
- for symbol in itsfirst:
- if symbol in inverse:
- raise ValueError(
- "rule %s is ambiguous; %s is in the"
- " first sets of %s as well as %s"
- % (name, symbol, label, inverse[symbol])
- )
- inverse[symbol] = label
- self.first[name] = totalset
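-
-# Hypothetical usage sketch (the file paths are illustrative assumptions):
-#
-#   gen = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
-#   tables = gen.make_grammar()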