Diffstat (limited to 'Parser/pgen')
-rw-r--r--   Parser/pgen/__init__.py     0
-rw-r--r--   Parser/pgen/__main__.py    43
-rw-r--r--   Parser/pgen/automata.py   400
-rw-r--r--   Parser/pgen/grammar.py    147
-rw-r--r--   Parser/pgen/keywordgen.py  59
-rw-r--r--   Parser/pgen/metaparser.py 152
-rw-r--r--   Parser/pgen/pgen.py       310
-rw-r--r--   Parser/pgen/token.py       38
8 files changed, 0 insertions, 1149 deletions
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/Parser/pgen/__init__.py
+++ /dev/null
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
deleted file mode 100644
index d3780a7..0000000
--- a/Parser/pgen/__main__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import argparse
-
-from .pgen import ParserGenerator
-
-
-def main():
- parser = argparse.ArgumentParser(description="Parser generator main program.")
- parser.add_argument(
- "grammar", type=str, help="The file with the grammar definition in EBNF format"
- )
- parser.add_argument("tokens", type=str, help="The file with the token definitions")
- parser.add_argument(
- "graminit_h",
- type=argparse.FileType("w"),
- help="The path to write the grammar's non-terminals as #defines",
- )
- parser.add_argument(
- "graminit_c",
- type=argparse.FileType("w"),
- help="The path to write the grammar as initialized data",
- )
-
- parser.add_argument("--verbose", "-v", action="count")
- parser.add_argument(
- "--graph",
- type=argparse.FileType("w"),
- action="store",
- metavar="GRAPH_OUTPUT_FILE",
- help="Dumps a DOT representation of the generated automata in a file",
- )
-
- args = parser.parse_args()
-
- p = ParserGenerator(
- args.grammar, args.tokens, verbose=args.verbose, graph_file=args.graph
- )
- grammar = p.make_grammar()
- grammar.produce_graminit_h(args.graminit_h.write)
- grammar.produce_graminit_c(args.graminit_c.write)
-
-
-if __name__ == "__main__":
- main()
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
deleted file mode 100644
index f2ed221..0000000
--- a/Parser/pgen/automata.py
+++ /dev/null
@@ -1,400 +0,0 @@
-"""Classes representing state-machine concepts"""
-
-class NFA:
- """A non-deterministic finite automaton
-
- A non-deterministic finite automaton (NFA) is a form of finite state
- machine whose rules are less restrictive than a DFA's.
- The NFA rules are:
-
- * A transition can be non-deterministic and can lead to
- zero, one, or more states.
-
- * An epsilon transition consuming empty input is valid.
- Transitions consuming labeled symbols are also permitted.
-
- This class assumes that there is only one starting state and one
- accepting (ending) state.
-
- Attributes:
- name (str): The name of the rule the NFA is representing.
- start (NFAState): The starting state.
- end (NFAState): The ending state.
- """
-
- def __init__(self, start, end):
- self.name = start.rule_name
- self.start = start
- self.end = end
-
- def __repr__(self):
- return "NFA(start={}, end={})".format(self.start, self.end)
-
- def dump(self, writer=print):
- """Dump a graphical representation of the NFA"""
- todo = [self.start]
- for i, state in enumerate(todo):
- writer(" State", i, state is self.end and "(final)" or "")
- for arc in state.arcs:
- label = arc.label
- next = arc.target
- if next in todo:
- j = todo.index(next)
- else:
- j = len(todo)
- todo.append(next)
- if label is None:
- writer(" -> %d" % j)
- else:
- writer(" %s -> %d" % (label, j))
-
- def dump_graph(self, writer):
- """Dump a DOT representation of the NFA"""
- writer('digraph %s_nfa {\n' % self.name)
- todo = [self.start]
- for i, state in enumerate(todo):
- writer(' %d [label="State %d %s"];\n' % (i, i, state is self.end and "(final)" or ""))
- for arc in state.arcs:
- label = arc.label
- next = arc.target
- if next in todo:
- j = todo.index(next)
- else:
- j = len(todo)
- todo.append(next)
- if label is None:
- writer(" %d -> %d [style=dotted label=ε];\n" % (i, j))
- else:
- writer(" %d -> %d [label=%s];\n" % (i, j, label.replace("'", '"')))
- writer('}\n')
-
-
-class NFAArc:
- """An arc representing a transition between two NFA states.
-
- NFA states can be connected via two ways:
-
- * A label transition: An input equal to the label must
- be consumed to perform the transition.
- * An epsilon transition: The transition can be taken without
- consuming any input symbol.
-
- Attributes:
- target (NFAState): The ending state of the transition arc.
- label (Optional[str]): The label that must be consumed to make
- the transition. An epsilon transition is represented
- using `None`.
- """
-
- def __init__(self, target, label):
- self.target = target
- self.label = label
-
- def __repr__(self):
- return "<%s: %s>" % (self.__class__.__name__, self.label)
-
-
-class NFAState:
- """A state of an NFA (non-deterministic finite automaton).
-
- Attributes:
- rule_name (str): The name of the grammar rule this state belongs to.
- arcs (List[NFAArc]): The outgoing arcs (transitions) from this
- state to other NFA states. Each arc carries an optional label;
- an arc whose label is `None` represents an epsilon transition
- that consumes no input.
- """
-
- def __init__(self, rule_name):
- self.rule_name = rule_name
- self.arcs = []
-
- def add_arc(self, target, label=None):
- """Add a new arc to connect the state to a target state within the NFA
-
- The method adds a new arc to the list of arcs available as transitions
- from the present state. An optional label indicates a named transition
- that consumes an input, while the absence of a label represents an epsilon
- transition.
-
- Parameters:
- target (NFAState): The state at the end of the transition that the arc
- represents.
- label (Optional[str]): The label that must be consumed to make the
- transition. If no label is provided, the transition is an epsilon transition.
- """
- assert label is None or isinstance(label, str)
- assert isinstance(target, NFAState)
- self.arcs.append(NFAArc(target, label))
-
- def __repr__(self):
- return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
-
-
-class DFA:
- """A deterministic finite automaton
-
- A deterministic finite automaton (DFA) is a form of finite state machine
- that obeys the following rules:
-
- * Each of the transitions is uniquely determined by
- the source state and input symbol
- * Reading an input symbol is required for each state
- transition (no epsilon transitions).
-
- The finite-state machine accepts or rejects a string of symbols,
- producing a unique computation of the automaton for each input
- string. The DFA must have a unique starting state (represented as the first
- element in the list of states) but can have multiple final states.
-
- Attributes:
- name (str): The name of the rule the DFA is representing.
- states (List[DFAState]): A collection of DFA states.
- """
-
- def __init__(self, name, states):
- self.name = name
- self.states = states
-
- @classmethod
- def from_nfa(cls, nfa):
- """Constructs a DFA from an NFA using the Rabin–Scott construction algorithm.
-
- To simulate the operation of a DFA on a given input string, it's
- necessary to keep track of a single state at any time, or more precisely,
- the state that the automaton will reach after seeing a prefix of the
- input. In contrast, to simulate an NFA, it's necessary to keep track of
- a set of states: all of the states that the automaton could reach after
- seeing the same prefix of the input, according to the nondeterministic
- choices made by the automaton. There are two possible sources of
- non-determinism:
-
- 1) Multiple (two or more) transitions with the same label
-
- 'A' +-------+
- +----------->+ State +----------->+
- | | 2 |
- +-------+ +-------+
- | State |
- | 1 | +-------+
- +-------+ | State |
- +----------->+ 3 +----------->+
- 'A' +-------+
-
- 2) Epsilon transitions (transitions that can be taken without consuming any input)
-
- +-------+ +-------+
- | State | ε | State |
- | 1 +----------->+ 2 +----------->+
- +-------+ +-------+
-
- Looking at the first case above, given the input 'A' we cannot determine
- which of the two transitions should be followed. In the second case the
- problem is that we can choose either to take the epsilon transition or
- not to take it. To solve this we imagine that we follow all possibilities
- at the same time, and we construct new states from the sets of all the
- states that are reachable this way. For each case in the previous example:
-
-
- 1) For multiple transitions with the same label we collapse all of the
- target states into a single one
-
- +-------+ +-------+
- | State | 'A' | State |
- | 1 +----------->+ 2-3 +----------->+
- +-------+ +-------+
-
- 2) For epsilon transitions we collapse all epsilon-reachable states
- into the same one
-
- +-------+
- | State |
- | 1-2 +----------->
- +-------+
-
- Because the DFA states consist of sets of NFA states, an n-state NFA
- may be converted to a DFA with at most 2**n states. Notice that the
- constructed DFA is not minimal and can be simplified or reduced
- afterwards.
-
- Parameters:
- nfa (NFA): The NFA to transform into a DFA.
- """
- assert isinstance(nfa, NFA)
-
- def add_closure(nfa_state, base_nfa_set):
- """Calculate the epsilon-closure of a given state
-
- Add to the *base_nfa_set* all the states that are
- reachable from *nfa_state* via epsilon-transitions.
- """
- assert isinstance(nfa_state, NFAState)
- if nfa_state in base_nfa_set:
- return
- base_nfa_set.add(nfa_state)
- for nfa_arc in nfa_state.arcs:
- if nfa_arc.label is None:
- add_closure(nfa_arc.target, base_nfa_set)
-
- # Calculate the epsilon-closure of the starting state
- base_nfa_set = set()
- add_closure(nfa.start, base_nfa_set)
-
- # Start by visiting the NFA starting state (there is only one).
- states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
-
- for state in states: # NB states grow while we're iterating
-
- # Find transitions from the current state to other reachable states
- # and store them in a mapping that correlates each label to all the
- # possible reachable states that can be obtained by consuming a
- # token equal to the label. Each set of all the states that can
- # be reached after following a label will become a DFA state.
- arcs = {}
- for nfa_state in state.nfa_set:
- for nfa_arc in nfa_state.arcs:
- if nfa_arc.label is not None:
- nfa_set = arcs.setdefault(nfa_arc.label, set())
- # All states that can be reached by epsilon-transitions
- # are also included in the set of reachable states.
- add_closure(nfa_arc.target, nfa_set)
-
- # Now create new DFA states by visiting all possible transitions between
- # the current DFA state and the new power-set states (each nfa_set)
- # via the different labels. As the nodes are appended to *states* this
- # is performing a breadth-first search traversal over the power-set of
- # the states of the original NFA.
- for label, nfa_set in sorted(arcs.items()):
- for existing_state in states:
- if existing_state.nfa_set == nfa_set:
- # The DFA state already exists for this rule.
- next_state = existing_state
- break
- else:
- next_state = DFAState(nfa.name, nfa_set, nfa.end)
- states.append(next_state)
-
- # Add a transition between the current DFA state and the new
- # DFA state (the power-set state) via the current label.
- state.add_arc(next_state, label)
-
- return cls(nfa.name, states)
-
- def __iter__(self):
- return iter(self.states)
-
- def simplify(self):
- """Attempt to reduce the number of states of the DFA
-
- Transform the DFA into an equivalent DFA that has fewer states. To
- minimize it, two classes of states can be removed or merged from the
- original DFA without affecting the language it accepts:
-
- * Unreachable states are those that cannot be reached from the
- initial state of the DFA for any input string.
- * Nondistinguishable states are those that cannot be distinguished
- from one another for any input string.
-
- This algorithm does not achieve the optimal fully-reduced solution, but it
- works well enough for the particularities of the Python grammar. The
- algorithm repeatedly looks for two states that have the same set of
- arcs (same labels pointing to the same nodes) and unifies them, until
- things stop changing.
- """
- changes = True
- while changes:
- changes = False
- for i, state_i in enumerate(self.states):
- for j in range(i + 1, len(self.states)):
- state_j = self.states[j]
- if state_i == state_j:
- del self.states[j]
- for state in self.states:
- state.unifystate(state_j, state_i)
- changes = True
- break
-
- def dump(self, writer=print):
- """Dump a graphical representation of the DFA"""
- for i, state in enumerate(self.states):
- writer(" State", i, state.is_final and "(final)" or "")
- for label, next in sorted(state.arcs.items()):
- writer(" %s -> %d" % (label, self.states.index(next)))
-
- def dump_graph(self, writer):
- """Dump a DOT representation of the DFA"""
- writer('digraph %s_dfa {\n' % self.name)
- for i, state in enumerate(self.states):
- writer(' %d [label="State %d %s"];\n' % (i, i, state.is_final and "(final)" or ""))
- for label, next in sorted(state.arcs.items()):
- writer(" %d -> %d [label=%s];\n" % (i, self.states.index(next), label.replace("'", '"')))
- writer('}\n')
-
-
-class DFAState(object):
- """A state of a DFA
-
- Attributes:
- rule_name (str): The name of the DFA rule containing the represented state.
- nfa_set (Set[NFAState]): The set of NFA states used to create this state.
- is_final (bool): True if the state is an accepting (final) state of the
- DFA containing this state.
- arcs (Dict[str, DFAState]): A mapping representing transitions between
- the current DFA state and other DFA states via following a label.
- """
-
- def __init__(self, rule_name, nfa_set, final):
- assert isinstance(nfa_set, set)
- assert isinstance(next(iter(nfa_set)), NFAState)
- assert isinstance(final, NFAState)
- self.rule_name = rule_name
- self.nfa_set = nfa_set
- self.arcs = {} # map from terminals/nonterminals to DFAState
- self.is_final = final in nfa_set
-
- def add_arc(self, target, label):
- """Add a new arc to the current state.
-
- Parameters:
- target (DFAState): The DFA state at the end of the arc.
- label (str): The label representing the token that must be consumed
- to perform this transition.
- """
- assert isinstance(label, str)
- assert label not in self.arcs
- assert isinstance(target, DFAState)
- self.arcs[label] = target
-
- def unifystate(self, old, new):
- """Replace all arcs from the current node to *old* with *new*.
-
- Parameters:
- old (DFAState): The DFA state to remove from all existing arcs.
- new (DFAState): The DFA state to replace in all existing arcs.
- """
- for label, next_ in self.arcs.items():
- if next_ is old:
- self.arcs[label] = new
-
- def __eq__(self, other):
- # The nfa_set does not matter for equality
- assert isinstance(other, DFAState)
- if self.is_final != other.is_final:
- return False
- # We cannot just return self.arcs == other.arcs because that
- # would invoke this method recursively if there are any cycles.
- if len(self.arcs) != len(other.arcs):
- return False
- for label, next_ in self.arcs.items():
- if next_ is not other.arcs.get(label):
- return False
- return True
-
- __hash__ = None # For Py3 compatibility.
-
- def __repr__(self):
- return "<%s: %s is_final=%s>" % (
- self.__class__.__name__,
- self.rule_name,
- self.is_final,
- )
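
As an illustration of how the classes above fit together, the following sketch hand-builds the NFA for a hypothetical rule "pair: 'x' 'y'" and runs it through the subset construction and simplification described in the docstrings. The import path assumes a CPython checkout that still contains Parser/pgen; everything else uses only the classes defined in this file.

    from Parser.pgen.automata import NFA, NFAState, DFA

    # Three NFA states for the hypothetical rule "pair: 'x' 'y'".
    start = NFAState("pair")   # before consuming anything
    middle = NFAState("pair")  # after consuming 'x'
    end = NFAState("pair")     # accepting state, after consuming 'y'

    start.add_arc(middle, "'x'")  # labeled transition consuming 'x'
    middle.add_arc(end, "'y'")    # labeled transition consuming 'y'

    nfa = NFA(start, end)
    nfa.dump()                 # textual dump of the NFA

    dfa = DFA.from_nfa(nfa)    # Rabin-Scott powerset construction
    dfa.simplify()             # merge equivalent states
    dfa.dump()                 # textual dump of the (possibly smaller) DFA
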
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
deleted file mode 100644
index ce40e16..0000000
--- a/Parser/pgen/grammar.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import collections
-
-
-class Grammar:
- """Pgen parsing tables class.
-
- The instance variables are as follows:
-
- symbol2number -- a dict mapping symbol names to numbers. Symbol
- numbers are always 256 or higher, to distinguish
- them from token numbers, which are between 0 and
- 255 (inclusive).
-
- number2symbol -- a dict mapping numbers to symbol names;
- these two are each other's inverse.
-
- states -- a list of DFAs, where each DFA is a list of
- states, each state is a list of arcs, and each
- arc is an (i, j) pair where i is a label number and j is
- a state number. The DFA number is the index into
- this list. (This name is slightly confusing.)
- Final states are represented by a special arc of
- the form (0, j) where j is its own state number.
-
- dfas -- a dict mapping symbol numbers to (DFA, first)
- pairs, where DFA is an item from the states list
- above, and first is a set of tokens that can
- begin this grammar rule.
-
- labels -- a list of (x, y) pairs where x is either a token
- number or a symbol number, and y is either None
- or a string; the strings are keywords. The label
- number is the index in this list; label numbers
- are used to mark state transitions (arcs) in the
- DFAs.
-
- start -- the number of the grammar's start symbol.
-
- keywords -- a dict mapping keyword strings to arc labels.
-
- tokens -- a dict mapping token numbers to arc labels.
-
- """
-
- def __init__(self):
- self.symbol2number = collections.OrderedDict()
- self.number2symbol = collections.OrderedDict()
- self.states = []
- self.dfas = collections.OrderedDict()
- self.labels = [(0, "EMPTY")]
- self.keywords = collections.OrderedDict()
- self.tokens = collections.OrderedDict()
- self.symbol2label = collections.OrderedDict()
- self.start = 256
-
- def produce_graminit_h(self, writer):
- writer("/* Generated by Parser/pgen */\n\n")
- for number, symbol in self.number2symbol.items():
- writer("#define {} {}\n".format(symbol, number))
-
- def produce_graminit_c(self, writer):
- writer("/* Generated by Parser/pgen */\n\n")
-
- writer('#include "exports.h"\n')
- writer('#include "grammar.h"\n')
- writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar;\n")
-
- self.print_dfas(writer)
- self.print_labels(writer)
-
- writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar = {\n")
- writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
- writer(" dfas,\n")
- writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
- writer(" {start_number}\n".format(start_number=self.start))
- writer("};\n")
-
- def print_labels(self, writer):
- writer(
- "static const label labels[{n_labels}] = {{\n".format(
- n_labels=len(self.labels)
- )
- )
- for label, name in self.labels:
- label_name = '"{}"'.format(name) if name is not None else 0
- writer(
- " {{{label}, {label_name}}},\n".format(
- label=label, label_name=label_name
- )
- )
- writer("};\n")
-
- def print_dfas(self, writer):
- self.print_states(writer)
- writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas)))
- for dfaindex, dfa_elem in enumerate(self.dfas.items()):
- symbol, (dfa, first_sets) = dfa_elem
- writer(
- ' {{{dfa_symbol}, "{symbol_name}", '.format(
- dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
- )
- + "{n_states}, states_{dfa_index},\n".format(
- n_states=len(dfa), dfa_index=dfaindex
- )
- + ' "'
- )
-
- bitset = bytearray((len(self.labels) >> 3) + 1)
- for token in first_sets:
- bitset[token >> 3] |= 1 << (token & 7)
- for byte in bitset:
- writer("\\%03o" % (byte & 0xFF))
- writer('"},\n')
- writer("};\n")
-
- def print_states(self, write):
- for dfaindex, dfa in enumerate(self.states):
- self.print_arcs(write, dfaindex, dfa)
- write(
- "static state states_{dfa_index}[{n_states}] = {{\n".format(
- dfa_index=dfaindex, n_states=len(dfa)
- )
- )
- for stateindex, state in enumerate(dfa):
- narcs = len(state)
- write(
- " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
- n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
- )
- )
- write("};\n")
-
- def print_arcs(self, write, dfaindex, states):
- for stateindex, state in enumerate(states):
- narcs = len(state)
- write(
- "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
- dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
- )
- )
- for a, b in state:
- write(
- " {{{from_label}, {to_state}}},\n".format(
- from_label=a, to_state=b
- )
- )
- write("};\n")
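
The Grammar object above is only a container that the generator fills in and then serializes to C. A minimal sketch of that flow, assuming the same Parser.pgen import path, with a single hypothetical nonterminal and the header output written to stdout:

    import sys

    from Parser.pgen.grammar import Grammar

    g = Grammar()
    # Register one hypothetical nonterminal; symbol numbers start at 256 so
    # they never clash with token numbers (0-255).
    g.symbol2number["single_input"] = 256
    g.number2symbol[256] = "single_input"

    # Emit the #define block that the real generator writes into graminit.h.
    g.produce_graminit_h(sys.stdout.write)
    # /* Generated by Parser/pgen */
    #
    # #define single_input 256
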
diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py
deleted file mode 100644
index f0234a8..0000000
--- a/Parser/pgen/keywordgen.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen"""
-
-import argparse
-
-from .pgen import ParserGenerator
-
-TEMPLATE = r'''
-"""Keywords (from "Grammar/Grammar")
-
-This file is automatically generated; please don't muck it up!
-
-To update the symbols in this file, 'cd' to the top directory of
-the python source tree and run:
-
- python3 -m Parser.pgen.keywordgen Grammar/Grammar \
- Grammar/Tokens \
- Lib/keyword.py
-
-Alternatively, you can run 'make regen-keyword'.
-"""
-
-__all__ = ["iskeyword", "kwlist"]
-
-kwlist = [
- {keywords}
-]
-
-iskeyword = frozenset(kwlist).__contains__
-'''.lstrip()
-
-EXTRA_KEYWORDS = ["async", "await"]
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Generate the Lib/keyword.py " "file from the grammar."
- )
- parser.add_argument(
- "grammar", type=str, help="The file with the grammar definition in EBNF format"
- )
- parser.add_argument("tokens", type=str, help="The file with the token definitions")
- parser.add_argument(
- "keyword_file",
- type=argparse.FileType("w"),
- help="The path to write the keyword definitions",
- )
- args = parser.parse_args()
- p = ParserGenerator(args.grammar, args.tokens)
- grammar = p.make_grammar()
-
- with args.keyword_file as thefile:
- all_keywords = sorted(list(grammar.keywords) + EXTRA_KEYWORDS)
-
- keywords = ",\n ".join(map(repr, all_keywords))
- thefile.write(TEMPLATE.format(keywords=keywords))
-
-
-if __name__ == "__main__":
- main()
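
The part of this module that does real work is the TEMPLATE rendering. A small sketch of that step, with a hypothetical, abbreviated keyword list standing in for grammar.keywords and the import path assumed as before:

    from Parser.pgen.keywordgen import EXTRA_KEYWORDS, TEMPLATE

    # Hypothetical subset of keywords; the real list comes from the grammar.
    demo_keywords = ["False", "None", "True", "and", "not", "or"]

    all_keywords = sorted(demo_keywords + EXTRA_KEYWORDS)
    keywords = ",\n    ".join(map(repr, all_keywords))
    print(TEMPLATE.format(keywords=keywords))  # the text written to Lib/keyword.py
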
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
deleted file mode 100644
index 074a083..0000000
--- a/Parser/pgen/metaparser.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""Parser for the Python metagrammar"""
-
-import io
-import tokenize # from stdlib
-
-from .automata import NFA, NFAState
-
-
-class GrammarParser:
- """Parser for Python grammar files."""
-
- _translation_table = {
- tokenize.NAME: "NAME",
- tokenize.STRING: "STRING",
- tokenize.NEWLINE: "NEWLINE",
- tokenize.NL: "NL",
- tokenize.OP: "OP",
- tokenize.ENDMARKER: "ENDMARKER",
- tokenize.COMMENT: "COMMENT",
- }
-
- def __init__(self, grammar):
- self.grammar = grammar
- grammar_adaptor = io.StringIO(grammar)
- self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
- self._gettoken() # Initialize lookahead
- self._current_rule_name = None
-
- def parse(self):
- """Turn the grammar into a collection of NFAs"""
- # grammar: (NEWLINE | rule)* ENDMARKER
- while self.type != tokenize.ENDMARKER:
- while self.type == tokenize.NEWLINE:
- self._gettoken()
- # rule: NAME ':' rhs NEWLINE
- self._current_rule_name = self._expect(tokenize.NAME)
- self._expect(tokenize.OP, ":")
- a, z = self._parse_rhs()
- self._expect(tokenize.NEWLINE)
-
- yield NFA(a, z)
-
- def _parse_rhs(self):
- # rhs: items ('|' items)*
- a, z = self._parse_items()
- if self.value != "|":
- return a, z
- else:
- aa = NFAState(self._current_rule_name)
- zz = NFAState(self._current_rule_name)
- while True:
- # Add an epsilon transition from the shared start state to the start of
- # each alternative, and from the end of each alternative to the shared
- # end state, so that any one of the alternatives can be taken.
- aa.add_arc(a)
- z.add_arc(zz)
- if self.value != "|":
- break
-
- self._gettoken()
- a, z = self._parse_items()
- return aa, zz
-
- def _parse_items(self):
- # items: item+
- a, b = self._parse_item()
- while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
- c, d = self._parse_item()
- # Add a transition between the end of the previous item
- # and the beginning of the new one, connecting all the items
- # together. In this way we can only reach the end if we visit
- # all the items.
- b.add_arc(c)
- b = d
- return a, b
-
- def _parse_item(self):
- # item: '[' rhs ']' | atom ['+' | '*']
- if self.value == "[":
- self._gettoken()
- a, z = self._parse_rhs()
- self._expect(tokenize.OP, "]")
- # Make an epsilon transition from the beginning to the end so it is
- # possible to advance to the next state of this item without consuming
- # anything from the rhs (the bracketed part is optional).
- a.add_arc(z)
- return a, z
- else:
- a, z = self._parse_atom()
- value = self.value
- if value not in ("+", "*"):
- return a, z
- self._gettoken()
- z.add_arc(a)
- if value == "+":
- # The arc back to the start creates a cycle so the atom can repeat;
- # keeping z as the end state requires at least one occurrence.
- return a, z
- else:
- # For '*' the end state is the start state itself, so the atom can be
- # repeated arbitrarily or skipped entirely (zero occurrences).
- return a, a
-
- def _parse_atom(self):
- # atom: '(' rhs ')' | NAME | STRING
- if self.value == "(":
- self._gettoken()
- a, z = self._parse_rhs()
- self._expect(tokenize.OP, ")")
- return a, z
- elif self.type in (tokenize.NAME, tokenize.STRING):
- a = NFAState(self._current_rule_name)
- z = NFAState(self._current_rule_name)
- # We can only transition to the next state by consuming the value.
- a.add_arc(z, self.value)
- self._gettoken()
- return a, z
- else:
- self._raise_error(
- "expected (...) or NAME or STRING, got {} ({})",
- self._translation_table.get(self.type, self.type),
- self.value,
- )
-
- def _expect(self, type_, value=None):
- if self.type != type_:
- self._raise_error(
- "expected {}, got {} ({})",
- self._translation_table.get(type_, type_),
- self._translation_table.get(self.type, self.type),
- self.value,
- )
- if value is not None and self.value != value:
- self._raise_error("expected {}, got {}", value, self.value)
- value = self.value
- self._gettoken()
- return value
-
- def _gettoken(self):
- tup = next(self.generator)
- while tup[0] in (tokenize.COMMENT, tokenize.NL):
- tup = next(self.generator)
- self.type, self.value, self.begin, self.end, self.line = tup
-
- def _raise_error(self, msg, *args):
- if args:
- try:
- msg = msg.format(*args)
- except Exception:
- msg = " ".join([msg] + list(map(str, args)))
- line = self.grammar.splitlines()[self.begin[0] - 1]
- raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
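
A short sketch of the metaparser in action on a tiny, hypothetical grammar written in the same EBNF dialect as Grammar/Grammar; it relies only on GrammarParser.parse() yielding one NFA per rule, as implemented above:

    from Parser.pgen.metaparser import GrammarParser

    # Hypothetical one-rule grammar with grouping and repetition.
    toy_grammar = "start: 'a' ('+' 'a')* NEWLINE\n"

    for nfa in GrammarParser(toy_grammar).parse():
        print("NFA for rule:", nfa.name)
        nfa.dump()  # prints the states and their labeled/epsilon arcs
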
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
deleted file mode 100644
index 03032d4..0000000
--- a/Parser/pgen/pgen.py
+++ /dev/null
@@ -1,310 +0,0 @@
-"""Python parser generator
-
-
-This parser generator transforms a Python grammar file into parsing tables
-that can be consumed by Python's LL(1) parser written in C.
-
-Concepts
---------
-
-* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
- top-down parser for a subset of context-free languages. It parses the input
- from Left to right, performing Leftmost derivation of the sentence, and can
- only use 1 token of lookahead when parsing a sentence.
-
-* A parsing table is a collection of data that a generic implementation of the
- LL(1) parser consumes to know how to parse a given context-free grammar. In
- this case the collection of data involves Deterministic Finite Automatons,
- calculated first sets, keywords and transition labels.
-
-* A grammar is defined by production rules (or just 'productions') that specify
- which symbols may replace which other symbols; these rules may be used to
- generate strings, or to parse them. Each such rule has a head, or left-hand
- side, which consists of the string that may be replaced, and a body, or
- right-hand side, which consists of a string that may replace it. In the
- Python grammar, rules are written in the form
-
- rule_name: rule_description;
-
- meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
- grammar is a grammar in which the left-hand side of each production rule
- consists of only a single nonterminal symbol. Context-free grammars can
- always be recognized by a Non-Deterministic Automatons.
-
-* Terminal symbols are literal symbols which may appear in the outputs of the
- production rules of the grammar and which cannot be changed using the rules
- of the grammar. Applying the rules recursively to a source string of symbols
- will usually terminate in a final output string consisting only of terminal
- symbols.
-
-* Nonterminal symbols are those symbols which can be replaced. The grammar
- includes a start symbol, a designated member of the set of nonterminals from
- which all the strings in the language may be derived by successive
- applications of the production rules.
-
-* The language defined by the grammar is defined as the set of terminal strings
- that can be derived using the production rules.
-
- * The first set of a rule (FIRST(rule)) is defined to be the set of terminals
- that can appear in the first position of any string derived from the rule.
- This is useful for LL(1) parsers as the parser is only allowed to look at the
- next token in the input to decide which rule it needs to parse. For example, given
- this grammar:
-
- start: '(' A | B ')'
- A: 'a' '<'
- B: 'b' '<'
-
- and the input '(b<)' the parser can only look at 'b' to know if it needs
- to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'}, it knows
- that it needs to continue parsing rule B because only that rule can start
- with 'b'.
-
-Description
------------
-
-The input for the parser generator is a grammar in extended BNF form (using *
-for repetition, + for at-least-once repetition, [] for optional parts, | for
-alternatives and () for grouping).
-
-Each rule in the grammar file is considered as a regular expression in its
-own right. It is turned into a Non-deterministic Finite Automaton (NFA),
-which is then turned into a Deterministic Finite Automaton (DFA), which is
-then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
-
- The DFAs are used by the parser as parsing tables in a special way that's
- probably unique. Before they are usable, the FIRST sets of all non-terminals
- are computed so the LL(1) parser consuming them can distinguish transitions.
-
- Reference
----------
-
-[Aho&Ullman 77]
- Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
- (first edition)
-"""
-
-from ast import literal_eval
-import collections
-
-from . import grammar, token
-from .automata import DFA
-from .metaparser import GrammarParser
-
-import enum
-
-
-class LabelType(enum.Enum):
- NONTERMINAL = 0
- NAMED_TOKEN = 1
- KEYWORD = 2
- OPERATOR = 3
- NONE = 4
-
-
-class Label(str):
- def __init__(self, value):
- self.type = self._get_type()
-
- def _get_type(self):
- if self[0].isalpha():
- if self.upper() == self:
- # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
- return LabelType.NAMED_TOKEN
- else:
- # If it is not uppercase, it must be a nonterminal.
- return LabelType.NONTERMINAL
- else:
- # Keywords and operators are wrapped in quotes
- assert self[0] == self[-1] in ('"', "'"), self
- value = literal_eval(self)
- if value[0].isalpha():
- return LabelType.KEYWORD
- else:
- return LabelType.OPERATOR
-
- def __repr__(self):
- return "{}({})".format(self.type, super().__repr__())
-
-
-class ParserGenerator(object):
- def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
- with open(grammar_file) as f:
- self.grammar = f.read()
- with open(token_file) as tok_file:
- token_lines = tok_file.readlines()
- self.tokens = dict(token.generate_tokens(token_lines))
- self.opmap = dict(token.generate_opmap(token_lines))
- # Manually add <> so it does not collide with !=
- self.opmap["<>"] = "NOTEQUAL"
- self.verbose = verbose
- self.filename = grammar_file
- self.graph_file = graph_file
- self.dfas, self.startsymbol = self.create_dfas()
- self.first = {} # map from symbol name to set of tokens
- self.calculate_first_sets()
-
- def create_dfas(self):
- rule_to_dfas = collections.OrderedDict()
- start_nonterminal = None
- for nfa in GrammarParser(self.grammar).parse():
- if self.verbose:
- print("Dump of NFA for", nfa.name)
- nfa.dump()
- if self.graph_file is not None:
- nfa.dump_graph(self.graph_file.write)
- dfa = DFA.from_nfa(nfa)
- if self.verbose:
- print("Dump of DFA for", dfa.name)
- dfa.dump()
- dfa.simplify()
- if self.graph_file is not None:
- dfa.dump_graph(self.graph_file.write)
- rule_to_dfas[dfa.name] = dfa
-
- if start_nonterminal is None:
- start_nonterminal = dfa.name
-
- return rule_to_dfas, start_nonterminal
-
- def make_grammar(self):
- c = grammar.Grammar()
- c.all_labels = set()
- names = list(self.dfas.keys())
- names.remove(self.startsymbol)
- names.insert(0, self.startsymbol)
- for name in names:
- i = 256 + len(c.symbol2number)
- c.symbol2number[Label(name)] = i
- c.number2symbol[i] = Label(name)
- c.all_labels.add(name)
- for name in names:
- self.make_label(c, name)
- dfa = self.dfas[name]
- states = []
- for state in dfa:
- arcs = []
- for label, next in sorted(state.arcs.items()):
- c.all_labels.add(label)
- arcs.append((self.make_label(c, label), dfa.states.index(next)))
- if state.is_final:
- arcs.append((0, dfa.states.index(state)))
- states.append(arcs)
- c.states.append(states)
- c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
- c.start = c.symbol2number[self.startsymbol]
-
- if self.verbose:
- print("")
- print("Grammar summary")
- print("===============")
-
- print("- {n_labels} labels".format(n_labels=len(c.labels)))
- print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
- print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
- print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
- print(
- "- Start symbol: {start_symbol}".format(
- start_symbol=c.number2symbol[c.start]
- )
- )
- return c
-
- def make_first_sets(self, c, name):
- rawfirst = self.first[name]
- first = set()
- for label in sorted(rawfirst):
- ilabel = self.make_label(c, label)
- ##assert ilabel not in first # XXX failed on <> ... !=
- first.add(ilabel)
- return first
-
- def make_label(self, c, label):
- label = Label(label)
- ilabel = len(c.labels)
-
- if label.type == LabelType.NONTERMINAL:
- if label in c.symbol2label:
- return c.symbol2label[label]
- else:
- c.labels.append((c.symbol2number[label], None))
- c.symbol2label[label] = ilabel
- return ilabel
- elif label.type == LabelType.NAMED_TOKEN:
- # A named token (NAME, NUMBER, STRING)
- itoken = self.tokens.get(label, None)
- assert isinstance(itoken, int), label
- assert itoken in self.tokens.values(), label
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- elif label.type == LabelType.KEYWORD:
- # A keyword
- value = literal_eval(label)
- if value in c.keywords:
- return c.keywords[value]
- else:
- c.labels.append((self.tokens["NAME"], value))
- c.keywords[value] = ilabel
- return ilabel
- elif label.type == LabelType.OPERATOR:
- # An operator (any non-numeric token)
- value = literal_eval(label)
- tok_name = self.opmap[value] # Fails if unknown token
- itoken = self.tokens[tok_name]
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- else:
- raise ValueError("Cannot categorize label {}".format(label))
-
- def calculate_first_sets(self):
- names = list(self.dfas.keys())
- for name in names:
- if name not in self.first:
- self.calculate_first_sets_for_rule(name)
-
- if self.verbose:
- print("First set for {dfa_name}".format(dfa_name=name))
- for item in self.first[name]:
- print(" - {terminal}".format(terminal=item))
-
- def calculate_first_sets_for_rule(self, name):
- dfa = self.dfas[name]
- self.first[name] = None # dummy to detect left recursion
- state = dfa.states[0]
- totalset = set()
- overlapcheck = {}
- for label, next in state.arcs.items():
- if label in self.dfas:
- if label in self.first:
- fset = self.first[label]
- if fset is None:
- raise ValueError("recursion for rule %r" % name)
- else:
- self.calculate_first_sets_for_rule(label)
- fset = self.first[label]
- totalset.update(fset)
- overlapcheck[label] = fset
- else:
- totalset.add(label)
- overlapcheck[label] = {label}
- inverse = {}
- for label, itsfirst in overlapcheck.items():
- for symbol in itsfirst:
- if symbol in inverse:
- raise ValueError(
- "rule %s is ambiguous; %s is in the"
- " first sets of %s as well as %s"
- % (name, symbol, label, inverse[symbol])
- )
- inverse[symbol] = label
- self.first[name] = totalset
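
Driving ParserGenerator by hand follows the same steps as the __main__ module shown earlier. A compact sketch, assuming it is run from the root of a CPython checkout that still contains Parser/pgen, Grammar/Grammar and Grammar/Tokens (the output filenames here are placeholders; the real build writes Include/graminit.h and Python/graminit.c):

    from Parser.pgen.pgen import ParserGenerator

    p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=False)
    g = p.make_grammar()

    # Serialize the parsing tables as C data, mirroring __main__.main().
    with open("graminit.h", "w") as header, open("graminit.c", "w") as source:
        g.produce_graminit_h(header.write)
        g.produce_graminit_c(source.write)
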
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
deleted file mode 100644
index 2cff62c..0000000
--- a/Parser/pgen/token.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import itertools
-
-
-def generate_tokens(tokens):
- numbers = itertools.count(0)
- for line in tokens:
- line = line.strip()
-
- if not line or line.startswith("#"):
- continue
-
- name = line.split()[0]
- yield (name, next(numbers))
-
- yield ("N_TOKENS", next(numbers))
- yield ("NT_OFFSET", 256)
-
-
-def generate_opmap(tokens):
- for line in tokens:
- line = line.strip()
-
- if not line or line.startswith("#"):
- continue
-
- pieces = line.split()
-
- if len(pieces) != 2:
- continue
-
- name, op = pieces
- yield (op.strip("'"), name)
-
- # Yield '<>' separately. This is needed so it does not collide with
- # the token generation in "generate_tokens": if this symbol were
- # included in Grammar/Tokens, it would collide with != because it
- # has the same name (NOTEQUAL).
- yield ("<>", "NOTEQUAL")
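
Finally, a small sketch of the two helpers above applied to a hypothetical, abbreviated token definition file (the real input is Grammar/Tokens):

    from Parser.pgen.token import generate_opmap, generate_tokens

    token_lines = [
        "ENDMARKER\n",
        "NAME\n",
        "NUMBER\n",
        "NOTEQUAL '!='\n",  # only two-field lines contribute opmap entries
    ]

    print(dict(generate_tokens(token_lines)))
    # {'ENDMARKER': 0, 'NAME': 1, 'NUMBER': 2, 'NOTEQUAL': 3,
    #  'N_TOKENS': 4, 'NT_OFFSET': 256}

    print(dict(generate_opmap(token_lines)))
    # {'!=': 'NOTEQUAL', '<>': 'NOTEQUAL'}
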