summaryrefslogtreecommitdiffstats
path: root/Parser
diff options
context:
space:
mode:
Diffstat (limited to 'Parser')
-rw-r--r--Parser/pgen/__main__.py8
-rw-r--r--Parser/pgen/automata.py371
-rw-r--r--Parser/pgen/grammar.py6
-rw-r--r--Parser/pgen/keywordgen.py11
-rw-r--r--Parser/pgen/metaparser.py152
-rw-r--r--Parser/pgen/pgen.py527
-rw-r--r--Parser/pgen/token.py10
7 files changed, 753 insertions, 332 deletions
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
index eea5261..bb96e75 100644
--- a/Parser/pgen/__main__.py
+++ b/Parser/pgen/__main__.py
@@ -8,17 +8,15 @@ def main():
parser.add_argument(
"grammar", type=str, help="The file with the grammar definition in EBNF format"
)
- parser.add_argument(
- "tokens", type=str, help="The file with the token definitions"
- )
+ parser.add_argument("tokens", type=str, help="The file with the token definitions")
parser.add_argument(
"graminit_h",
- type=argparse.FileType('w'),
+ type=argparse.FileType("w"),
help="The path to write the grammar's non-terminals as #defines",
)
parser.add_argument(
"graminit_c",
- type=argparse.FileType('w'),
+ type=argparse.FileType("w"),
help="The path to write the grammar as initialized data",
)
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
new file mode 100644
index 0000000..3147d86
--- /dev/null
+++ b/Parser/pgen/automata.py
@@ -0,0 +1,371 @@
+"""Classes representing state-machine concepts"""
+
+class NFA:
+ """A non deterministic finite automata
+
+ A non deterministic automata is a form of a finite state
+ machine. An NFA's rules are less restrictive than a DFA.
+ The NFA rules are:
+
+ * A transition can be non-deterministic and can result in
+ nothing, one, or two or more states.
+
+ * An epsilon transition consuming empty input is valid.
+ Transitions consuming labeled symbols are also permitted.
+
+ This class assumes that there is only one starting state and one
+ accepting (ending) state.
+
+ Attributes:
+ name (str): The name of the rule the NFA is representing.
+ start (NFAState): The starting state.
+ end (NFAState): The ending state
+ """
+
+ def __init__(self, start, end):
+ self.name = start.rule_name
+ self.start = start
+ self.end = end
+
+ def __repr__(self):
+ return "NFA(start={}, end={})".format(self.start, self.end)
+
+ def dump(self, writer=print):
+ """Dump a graphical representation of the NFA"""
+ todo = [self.start]
+ for i, state in enumerate(todo):
+ writer(" State", i, state is self.end and "(final)" or "")
+ for arc in state.arcs:
+ label = arc.label
+ next = arc.target
+ if next in todo:
+ j = todo.index(next)
+ else:
+ j = len(todo)
+ todo.append(next)
+ if label is None:
+ writer(" -> %d" % j)
+ else:
+ writer(" %s -> %d" % (label, j))
+
+
+class NFAArc:
+ """An arc representing a transition between two NFA states.
+
+ NFA states can be connected via two ways:
+
+ * A label transition: An input equal to the label must
+ be consumed to perform the transition.
+ * An epsilon transition: The transition can be taken without
+ consuming any input symbol.
+
+ Attributes:
+ target (NFAState): The ending state of the transition arc.
+ label (Optional[str]): The label that must be consumed to make
+ the transition. An epsilon transition is represented
+ using `None`.
+ """
+
+ def __init__(self, target, label):
+ self.target = target
+ self.label = label
+
+ def __repr__(self):
+ return "<%s: %s>" % (self.__class__.__name__, self.label)
+
+
+class NFAState:
+ """A state of a NFA, non deterministic finite automata.
+
+ Attributes:
+ target (rule_name): The name of the rule used to represent the NFA's
+ ending state after a transition.
+ arcs (Dict[Optional[str], NFAState]): A mapping representing transitions
+ between the current NFA state and another NFA state via following
+ a label.
+ """
+
+ def __init__(self, rule_name):
+ self.rule_name = rule_name
+ self.arcs = []
+
+ def add_arc(self, target, label=None):
+ """Add a new arc to connect the state to a target state within the NFA
+
+ The method adds a new arc to the list of arcs available as transitions
+ from the present state. An optional label indicates a named transition
+ that consumes an input while the absence of a label represents an epsilon
+ transition.
+
+ Attributes:
+ target (NFAState): The end of the transition that the arc represents.
+ label (Optional[str]): The label that must be consumed for making
+ the transition. If the label is not provided the transition is assumed
+ to be an epsilon-transition.
+ """
+ assert label is None or isinstance(label, str)
+ assert isinstance(target, NFAState)
+ self.arcs.append(NFAArc(target, label))
+
+ def __repr__(self):
+ return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
+
+
+class DFA:
+ """A deterministic finite automata
+
+ A deterministic finite automata is a form of a finite state machine
+ that obeys the following rules:
+
+ * Each of the transitions is uniquely determined by
+ the source state and input symbol
+ * Reading an input symbol is required for each state
+ transition (no epsilon transitions).
+
+ The finite-state machine will accept or reject a string of symbols
+ and only produces a unique computation of the automaton for each input
+ string. The DFA must have a unique starting state (represented as the first
+ element in the list of states) but can have multiple final states.
+
+ Attributes:
+ name (str): The name of the rule the DFA is representing.
+ states (List[DFAState]): A collection of DFA states.
+ """
+
+ def __init__(self, name, states):
+ self.name = name
+ self.states = states
+
+ @classmethod
+ def from_nfa(cls, nfa):
+ """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm.
+
+ To simulate the operation of a DFA on a given input string, it's
+ necessary to keep track of a single state at any time, or more precisely,
+ the state that the automaton will reach after seeing a prefix of the
+ input. In contrast, to simulate an NFA, it's necessary to keep track of
+ a set of states: all of the states that the automaton could reach after
+ seeing the same prefix of the input, according to the nondeterministic
+ choices made by the automaton. There are two possible sources of
+ non-determinism:
+
+ 1) Multiple (one or more) transitions with the same label
+
+ 'A' +-------+
+ +----------->+ State +----------->+
+ | | 2 |
+ +-------+ +-------+
+ | State |
+ | 1 | +-------+
+ +-------+ | State |
+ +----------->+ 3 +----------->+
+ 'A' +-------+
+
+ 2) Epsilon transitions (transitions that can be taken without consuming any input)
+
+ +-------+ +-------+
+ | State | ε | State |
+ | 1 +----------->+ 2 +----------->+
+ +-------+ +-------+
+
+ Looking at the first case above, we can't determine which transition should be
+ followed when given an input A. We could choose whether or not to follow the
+ transition while in the second case the problem is that we can choose both to
+ follow the transition or not doing it. To solve this problem we can imagine that
+ we follow all possibilities at the same time and we construct new states from the
+ set of all possible reachable states. For every case in the previous example:
+
+
+ 1) For multiple transitions with the same label we colapse all of the
+ final states under the same one
+
+ +-------+ +-------+
+ | State | 'A' | State |
+ | 1 +----------->+ 2-3 +----------->+
+ +-------+ +-------+
+
+ 2) For epsilon transitions we collapse all epsilon-reachable states
+ into the same one
+
+ +-------+
+ | State |
+ | 1-2 +----------->
+ +-------+
+
+ Because the DFA states consist of sets of NFA states, an n-state NFA
+ may be converted to a DFA with at most 2**n states. Notice that the
+ constructed DFA is not minimal and can be simplified or reduced
+ afterwards.
+
+ Parameters:
+ name (NFA): The NFA to transform to DFA.
+ """
+ assert isinstance(nfa, NFA)
+
+ def add_closure(nfa_state, base_nfa_set):
+ """Calculate the epsilon-closure of a given state
+
+ Add to the *base_nfa_set* all the states that are
+ reachable from *nfa_state* via epsilon-transitions.
+ """
+ assert isinstance(nfa_state, NFAState)
+ if nfa_state in base_nfa_set:
+ return
+ base_nfa_set.add(nfa_state)
+ for nfa_arc in nfa_state.arcs:
+ if nfa_arc.label is None:
+ add_closure(nfa_arc.target, base_nfa_set)
+
+ # Calculte the epsilon-closure of the starting state
+ base_nfa_set = set()
+ add_closure(nfa.start, base_nfa_set)
+
+ # Start by visiting the NFA starting state (there is only one).
+ states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
+
+ for state in states: # NB states grow while we're iterating
+
+ # Find transitions from the current state to other reachable states
+ # and store them in mapping that correlates the label to all the
+ # possible reachable states that can be obtained by consuming a
+ # token equal to the label. Each set of all the states that can
+ # be reached after following a label will be the a DFA state.
+ arcs = {}
+ for nfa_state in state.nfa_set:
+ for nfa_arc in nfa_state.arcs:
+ if nfa_arc.label is not None:
+ nfa_set = arcs.setdefault(nfa_arc.label, set())
+ # All states that can be reached by epsilon-transitions
+ # are also included in the set of reachable states.
+ add_closure(nfa_arc.target, nfa_set)
+
+ # Now create new DFAs by visiting all posible transitions between
+ # the current DFA state and the new power-set states (each nfa_set)
+ # via the different labels. As the nodes are appended to *states* this
+ # is performing a deep-first search traversal over the power-set of
+ # the states of the original NFA.
+ for label, nfa_set in sorted(arcs.items()):
+ for exisisting_state in states:
+ if exisisting_state.nfa_set == nfa_set:
+ # The DFA state already exists for this rule.
+ next_state = exisisting_state
+ break
+ else:
+ next_state = DFAState(nfa.name, nfa_set, nfa.end)
+ states.append(next_state)
+
+ # Add a transition between the current DFA state and the new
+ # DFA state (the power-set state) via the current label.
+ state.add_arc(next_state, label)
+
+ return cls(nfa.name, states)
+
+ def __iter__(self):
+ return iter(self.states)
+
+ def simplify(self):
+ """Attempt to reduce the number of states of the DFA
+
+ Transform the DFA into an equivalent DFA that has fewer states. Two
+ classes of states can be removed or merged from the original DFA without
+ affecting the language it accepts to minimize it:
+
+ * Unreachable states can not be reached from the initial
+ state of the DFA, for any input string.
+ * Nondistinguishable states are those that cannot be distinguished
+ from one another for any input string.
+
+ This algorithm does not achieve the optimal fully-reduced solution, but it
+ works well enough for the particularities of the Python grammar. The
+ algorithm repeatedly looks for two states that have the same set of
+ arcs (same labels pointing to the same nodes) and unifies them, until
+ things stop changing.
+ """
+ changes = True
+ while changes:
+ changes = False
+ for i, state_i in enumerate(self.states):
+ for j in range(i + 1, len(self.states)):
+ state_j = self.states[j]
+ if state_i == state_j:
+ del self.states[j]
+ for state in self.states:
+ state.unifystate(state_j, state_i)
+ changes = True
+ break
+
+ def dump(self, writer=print):
+ """Dump a graphical representation of the DFA"""
+ for i, state in enumerate(self.states):
+ writer(" State", i, state.is_final and "(final)" or "")
+ for label, next in sorted(state.arcs.items()):
+ writer(" %s -> %d" % (label, self.states.index(next)))
+
+
+class DFAState(object):
+ """A state of a DFA
+
+ Attributes:
+ rule_name (rule_name): The name of the DFA rule containing the represented state.
+ nfa_set (Set[NFAState]): The set of NFA states used to create this state.
+ final (bool): True if the state represents an accepting state of the DFA
+ containing this state.
+ arcs (Dict[label, DFAState]): A mapping representing transitions between
+ the current DFA state and another DFA state via following a label.
+ """
+
+ def __init__(self, rule_name, nfa_set, final):
+ assert isinstance(nfa_set, set)
+ assert isinstance(next(iter(nfa_set)), NFAState)
+ assert isinstance(final, NFAState)
+ self.rule_name = rule_name
+ self.nfa_set = nfa_set
+ self.arcs = {} # map from terminals/nonterminals to DFAState
+ self.is_final = final in nfa_set
+
+ def add_arc(self, target, label):
+ """Add a new arc to the current state.
+
+ Parameters:
+ target (DFAState): The DFA state at the end of the arc.
+ label (str): The label respresenting the token that must be consumed
+ to perform this transition.
+ """
+ assert isinstance(label, str)
+ assert label not in self.arcs
+ assert isinstance(target, DFAState)
+ self.arcs[label] = target
+
+ def unifystate(self, old, new):
+ """Replace all arcs from the current node to *old* with *new*.
+
+ Parameters:
+ old (DFAState): The DFA state to remove from all existing arcs.
+ new (DFAState): The DFA state to replace in all existing arcs.
+ """
+ for label, next_ in self.arcs.items():
+ if next_ is old:
+ self.arcs[label] = new
+
+ def __eq__(self, other):
+ # The nfa_set does not matter for equality
+ assert isinstance(other, DFAState)
+ if self.is_final != other.is_final:
+ return False
+ # We cannot just return self.arcs == other.arcs because that
+ # would invoke this method recursively if there are any cycles.
+ if len(self.arcs) != len(other.arcs):
+ return False
+ for label, next_ in self.arcs.items():
+ if next_ is not other.arcs.get(label):
+ return False
+ return True
+
+ __hash__ = None # For Py3 compatibility.
+
+ def __repr__(self):
+ return "<%s: %s is_final=%s>" % (
+ self.__class__.__name__,
+ self.rule_name,
+ self.is_final,
+ )
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
index 5cd6524..56188db 100644
--- a/Parser/pgen/grammar.py
+++ b/Parser/pgen/grammar.py
@@ -76,12 +76,14 @@ class Grammar:
def print_labels(self, writer):
writer(
- "static const label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
+ "static const label labels[{n_labels}] = {{\n".format(
+ n_labels=len(self.labels)
+ )
)
for label, name in self.labels:
label_name = '"{}"'.format(name) if name is not None else 0
writer(
- ' {{{label}, {label_name}}},\n'.format(
+ " {{{label}, {label_name}}},\n".format(
label=label, label_name=label_name
)
)
diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py
index eeb3ef7..f0234a8 100644
--- a/Parser/pgen/keywordgen.py
+++ b/Parser/pgen/keywordgen.py
@@ -32,17 +32,16 @@ EXTRA_KEYWORDS = ["async", "await"]
def main():
- parser = argparse.ArgumentParser(description="Generate the Lib/keywords.py "
- "file from the grammar.")
- parser.add_argument(
- "grammar", type=str, help="The file with the grammar definition in EBNF format"
+ parser = argparse.ArgumentParser(
+ description="Generate the Lib/keywords.py " "file from the grammar."
)
parser.add_argument(
- "tokens", type=str, help="The file with the token definitions"
+ "grammar", type=str, help="The file with the grammar definition in EBNF format"
)
+ parser.add_argument("tokens", type=str, help="The file with the token definitions")
parser.add_argument(
"keyword_file",
- type=argparse.FileType('w'),
+ type=argparse.FileType("w"),
help="The path to write the keyword definitions",
)
args = parser.parse_args()
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
new file mode 100644
index 0000000..074a083
--- /dev/null
+++ b/Parser/pgen/metaparser.py
@@ -0,0 +1,152 @@
+"""Parser for the Python metagrammar"""
+
+import io
+import tokenize # from stdlib
+
+from .automata import NFA, NFAState
+
+
+class GrammarParser:
+ """Parser for Python grammar files."""
+
+ _translation_table = {
+ tokenize.NAME: "NAME",
+ tokenize.STRING: "STRING",
+ tokenize.NEWLINE: "NEWLINE",
+ tokenize.NL: "NL",
+ tokenize.OP: "OP",
+ tokenize.ENDMARKER: "ENDMARKER",
+ tokenize.COMMENT: "COMMENT",
+ }
+
+ def __init__(self, grammar):
+ self.grammar = grammar
+ grammar_adaptor = io.StringIO(grammar)
+ self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
+ self._gettoken() # Initialize lookahead
+ self._current_rule_name = None
+
+ def parse(self):
+ """Turn the grammar into a collection of NFAs"""
+ # grammar: (NEWLINE | rule)* ENDMARKER
+ while self.type != tokenize.ENDMARKER:
+ while self.type == tokenize.NEWLINE:
+ self._gettoken()
+ # rule: NAME ':' rhs NEWLINE
+ self._current_rule_name = self._expect(tokenize.NAME)
+ self._expect(tokenize.OP, ":")
+ a, z = self._parse_rhs()
+ self._expect(tokenize.NEWLINE)
+
+ yield NFA(a, z)
+
+ def _parse_rhs(self):
+ # rhs: items ('|' items)*
+ a, z = self._parse_items()
+ if self.value != "|":
+ return a, z
+ else:
+ aa = NFAState(self._current_rule_name)
+ zz = NFAState(self._current_rule_name)
+ while True:
+ # Allow to transit directly to the previous state and connect the end of the
+ # previous state to the end of the current one, effectively allowing to skip
+ # the current state.
+ aa.add_arc(a)
+ z.add_arc(zz)
+ if self.value != "|":
+ break
+
+ self._gettoken()
+ a, z = self._parse_items()
+ return aa, zz
+
+ def _parse_items(self):
+ # items: item+
+ a, b = self._parse_item()
+ while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
+ c, d = self._parse_item()
+ # Allow a transition between the end of the previous state
+ # and the beginning of the new one, connecting all the items
+ # together. In this way we can only reach the end if we visit
+ # all the items.
+ b.add_arc(c)
+ b = d
+ return a, b
+
+ def _parse_item(self):
+ # item: '[' rhs ']' | atom ['+' | '*']
+ if self.value == "[":
+ self._gettoken()
+ a, z = self._parse_rhs()
+ self._expect(tokenize.OP, "]")
+ # Make a transition from the beginning to the end so it is possible to
+ # advance for free to the next state of this item # without consuming
+ # anything from the rhs.
+ a.add_arc(z)
+ return a, z
+ else:
+ a, z = self._parse_atom()
+ value = self.value
+ if value not in ("+", "*"):
+ return a, z
+ self._gettoken()
+ z.add_arc(a)
+ if value == "+":
+ # Create a cycle to the beginning so we go back to the old state in this
+ # item and repeat.
+ return a, z
+ else:
+ # The end state is the same as the beginning, so we can cycle arbitrarily
+ # and end in the beginning if necessary.
+ return a, a
+
+ def _parse_atom(self):
+ # atom: '(' rhs ')' | NAME | STRING
+ if self.value == "(":
+ self._gettoken()
+ a, z = self._parse_rhs()
+ self._expect(tokenize.OP, ")")
+ return a, z
+ elif self.type in (tokenize.NAME, tokenize.STRING):
+ a = NFAState(self._current_rule_name)
+ z = NFAState(self._current_rule_name)
+ # We can transit to the next state only if we consume the value.
+ a.add_arc(z, self.value)
+ self._gettoken()
+ return a, z
+ else:
+ self._raise_error(
+ "expected (...) or NAME or STRING, got {} ({})",
+ self._translation_table.get(self.type, self.type),
+ self.value,
+ )
+
+ def _expect(self, type_, value=None):
+ if self.type != type_:
+ self._raise_error(
+ "expected {}, got {} ({})",
+ self._translation_table.get(type_, type_),
+ self._translation_table.get(self.type, self.type),
+ self.value,
+ )
+ if value is not None and self.value != value:
+ self._raise_error("expected {}, got {}", value, self.value)
+ value = self.value
+ self._gettoken()
+ return value
+
+ def _gettoken(self):
+ tup = next(self.generator)
+ while tup[0] in (tokenize.COMMENT, tokenize.NL):
+ tup = next(self.generator)
+ self.type, self.value, self.begin, self.end, self.line = tup
+
+ def _raise_error(self, msg, *args):
+ if args:
+ try:
+ msg = msg.format(*args)
+ except Exception:
+ msg = " ".join([msg] + list(map(str, args)))
+ line = self.grammar.splitlines()[self.begin[0] - 1]
+ raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
index d52d58f..d7dcb76 100644
--- a/Parser/pgen/pgen.py
+++ b/Parser/pgen/pgen.py
@@ -1,42 +1,180 @@
+"""Python parser generator
+
+
+This parser generator transforms a Python grammar file into parsing tables
+that can be consumed by Python's LL(1) parser written in C.
+
+Concepts
+--------
+
+* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
+ top-down parser for a subset of context-free languages. It parses the input
+ from Left to right, performing Leftmost derivation of the sentence, and can
+ only use 1 tokens of lookahead when parsing a sentence.
+
+* A parsing table is a collection of data that a generic implementation of the
+ LL(1) parser consumes to know how to parse a given context-free grammar. In
+ this case the collection of thata involves Deterministic Finite Automatons,
+ calculated first sets, keywords and transition labels.
+
+* A grammar is defined by production rules (or just 'productions') that specify
+ which symbols may replace which other symbols; these rules may be used to
+ generate strings, or to parse them. Each such rule has a head, or left-hand
+ side, which consists of the string that may be replaced, and a body, or
+ right-hand side, which consists of a string that may replace it. In the
+ Python grammar, rules are written in the form
+
+ rule_name: rule_description;
+
+ meaning the rule 'a: b' specifies that a can be replaced by b. A Context-free
+ grammars is a grammars in which the left-hand side of each production rule
+ consists of only a single nonterminal symbol. Context free grammars can
+ always be recognized by a Non-Deterministic Automatons.
+
+* Terminal symbols are literal symbols which may appear in the outputs of the
+ production rules of the grammar and which cannot be changed using the rules
+ of the grammar. Applying the rules recursively to a source string of symbols
+ will usually terminate in a final output string consisting only of terminal
+ symbols.
+
+* Nonterminal symbols are those symbols which can be replaced. The grammar
+ includes a start symbol a designated member of the set of nonterminals from
+ which all the strings in the language may be derived by successive
+ applications of the production rules.
+
+* The language defined by the grammar is defined as the set of terminal strings
+ that can be derived using the production rules.
+
+* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
+ that can appear in the first position of any string derived from the rule.
+ This is useful for LL(1) parsers as the parser is only allow to look at the
+ next token in the input to know which rule needs to parse. For example given
+ this grammar:
+
+ start: '(' A | B ')'
+ A: 'a' '<'
+ B: 'b' '<'
+
+ and the input '(b<)' the parser can only look at 'b' to know if it needs
+ to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
+ that needs to continue parsing rule B because only that rule can start
+ with 'b'.
+
+Description
+-----------
+
+The input for the parser generator is a grammar in extended BNF form (using *
+for repetition, + for at-least-once repetition, [] for optional parts, | for
+alternatives and () for grouping).
+
+Each rule in the grammar file is considered as a regular expression in its
+own right. It is turned into a Non-deterministic Finite Automaton (NFA),
+which is then turned into a Deterministic Finite Automaton (DFA), which is
+then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
+or similar compiler books (this technique is more often used for lexical
+analyzers).
+
+The DFA's are used by the parser as parsing tables in a special way that's
+probably unique. Before they are usable, the FIRST sets of all non-terminals
+are computed so the LL(1) parser consuming the parsing tables can distinguish
+between different transitions.
+Reference
+---------
+
+[Aho&Ullman 77]
+ Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
+ (first edition)
+"""
+
+from ast import literal_eval
import collections
-import tokenize # from stdlib
from . import grammar, token
+from .automata import DFA
+from .metaparser import GrammarParser
+import enum
-class ParserGenerator(object):
- def __init__(self, grammar_file, token_file, stream=None, verbose=False):
- close_stream = None
- if stream is None:
- stream = open(grammar_file)
- close_stream = stream.close
+class LabelType(enum.Enum):
+ NONTERMINAL = 0
+ NAMED_TOKEN = 1
+ KEYWORD = 2
+ OPERATOR = 3
+ NONE = 4
+
+
+class Label(str):
+ def __init__(self, value):
+ self.type = self._get_type()
+
+ def _get_type(self):
+ if self[0].isalpha():
+ if self.upper() == self:
+ # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
+ return LabelType.NAMED_TOKEN
+ else:
+ # If is not uppercase it must be a non terminal.
+ return LabelType.NONTERMINAL
+ else:
+ # Keywords and operators are wrapped in quotes
+ assert self[0] == self[-1] in ('"', "'"), self
+ value = literal_eval(self)
+ if value[0].isalpha():
+ return LabelType.KEYWORD
+ else:
+ return LabelType.OPERATOR
+
+ def __repr__(self):
+ return "{}({})".format(self.type, super().__repr__())
+
+
+class ParserGenerator(object):
+ def __init__(self, grammar_file, token_file, verbose=False):
+ with open(grammar_file) as f:
+ self.grammar = f.read()
with open(token_file) as tok_file:
token_lines = tok_file.readlines()
self.tokens = dict(token.generate_tokens(token_lines))
self.opmap = dict(token.generate_opmap(token_lines))
# Manually add <> so it does not collide with !=
- self.opmap['<>'] = "NOTEQUAL"
+ self.opmap["<>"] = "NOTEQUAL"
self.verbose = verbose
self.filename = grammar_file
- self.stream = stream
- self.generator = tokenize.generate_tokens(stream.readline)
- self.gettoken() # Initialize lookahead
- self.dfas, self.startsymbol = self.parse()
- if close_stream is not None:
- close_stream()
- self.first = {} # map from symbol name to set of tokens
- self.addfirstsets()
+ self.dfas, self.startsymbol = self.create_dfas()
+ self.first = {} # map from symbol name to set of tokens
+ self.calculate_first_sets()
+
+ def create_dfas(self):
+ rule_to_dfas = collections.OrderedDict()
+ start_nonterminal = None
+ for nfa in GrammarParser(self.grammar).parse():
+ if self.verbose:
+ print("Dump of NFA for", nfa.name)
+ nfa.dump()
+ dfa = DFA.from_nfa(nfa)
+ if self.verbose:
+ print("Dump of DFA for", dfa.name)
+ dfa.dump()
+ dfa.simplify()
+ rule_to_dfas[dfa.name] = dfa
+
+ if start_nonterminal is None:
+ start_nonterminal = dfa.name
+
+ return rule_to_dfas, start_nonterminal
def make_grammar(self):
c = grammar.Grammar()
+ c.all_labels = set()
names = list(self.dfas.keys())
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
- c.symbol2number[name] = i
- c.number2symbol[i] = name
+ c.symbol2number[Label(name)] = i
+ c.number2symbol[i] = Label(name)
+ c.all_labels.add(name)
for name in names:
self.make_label(c, name)
dfa = self.dfas[name]
@@ -44,12 +182,13 @@ class ParserGenerator(object):
for state in dfa:
arcs = []
for label, next in sorted(state.arcs.items()):
- arcs.append((self.make_label(c, label), dfa.index(next)))
- if state.isfinal:
- arcs.append((0, dfa.index(state)))
+ c.all_labels.add(label)
+ arcs.append((self.make_label(c, label), dfa.states.index(next)))
+ if state.is_final:
+ arcs.append((0, dfa.states.index(state)))
states.append(arcs)
c.states.append(states)
- c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+ c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
c.start = c.symbol2number[self.startsymbol]
if self.verbose:
@@ -68,7 +207,7 @@ class ParserGenerator(object):
)
return c
- def make_first(self, c, name):
+ def make_first_sets(self, c, name):
rawfirst = self.first[name]
first = set()
for label in sorted(rawfirst):
@@ -78,67 +217,65 @@ class ParserGenerator(object):
return first
def make_label(self, c, label):
- # XXX Maybe this should be a method on a subclass of converter?
+ label = Label(label)
ilabel = len(c.labels)
- if label[0].isalpha():
- # Either a symbol name or a named token
- if label in c.symbol2number:
- # A symbol name (a non-terminal)
- if label in c.symbol2label:
- return c.symbol2label[label]
- else:
- c.labels.append((c.symbol2number[label], None))
- c.symbol2label[label] = ilabel
- return ilabel
+
+ if label.type == LabelType.NONTERMINAL:
+ if label in c.symbol2label:
+ return c.symbol2label[label]
else:
- # A named token (NAME, NUMBER, STRING)
- itoken = self.tokens.get(label, None)
- assert isinstance(itoken, int), label
- assert itoken in self.tokens.values(), label
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- else:
- # Either a keyword or an operator
- assert label[0] in ('"', "'"), label
- value = eval(label)
- if value[0].isalpha():
- # A keyword
- if value in c.keywords:
- return c.keywords[value]
- else:
- c.labels.append((self.tokens["NAME"], value))
- c.keywords[value] = ilabel
- return ilabel
+ c.labels.append((c.symbol2number[label], None))
+ c.symbol2label[label] = ilabel
+ return ilabel
+ elif label.type == LabelType.NAMED_TOKEN:
+ # A named token (NAME, NUMBER, STRING)
+ itoken = self.tokens.get(label, None)
+ assert isinstance(itoken, int), label
+ assert itoken in self.tokens.values(), label
+ if itoken in c.tokens:
+ return c.tokens[itoken]
else:
- # An operator (any non-numeric token)
- tok_name = self.opmap[value] # Fails if unknown token
- itoken = self.tokens[tok_name]
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+ elif label.type == LabelType.KEYWORD:
+ # A keyword
+ value = literal_eval(label)
+ if value in c.keywords:
+ return c.keywords[value]
+ else:
+ c.labels.append((self.tokens["NAME"], value))
+ c.keywords[value] = ilabel
+ return ilabel
+ elif label.type == LabelType.OPERATOR:
+ # An operator (any non-numeric token)
+ value = literal_eval(label)
+ tok_name = self.opmap[value] # Fails if unknown token
+ itoken = self.tokens[tok_name]
+ if itoken in c.tokens:
+ return c.tokens[itoken]
+ else:
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+ else:
+ raise ValueError("Cannot categorize label {}".format(label))
- def addfirstsets(self):
+ def calculate_first_sets(self):
names = list(self.dfas.keys())
for name in names:
if name not in self.first:
- self.calcfirst(name)
+ self.calculate_first_sets_for_rule(name)
if self.verbose:
print("First set for {dfa_name}".format(dfa_name=name))
for item in self.first[name]:
print(" - {terminal}".format(terminal=item))
- def calcfirst(self, name):
+ def calculate_first_sets_for_rule(self, name):
dfa = self.dfas[name]
- self.first[name] = None # dummy to detect left recursion
- state = dfa[0]
+ self.first[name] = None # dummy to detect left recursion
+ state = dfa.states[0]
totalset = set()
overlapcheck = {}
for label, next in state.arcs.items():
@@ -148,7 +285,7 @@ class ParserGenerator(object):
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
- self.calcfirst(label)
+ self.calculate_first_sets_for_rule(label)
fset = self.first[label]
totalset.update(fset)
overlapcheck[label] = fset
@@ -159,248 +296,10 @@ class ParserGenerator(object):
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
- raise ValueError("rule %s is ambiguous; %s is in the"
- " first sets of %s as well as %s" %
- (name, symbol, label, inverse[symbol]))
+ raise ValueError(
+ "rule %s is ambiguous; %s is in the"
+ " first sets of %s as well as %s"
+ % (name, symbol, label, inverse[symbol])
+ )
inverse[symbol] = label
self.first[name] = totalset
-
- def parse(self):
- dfas = collections.OrderedDict()
- startsymbol = None
- # MSTART: (NEWLINE | RULE)* ENDMARKER
- while self.type != tokenize.ENDMARKER:
- while self.type == tokenize.NEWLINE:
- self.gettoken()
- # RULE: NAME ':' RHS NEWLINE
- name = self.expect(tokenize.NAME)
- if self.verbose:
- print("Processing rule {dfa_name}".format(dfa_name=name))
- self.expect(tokenize.OP, ":")
- a, z = self.parse_rhs()
- self.expect(tokenize.NEWLINE)
- if self.verbose:
- self.dump_nfa(name, a, z)
- dfa = self.make_dfa(a, z)
- if self.verbose:
- self.dump_dfa(name, dfa)
- self.simplify_dfa(dfa)
- dfas[name] = dfa
- if startsymbol is None:
- startsymbol = name
- return dfas, startsymbol
-
- def make_dfa(self, start, finish):
- # To turn an NFA into a DFA, we define the states of the DFA
- # to correspond to *sets* of states of the NFA. Then do some
- # state reduction. Let's represent sets as dicts with 1 for
- # values.
- assert isinstance(start, NFAState)
- assert isinstance(finish, NFAState)
- def closure(state):
- base = set()
- addclosure(state, base)
- return base
- def addclosure(state, base):
- assert isinstance(state, NFAState)
- if state in base:
- return
- base.add(state)
- for label, next in state.arcs:
- if label is None:
- addclosure(next, base)
- states = [DFAState(closure(start), finish)]
- for state in states: # NB states grows while we're iterating
- arcs = {}
- for nfastate in state.nfaset:
- for label, next in nfastate.arcs:
- if label is not None:
- addclosure(next, arcs.setdefault(label, set()))
- for label, nfaset in sorted(arcs.items()):
- for st in states:
- if st.nfaset == nfaset:
- break
- else:
- st = DFAState(nfaset, finish)
- states.append(st)
- state.addarc(st, label)
- return states # List of DFAState instances; first one is start
-
- def dump_nfa(self, name, start, finish):
- print("Dump of NFA for", name)
- todo = [start]
- for i, state in enumerate(todo):
- print(" State", i, state is finish and "(final)" or "")
- for label, next in state.arcs:
- if next in todo:
- j = todo.index(next)
- else:
- j = len(todo)
- todo.append(next)
- if label is None:
- print(" -> %d" % j)
- else:
- print(" %s -> %d" % (label, j))
-
- def dump_dfa(self, name, dfa):
- print("Dump of DFA for", name)
- for i, state in enumerate(dfa):
- print(" State", i, state.isfinal and "(final)" or "")
- for label, next in sorted(state.arcs.items()):
- print(" %s -> %d" % (label, dfa.index(next)))
-
- def simplify_dfa(self, dfa):
- # This is not theoretically optimal, but works well enough.
- # Algorithm: repeatedly look for two states that have the same
- # set of arcs (same labels pointing to the same nodes) and
- # unify them, until things stop changing.
-
- # dfa is a list of DFAState instances
- changes = True
- while changes:
- changes = False
- for i, state_i in enumerate(dfa):
- for j in range(i+1, len(dfa)):
- state_j = dfa[j]
- if state_i == state_j:
- #print " unify", i, j
- del dfa[j]
- for state in dfa:
- state.unifystate(state_j, state_i)
- changes = True
- break
-
- def parse_rhs(self):
- # RHS: ALT ('|' ALT)*
- a, z = self.parse_alt()
- if self.value != "|":
- return a, z
- else:
- aa = NFAState()
- zz = NFAState()
- aa.addarc(a)
- z.addarc(zz)
- while self.value == "|":
- self.gettoken()
- a, z = self.parse_alt()
- aa.addarc(a)
- z.addarc(zz)
- return aa, zz
-
- def parse_alt(self):
- # ALT: ITEM+
- a, b = self.parse_item()
- while (self.value in ("(", "[") or
- self.type in (tokenize.NAME, tokenize.STRING)):
- c, d = self.parse_item()
- b.addarc(c)
- b = d
- return a, b
-
- def parse_item(self):
- # ITEM: '[' RHS ']' | ATOM ['+' | '*']
- if self.value == "[":
- self.gettoken()
- a, z = self.parse_rhs()
- self.expect(tokenize.OP, "]")
- a.addarc(z)
- return a, z
- else:
- a, z = self.parse_atom()
- value = self.value
- if value not in ("+", "*"):
- return a, z
- self.gettoken()
- z.addarc(a)
- if value == "+":
- return a, z
- else:
- return a, a
-
- def parse_atom(self):
- # ATOM: '(' RHS ')' | NAME | STRING
- if self.value == "(":
- self.gettoken()
- a, z = self.parse_rhs()
- self.expect(tokenize.OP, ")")
- return a, z
- elif self.type in (tokenize.NAME, tokenize.STRING):
- a = NFAState()
- z = NFAState()
- a.addarc(z, self.value)
- self.gettoken()
- return a, z
- else:
- self.raise_error("expected (...) or NAME or STRING, got %s/%s",
- self.type, self.value)
-
- def expect(self, type, value=None):
- if self.type != type or (value is not None and self.value != value):
- self.raise_error("expected %s/%s, got %s/%s",
- type, value, self.type, self.value)
- value = self.value
- self.gettoken()
- return value
-
- def gettoken(self):
- tup = next(self.generator)
- while tup[0] in (tokenize.COMMENT, tokenize.NL):
- tup = next(self.generator)
- self.type, self.value, self.begin, self.end, self.line = tup
- # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
-
- def raise_error(self, msg, *args):
- if args:
- try:
- msg = msg % args
- except Exception:
- msg = " ".join([msg] + list(map(str, args)))
- raise SyntaxError(msg, (self.filename, self.end[0],
- self.end[1], self.line))
-
-class NFAState(object):
-
- def __init__(self):
- self.arcs = [] # list of (label, NFAState) pairs
-
- def addarc(self, next, label=None):
- assert label is None or isinstance(label, str)
- assert isinstance(next, NFAState)
- self.arcs.append((label, next))
-
-class DFAState(object):
-
- def __init__(self, nfaset, final):
- assert isinstance(nfaset, set)
- assert isinstance(next(iter(nfaset)), NFAState)
- assert isinstance(final, NFAState)
- self.nfaset = nfaset
- self.isfinal = final in nfaset
- self.arcs = {} # map from label to DFAState
-
- def addarc(self, next, label):
- assert isinstance(label, str)
- assert label not in self.arcs
- assert isinstance(next, DFAState)
- self.arcs[label] = next
-
- def unifystate(self, old, new):
- for label, next in self.arcs.items():
- if next is old:
- self.arcs[label] = new
-
- def __eq__(self, other):
- # Equality test -- ignore the nfaset instance variable
- assert isinstance(other, DFAState)
- if self.isfinal != other.isfinal:
- return False
- # Can't just return self.arcs == other.arcs, because that
- # would invoke this method recursively, with cycles...
- if len(self.arcs) != len(other.arcs):
- return False
- for label, next in self.arcs.items():
- if next is not other.arcs.get(label):
- return False
- return True
-
- __hash__ = None # For Py3 compatibility.
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
index e7e8f3f..2cff62c 100644
--- a/Parser/pgen/token.py
+++ b/Parser/pgen/token.py
@@ -6,21 +6,21 @@ def generate_tokens(tokens):
for line in tokens:
line = line.strip()
- if not line or line.startswith('#'):
+ if not line or line.startswith("#"):
continue
name = line.split()[0]
yield (name, next(numbers))
- yield ('N_TOKENS', next(numbers))
- yield ('NT_OFFSET', 256)
+ yield ("N_TOKENS", next(numbers))
+ yield ("NT_OFFSET", 256)
def generate_opmap(tokens):
for line in tokens:
line = line.strip()
- if not line or line.startswith('#'):
+ if not line or line.startswith("#"):
continue
pieces = line.split()
@@ -35,4 +35,4 @@ def generate_opmap(tokens):
# with the token generation in "generate_tokens" because if this
# symbol is included in Grammar/Tokens, it will collide with !=
# as it has the same name (NOTEQUAL).
- yield ('<>', 'NOTEQUAL')
+ yield ("<>", "NOTEQUAL")